{ "best_metric": 0.635880708694458, "best_model_checkpoint": "miner_id_24/checkpoint-1800", "epoch": 0.33105731429753776, "eval_steps": 200, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013794054762397408, "grad_norm": 0.48702943325042725, "learning_rate": 6.666666666666667e-06, "loss": 1.011, "step": 1 }, { "epoch": 0.00013794054762397408, "eval_loss": 3.1967947483062744, "eval_runtime": 23.4225, "eval_samples_per_second": 2.519, "eval_steps_per_second": 2.519, "step": 1 }, { "epoch": 0.00027588109524794816, "grad_norm": 0.39228329062461853, "learning_rate": 1.3333333333333333e-05, "loss": 0.8269, "step": 2 }, { "epoch": 0.0004138216428719222, "grad_norm": 0.5630053877830505, "learning_rate": 2e-05, "loss": 1.0138, "step": 3 }, { "epoch": 0.0005517621904958963, "grad_norm": 0.5379119515419006, "learning_rate": 2.6666666666666667e-05, "loss": 0.8146, "step": 4 }, { "epoch": 0.0006897027381198703, "grad_norm": 0.5841886401176453, "learning_rate": 3.3333333333333335e-05, "loss": 0.7592, "step": 5 }, { "epoch": 0.0008276432857438444, "grad_norm": 0.46231576800346375, "learning_rate": 4e-05, "loss": 0.9071, "step": 6 }, { "epoch": 0.0009655838333678184, "grad_norm": 0.6419610381126404, "learning_rate": 4.666666666666667e-05, "loss": 0.9903, "step": 7 }, { "epoch": 0.0011035243809917926, "grad_norm": 0.4809350073337555, "learning_rate": 5.333333333333333e-05, "loss": 0.8859, "step": 8 }, { "epoch": 0.0012414649286157666, "grad_norm": 0.4701339900493622, "learning_rate": 6e-05, "loss": 0.8131, "step": 9 }, { "epoch": 0.0013794054762397406, "grad_norm": 0.4429624676704407, "learning_rate": 6.666666666666667e-05, "loss": 0.8727, "step": 10 }, { "epoch": 0.0015173460238637148, "grad_norm": 0.46402427554130554, "learning_rate": 7.333333333333333e-05, "loss": 0.7067, "step": 11 }, { "epoch": 0.0016552865714876887, "grad_norm": 0.4485568106174469, "learning_rate": 8e-05, "loss": 0.7596, "step": 12 }, { "epoch": 0.001793227119111663, "grad_norm": 0.3845665752887726, "learning_rate": 8.666666666666667e-05, "loss": 0.818, "step": 13 }, { "epoch": 0.0019311676667356369, "grad_norm": 0.5298904180526733, "learning_rate": 9.333333333333334e-05, "loss": 0.9865, "step": 14 }, { "epoch": 0.002069108214359611, "grad_norm": 0.5633847713470459, "learning_rate": 0.0001, "loss": 0.761, "step": 15 }, { "epoch": 0.0022070487619835853, "grad_norm": 0.5660894513130188, "learning_rate": 0.00010666666666666667, "loss": 0.6838, "step": 16 }, { "epoch": 0.0023449893096075592, "grad_norm": 0.6358833909034729, "learning_rate": 0.00011333333333333334, "loss": 0.9807, "step": 17 }, { "epoch": 0.002482929857231533, "grad_norm": 0.36259546875953674, "learning_rate": 0.00012, "loss": 0.7468, "step": 18 }, { "epoch": 0.002620870404855507, "grad_norm": 0.7406138777732849, "learning_rate": 0.00012666666666666666, "loss": 1.2012, "step": 19 }, { "epoch": 0.002758810952479481, "grad_norm": 0.5918175578117371, "learning_rate": 0.00013333333333333334, "loss": 0.7447, "step": 20 }, { "epoch": 0.0028967515001034555, "grad_norm": 0.4696408808231354, "learning_rate": 0.00014, "loss": 0.5811, "step": 21 }, { "epoch": 0.0030346920477274295, "grad_norm": 0.8551336526870728, "learning_rate": 0.00014666666666666666, "loss": 0.8785, "step": 22 }, { "epoch": 0.0031726325953514035, "grad_norm": 1.1069241762161255, "learning_rate": 0.00015333333333333334, "loss": 0.7652, "step": 23 }, { "epoch": 0.0033105731429753774, "grad_norm": 0.4773555099964142, "learning_rate": 0.00016, "loss": 0.5968, "step": 24 }, { "epoch": 0.003448513690599352, "grad_norm": 0.6919654011726379, "learning_rate": 0.0001666666666666667, "loss": 0.785, "step": 25 }, { "epoch": 0.003586454238223326, "grad_norm": 0.6016067862510681, "learning_rate": 0.00017333333333333334, "loss": 0.6783, "step": 26 }, { "epoch": 0.0037243947858473, "grad_norm": 0.6246861815452576, "learning_rate": 0.00018, "loss": 1.1969, "step": 27 }, { "epoch": 0.0038623353334712738, "grad_norm": 0.7044798135757446, "learning_rate": 0.0001866666666666667, "loss": 0.7046, "step": 28 }, { "epoch": 0.004000275881095248, "grad_norm": 0.44917258620262146, "learning_rate": 0.00019333333333333333, "loss": 0.5572, "step": 29 }, { "epoch": 0.004138216428719222, "grad_norm": 0.5091779232025146, "learning_rate": 0.0002, "loss": 0.6602, "step": 30 }, { "epoch": 0.004276156976343196, "grad_norm": 0.5474268198013306, "learning_rate": 0.0001999999989536666, "loss": 0.7928, "step": 31 }, { "epoch": 0.0044140975239671705, "grad_norm": 0.5607008934020996, "learning_rate": 0.00019999999581466645, "loss": 0.8733, "step": 32 }, { "epoch": 0.004552038071591144, "grad_norm": 0.8838821053504944, "learning_rate": 0.00019999999058299957, "loss": 1.2598, "step": 33 }, { "epoch": 0.0046899786192151184, "grad_norm": 0.41664811968803406, "learning_rate": 0.00019999998325866613, "loss": 0.5844, "step": 34 }, { "epoch": 0.004827919166839092, "grad_norm": 0.552245557308197, "learning_rate": 0.00019999997384166624, "loss": 0.7623, "step": 35 }, { "epoch": 0.004965859714463066, "grad_norm": 0.7509823441505432, "learning_rate": 0.0001999999623320001, "loss": 0.6058, "step": 36 }, { "epoch": 0.005103800262087041, "grad_norm": 0.527321457862854, "learning_rate": 0.00019999994872966798, "loss": 0.8593, "step": 37 }, { "epoch": 0.005241740809711014, "grad_norm": 0.793042778968811, "learning_rate": 0.00019999993303467014, "loss": 1.3254, "step": 38 }, { "epoch": 0.005379681357334989, "grad_norm": 0.6100105047225952, "learning_rate": 0.0001999999152470069, "loss": 1.3563, "step": 39 }, { "epoch": 0.005517621904958962, "grad_norm": 0.564547598361969, "learning_rate": 0.00019999989536667863, "loss": 0.977, "step": 40 }, { "epoch": 0.005655562452582937, "grad_norm": 0.633572518825531, "learning_rate": 0.0001999998733936858, "loss": 1.2476, "step": 41 }, { "epoch": 0.005793503000206911, "grad_norm": 0.4978499412536621, "learning_rate": 0.0001999998493280288, "loss": 0.5857, "step": 42 }, { "epoch": 0.005931443547830885, "grad_norm": 0.3591054677963257, "learning_rate": 0.00019999982316970817, "loss": 0.5134, "step": 43 }, { "epoch": 0.006069384095454859, "grad_norm": 0.6278326511383057, "learning_rate": 0.00019999979491872448, "loss": 0.8699, "step": 44 }, { "epoch": 0.006207324643078833, "grad_norm": 0.39985278248786926, "learning_rate": 0.00019999976457507826, "loss": 1.0652, "step": 45 }, { "epoch": 0.006345265190702807, "grad_norm": 0.6397168636322021, "learning_rate": 0.0001999997321387702, "loss": 0.8229, "step": 46 }, { "epoch": 0.006483205738326781, "grad_norm": 0.4380306601524353, "learning_rate": 0.00019999969760980095, "loss": 0.7043, "step": 47 }, { "epoch": 0.006621146285950755, "grad_norm": 0.5619568824768066, "learning_rate": 0.00019999966098817123, "loss": 0.7648, "step": 48 }, { "epoch": 0.006759086833574729, "grad_norm": 0.5171204209327698, "learning_rate": 0.0001999996222738818, "loss": 0.9302, "step": 49 }, { "epoch": 0.006897027381198704, "grad_norm": 0.36841997504234314, "learning_rate": 0.00019999958146693354, "loss": 0.6045, "step": 50 }, { "epoch": 0.007034967928822677, "grad_norm": 0.9860273599624634, "learning_rate": 0.0001999995385673272, "loss": 0.9305, "step": 51 }, { "epoch": 0.007172908476446652, "grad_norm": 0.5823448300361633, "learning_rate": 0.00019999949357506376, "loss": 0.5956, "step": 52 }, { "epoch": 0.007310849024070625, "grad_norm": 0.49382463097572327, "learning_rate": 0.0001999994464901441, "loss": 0.5102, "step": 53 }, { "epoch": 0.0074487895716946, "grad_norm": 0.688617467880249, "learning_rate": 0.00019999939731256926, "loss": 1.547, "step": 54 }, { "epoch": 0.007586730119318574, "grad_norm": 0.38612183928489685, "learning_rate": 0.0001999993460423402, "loss": 0.546, "step": 55 }, { "epoch": 0.0077246706669425475, "grad_norm": 0.38228049874305725, "learning_rate": 0.00019999929267945808, "loss": 0.4671, "step": 56 }, { "epoch": 0.007862611214566521, "grad_norm": 0.8089519143104553, "learning_rate": 0.00019999923722392398, "loss": 0.826, "step": 57 }, { "epoch": 0.008000551762190496, "grad_norm": 0.5636898875236511, "learning_rate": 0.00019999917967573904, "loss": 0.9961, "step": 58 }, { "epoch": 0.00813849230981447, "grad_norm": 0.5183124542236328, "learning_rate": 0.00019999912003490445, "loss": 0.5569, "step": 59 }, { "epoch": 0.008276432857438443, "grad_norm": 0.6319698095321655, "learning_rate": 0.00019999905830142152, "loss": 0.8879, "step": 60 }, { "epoch": 0.008414373405062419, "grad_norm": 0.48757442831993103, "learning_rate": 0.00019999899447529148, "loss": 0.511, "step": 61 }, { "epoch": 0.008552313952686392, "grad_norm": 0.39482003450393677, "learning_rate": 0.00019999892855651575, "loss": 0.7831, "step": 62 }, { "epoch": 0.008690254500310366, "grad_norm": 0.7696112990379333, "learning_rate": 0.0001999988605450956, "loss": 1.0776, "step": 63 }, { "epoch": 0.008828195047934341, "grad_norm": 0.45441484451293945, "learning_rate": 0.00019999879044103254, "loss": 0.6991, "step": 64 }, { "epoch": 0.008966135595558315, "grad_norm": 0.46844032406806946, "learning_rate": 0.00019999871824432798, "loss": 0.6784, "step": 65 }, { "epoch": 0.009104076143182288, "grad_norm": 0.7997144460678101, "learning_rate": 0.00019999864395498347, "loss": 1.0729, "step": 66 }, { "epoch": 0.009242016690806263, "grad_norm": 0.36192160844802856, "learning_rate": 0.0001999985675730005, "loss": 0.6021, "step": 67 }, { "epoch": 0.009379957238430237, "grad_norm": 0.529750645160675, "learning_rate": 0.00019999848909838078, "loss": 0.6529, "step": 68 }, { "epoch": 0.00951789778605421, "grad_norm": 1.026302695274353, "learning_rate": 0.00019999840853112587, "loss": 0.6982, "step": 69 }, { "epoch": 0.009655838333678184, "grad_norm": 0.4954220950603485, "learning_rate": 0.00019999832587123747, "loss": 0.7171, "step": 70 }, { "epoch": 0.00979377888130216, "grad_norm": 0.8853374719619751, "learning_rate": 0.0001999982411187173, "loss": 0.832, "step": 71 }, { "epoch": 0.009931719428926133, "grad_norm": 0.400943398475647, "learning_rate": 0.00019999815427356718, "loss": 0.76, "step": 72 }, { "epoch": 0.010069659976550106, "grad_norm": 0.40010085701942444, "learning_rate": 0.0001999980653357889, "loss": 0.5045, "step": 73 }, { "epoch": 0.010207600524174082, "grad_norm": 0.4322604238986969, "learning_rate": 0.00019999797430538427, "loss": 0.6147, "step": 74 }, { "epoch": 0.010345541071798055, "grad_norm": 0.5326778888702393, "learning_rate": 0.0001999978811823553, "loss": 0.8898, "step": 75 }, { "epoch": 0.010483481619422029, "grad_norm": 0.43207046389579773, "learning_rate": 0.00019999778596670385, "loss": 0.5025, "step": 76 }, { "epoch": 0.010621422167046004, "grad_norm": 0.5074573159217834, "learning_rate": 0.00019999768865843195, "loss": 0.9172, "step": 77 }, { "epoch": 0.010759362714669977, "grad_norm": 0.9970151782035828, "learning_rate": 0.00019999758925754162, "loss": 0.8964, "step": 78 }, { "epoch": 0.010897303262293951, "grad_norm": 0.49125778675079346, "learning_rate": 0.00019999748776403496, "loss": 0.6298, "step": 79 }, { "epoch": 0.011035243809917925, "grad_norm": 0.3789325952529907, "learning_rate": 0.00019999738417791408, "loss": 0.4213, "step": 80 }, { "epoch": 0.0111731843575419, "grad_norm": 0.4855377674102783, "learning_rate": 0.00019999727849918114, "loss": 0.8499, "step": 81 }, { "epoch": 0.011311124905165873, "grad_norm": 0.43113189935684204, "learning_rate": 0.00019999717072783838, "loss": 0.5535, "step": 82 }, { "epoch": 0.011449065452789847, "grad_norm": 0.5266700983047485, "learning_rate": 0.00019999706086388806, "loss": 0.9168, "step": 83 }, { "epoch": 0.011587006000413822, "grad_norm": 0.47261330485343933, "learning_rate": 0.00019999694890733243, "loss": 0.6962, "step": 84 }, { "epoch": 0.011724946548037796, "grad_norm": 0.5425558090209961, "learning_rate": 0.00019999683485817386, "loss": 0.7948, "step": 85 }, { "epoch": 0.01186288709566177, "grad_norm": 0.6287034749984741, "learning_rate": 0.00019999671871641473, "loss": 0.8707, "step": 86 }, { "epoch": 0.012000827643285744, "grad_norm": 0.637447714805603, "learning_rate": 0.00019999660048205747, "loss": 0.9725, "step": 87 }, { "epoch": 0.012138768190909718, "grad_norm": 0.42429324984550476, "learning_rate": 0.0001999964801551046, "loss": 0.7296, "step": 88 }, { "epoch": 0.012276708738533692, "grad_norm": 0.549072265625, "learning_rate": 0.00019999635773555857, "loss": 0.6929, "step": 89 }, { "epoch": 0.012414649286157667, "grad_norm": 0.4701695442199707, "learning_rate": 0.000199996233223422, "loss": 0.5134, "step": 90 }, { "epoch": 0.01255258983378164, "grad_norm": 0.4564104974269867, "learning_rate": 0.00019999610661869746, "loss": 1.08, "step": 91 }, { "epoch": 0.012690530381405614, "grad_norm": 6.529526233673096, "learning_rate": 0.00019999597792138757, "loss": 0.851, "step": 92 }, { "epoch": 0.012828470929029587, "grad_norm": 0.5705176591873169, "learning_rate": 0.00019999584713149512, "loss": 0.8064, "step": 93 }, { "epoch": 0.012966411476653563, "grad_norm": 1.0852056741714478, "learning_rate": 0.00019999571424902276, "loss": 1.0844, "step": 94 }, { "epoch": 0.013104352024277536, "grad_norm": 0.4061054587364197, "learning_rate": 0.00019999557927397328, "loss": 0.5822, "step": 95 }, { "epoch": 0.01324229257190151, "grad_norm": 0.560861349105835, "learning_rate": 0.00019999544220634954, "loss": 0.7624, "step": 96 }, { "epoch": 0.013380233119525485, "grad_norm": 0.4556518793106079, "learning_rate": 0.00019999530304615437, "loss": 0.4449, "step": 97 }, { "epoch": 0.013518173667149459, "grad_norm": 0.5117940902709961, "learning_rate": 0.00019999516179339076, "loss": 0.5312, "step": 98 }, { "epoch": 0.013656114214773432, "grad_norm": 0.4420356750488281, "learning_rate": 0.00019999501844806153, "loss": 0.7041, "step": 99 }, { "epoch": 0.013794054762397407, "grad_norm": 0.46731603145599365, "learning_rate": 0.00019999487301016982, "loss": 0.346, "step": 100 }, { "epoch": 0.013931995310021381, "grad_norm": 0.4286811053752899, "learning_rate": 0.0001999947254797186, "loss": 0.5261, "step": 101 }, { "epoch": 0.014069935857645354, "grad_norm": 0.5022907853126526, "learning_rate": 0.00019999457585671095, "loss": 0.4708, "step": 102 }, { "epoch": 0.01420787640526933, "grad_norm": 0.6970852017402649, "learning_rate": 0.00019999442414115004, "loss": 0.823, "step": 103 }, { "epoch": 0.014345816952893303, "grad_norm": 0.38055419921875, "learning_rate": 0.000199994270333039, "loss": 0.5039, "step": 104 }, { "epoch": 0.014483757500517277, "grad_norm": 0.4658089578151703, "learning_rate": 0.0001999941144323811, "loss": 0.6424, "step": 105 }, { "epoch": 0.01462169804814125, "grad_norm": 0.38460826873779297, "learning_rate": 0.00019999395643917955, "loss": 0.4712, "step": 106 }, { "epoch": 0.014759638595765226, "grad_norm": 0.4541981518268585, "learning_rate": 0.0001999937963534377, "loss": 0.6932, "step": 107 }, { "epoch": 0.0148975791433892, "grad_norm": 0.6719677448272705, "learning_rate": 0.00019999363417515887, "loss": 0.4079, "step": 108 }, { "epoch": 0.015035519691013173, "grad_norm": 0.9907113313674927, "learning_rate": 0.0001999934699043465, "loss": 0.8976, "step": 109 }, { "epoch": 0.015173460238637148, "grad_norm": 0.49312669038772583, "learning_rate": 0.00019999330354100397, "loss": 0.7548, "step": 110 }, { "epoch": 0.015311400786261121, "grad_norm": 0.718002200126648, "learning_rate": 0.0001999931350851348, "loss": 1.2318, "step": 111 }, { "epoch": 0.015449341333885095, "grad_norm": 0.4942666292190552, "learning_rate": 0.0001999929645367425, "loss": 0.6249, "step": 112 }, { "epoch": 0.01558728188150907, "grad_norm": 0.6063429117202759, "learning_rate": 0.0001999927918958306, "loss": 0.6771, "step": 113 }, { "epoch": 0.015725222429133042, "grad_norm": 0.41272974014282227, "learning_rate": 0.0001999926171624028, "loss": 0.4236, "step": 114 }, { "epoch": 0.01586316297675702, "grad_norm": 0.4982720911502838, "learning_rate": 0.0001999924403364627, "loss": 0.774, "step": 115 }, { "epoch": 0.016001103524380993, "grad_norm": 0.7102062106132507, "learning_rate": 0.00019999226141801402, "loss": 0.6597, "step": 116 }, { "epoch": 0.016139044072004966, "grad_norm": 0.6146669983863831, "learning_rate": 0.00019999208040706048, "loss": 0.7579, "step": 117 }, { "epoch": 0.01627698461962894, "grad_norm": 0.39095568656921387, "learning_rate": 0.00019999189730360585, "loss": 0.3573, "step": 118 }, { "epoch": 0.016414925167252913, "grad_norm": 0.510899007320404, "learning_rate": 0.00019999171210765404, "loss": 0.7301, "step": 119 }, { "epoch": 0.016552865714876887, "grad_norm": 0.7055239677429199, "learning_rate": 0.00019999152481920887, "loss": 0.8274, "step": 120 }, { "epoch": 0.016690806262500864, "grad_norm": 0.5454011559486389, "learning_rate": 0.00019999133543827427, "loss": 0.6872, "step": 121 }, { "epoch": 0.016828746810124837, "grad_norm": 0.889281690120697, "learning_rate": 0.0001999911439648542, "loss": 0.7992, "step": 122 }, { "epoch": 0.01696668735774881, "grad_norm": 0.5498670339584351, "learning_rate": 0.00019999095039895267, "loss": 0.7759, "step": 123 }, { "epoch": 0.017104627905372784, "grad_norm": 0.48078951239585876, "learning_rate": 0.0001999907547405737, "loss": 0.5658, "step": 124 }, { "epoch": 0.017242568452996758, "grad_norm": 0.48576879501342773, "learning_rate": 0.00019999055698972145, "loss": 0.6504, "step": 125 }, { "epoch": 0.01738050900062073, "grad_norm": 0.4521864652633667, "learning_rate": 0.0001999903571464, "loss": 0.5856, "step": 126 }, { "epoch": 0.017518449548244705, "grad_norm": 0.6498937606811523, "learning_rate": 0.00019999015521061358, "loss": 1.0937, "step": 127 }, { "epoch": 0.017656390095868682, "grad_norm": 0.5598773956298828, "learning_rate": 0.00019998995118236638, "loss": 0.4092, "step": 128 }, { "epoch": 0.017794330643492656, "grad_norm": 0.5167121887207031, "learning_rate": 0.00019998974506166265, "loss": 0.5908, "step": 129 }, { "epoch": 0.01793227119111663, "grad_norm": 0.4230504333972931, "learning_rate": 0.00019998953684850678, "loss": 0.5165, "step": 130 }, { "epoch": 0.018070211738740603, "grad_norm": 0.4121663570404053, "learning_rate": 0.00019998932654290307, "loss": 0.573, "step": 131 }, { "epoch": 0.018208152286364576, "grad_norm": 0.8193333745002747, "learning_rate": 0.0001999891141448559, "loss": 0.3659, "step": 132 }, { "epoch": 0.01834609283398855, "grad_norm": 0.5470033288002014, "learning_rate": 0.00019998889965436978, "loss": 0.4961, "step": 133 }, { "epoch": 0.018484033381612527, "grad_norm": 0.43804946541786194, "learning_rate": 0.00019998868307144913, "loss": 0.511, "step": 134 }, { "epoch": 0.0186219739292365, "grad_norm": 0.5354853272438049, "learning_rate": 0.00019998846439609852, "loss": 0.8857, "step": 135 }, { "epoch": 0.018759914476860474, "grad_norm": 0.44438180327415466, "learning_rate": 0.00019998824362832255, "loss": 0.7116, "step": 136 }, { "epoch": 0.018897855024484447, "grad_norm": 0.41642293334007263, "learning_rate": 0.0001999880207681258, "loss": 0.794, "step": 137 }, { "epoch": 0.01903579557210842, "grad_norm": 0.6521027088165283, "learning_rate": 0.00019998779581551296, "loss": 0.7167, "step": 138 }, { "epoch": 0.019173736119732394, "grad_norm": 0.45690736174583435, "learning_rate": 0.0001999875687704887, "loss": 0.4207, "step": 139 }, { "epoch": 0.019311676667356368, "grad_norm": 0.7973712682723999, "learning_rate": 0.00019998733963305784, "loss": 0.7225, "step": 140 }, { "epoch": 0.019449617214980345, "grad_norm": 0.5635350346565247, "learning_rate": 0.0001999871084032251, "loss": 0.4286, "step": 141 }, { "epoch": 0.01958755776260432, "grad_norm": 0.8758499026298523, "learning_rate": 0.00019998687508099536, "loss": 0.4271, "step": 142 }, { "epoch": 0.019725498310228292, "grad_norm": 0.4160282611846924, "learning_rate": 0.00019998663966637348, "loss": 0.6697, "step": 143 }, { "epoch": 0.019863438857852266, "grad_norm": 1.2644795179367065, "learning_rate": 0.00019998640215936438, "loss": 0.6819, "step": 144 }, { "epoch": 0.02000137940547624, "grad_norm": 0.9141055941581726, "learning_rate": 0.00019998616255997308, "loss": 0.5806, "step": 145 }, { "epoch": 0.020139319953100213, "grad_norm": 0.7500741481781006, "learning_rate": 0.00019998592086820457, "loss": 0.5593, "step": 146 }, { "epoch": 0.020277260500724186, "grad_norm": 0.49818581342697144, "learning_rate": 0.00019998567708406388, "loss": 0.5216, "step": 147 }, { "epoch": 0.020415201048348163, "grad_norm": 0.47777387499809265, "learning_rate": 0.00019998543120755612, "loss": 0.8662, "step": 148 }, { "epoch": 0.020553141595972137, "grad_norm": 0.5563037991523743, "learning_rate": 0.00019998518323868648, "loss": 0.7386, "step": 149 }, { "epoch": 0.02069108214359611, "grad_norm": 0.44376295804977417, "learning_rate": 0.0001999849331774601, "loss": 0.8587, "step": 150 }, { "epoch": 0.020829022691220084, "grad_norm": 0.7179962396621704, "learning_rate": 0.00019998468102388223, "loss": 0.7091, "step": 151 }, { "epoch": 0.020966963238844057, "grad_norm": 0.48467734456062317, "learning_rate": 0.00019998442677795814, "loss": 0.6801, "step": 152 }, { "epoch": 0.02110490378646803, "grad_norm": 0.4698256254196167, "learning_rate": 0.00019998417043969318, "loss": 0.3904, "step": 153 }, { "epoch": 0.021242844334092008, "grad_norm": 0.5565569400787354, "learning_rate": 0.0001999839120090927, "loss": 0.708, "step": 154 }, { "epoch": 0.02138078488171598, "grad_norm": 0.50385582447052, "learning_rate": 0.00019998365148616201, "loss": 0.6504, "step": 155 }, { "epoch": 0.021518725429339955, "grad_norm": 0.7868030667304993, "learning_rate": 0.00019998338887090676, "loss": 1.2273, "step": 156 }, { "epoch": 0.02165666597696393, "grad_norm": 0.5153452754020691, "learning_rate": 0.00019998312416333227, "loss": 1.1071, "step": 157 }, { "epoch": 0.021794606524587902, "grad_norm": 0.41460031270980835, "learning_rate": 0.00019998285736344418, "loss": 0.4554, "step": 158 }, { "epoch": 0.021932547072211876, "grad_norm": 0.3878832757472992, "learning_rate": 0.00019998258847124802, "loss": 0.3698, "step": 159 }, { "epoch": 0.02207048761983585, "grad_norm": 0.5454118251800537, "learning_rate": 0.00019998231748674948, "loss": 0.729, "step": 160 }, { "epoch": 0.022208428167459826, "grad_norm": 0.46344706416130066, "learning_rate": 0.00019998204440995415, "loss": 0.7578, "step": 161 }, { "epoch": 0.0223463687150838, "grad_norm": 1.1156365871429443, "learning_rate": 0.0001999817692408678, "loss": 0.8657, "step": 162 }, { "epoch": 0.022484309262707773, "grad_norm": 0.44892409443855286, "learning_rate": 0.00019998149197949613, "loss": 0.454, "step": 163 }, { "epoch": 0.022622249810331747, "grad_norm": 0.4572855532169342, "learning_rate": 0.00019998121262584503, "loss": 0.7067, "step": 164 }, { "epoch": 0.02276019035795572, "grad_norm": 0.6114012002944946, "learning_rate": 0.00019998093117992025, "loss": 0.5086, "step": 165 }, { "epoch": 0.022898130905579694, "grad_norm": 0.4878060221672058, "learning_rate": 0.00019998064764172778, "loss": 0.613, "step": 166 }, { "epoch": 0.02303607145320367, "grad_norm": 1.0030025243759155, "learning_rate": 0.00019998036201127346, "loss": 0.9149, "step": 167 }, { "epoch": 0.023174012000827644, "grad_norm": 0.7854377031326294, "learning_rate": 0.00019998007428856336, "loss": 0.2438, "step": 168 }, { "epoch": 0.023311952548451618, "grad_norm": 0.49808233976364136, "learning_rate": 0.0001999797844736034, "loss": 0.6435, "step": 169 }, { "epoch": 0.02344989309607559, "grad_norm": 0.46107861399650574, "learning_rate": 0.00019997949256639973, "loss": 0.695, "step": 170 }, { "epoch": 0.023587833643699565, "grad_norm": 1.883959412574768, "learning_rate": 0.0001999791985669584, "loss": 1.269, "step": 171 }, { "epoch": 0.02372577419132354, "grad_norm": 0.9229387044906616, "learning_rate": 0.0001999789024752856, "loss": 0.5674, "step": 172 }, { "epoch": 0.023863714738947512, "grad_norm": 0.559683084487915, "learning_rate": 0.0001999786042913875, "loss": 0.6982, "step": 173 }, { "epoch": 0.02400165528657149, "grad_norm": 0.7660646438598633, "learning_rate": 0.00019997830401527033, "loss": 0.4579, "step": 174 }, { "epoch": 0.024139595834195463, "grad_norm": 1.571911096572876, "learning_rate": 0.00019997800164694044, "loss": 1.0384, "step": 175 }, { "epoch": 0.024277536381819436, "grad_norm": 0.5258024334907532, "learning_rate": 0.00019997769718640412, "loss": 0.4984, "step": 176 }, { "epoch": 0.02441547692944341, "grad_norm": 0.40779802203178406, "learning_rate": 0.0001999773906336677, "loss": 0.3694, "step": 177 }, { "epoch": 0.024553417477067383, "grad_norm": 0.41210711002349854, "learning_rate": 0.00019997708198873763, "loss": 0.5131, "step": 178 }, { "epoch": 0.024691358024691357, "grad_norm": 0.6212690472602844, "learning_rate": 0.0001999767712516204, "loss": 1.1256, "step": 179 }, { "epoch": 0.024829298572315334, "grad_norm": 0.5157824754714966, "learning_rate": 0.00019997645842232244, "loss": 0.4837, "step": 180 }, { "epoch": 0.024967239119939307, "grad_norm": 0.5376565456390381, "learning_rate": 0.0001999761435008504, "loss": 0.8027, "step": 181 }, { "epoch": 0.02510517966756328, "grad_norm": 0.6731228828430176, "learning_rate": 0.00019997582648721075, "loss": 0.9505, "step": 182 }, { "epoch": 0.025243120215187254, "grad_norm": 0.7087239623069763, "learning_rate": 0.0001999755073814102, "loss": 0.8651, "step": 183 }, { "epoch": 0.025381060762811228, "grad_norm": 0.6519793272018433, "learning_rate": 0.00019997518618345542, "loss": 1.0137, "step": 184 }, { "epoch": 0.0255190013104352, "grad_norm": 0.5288932919502258, "learning_rate": 0.0001999748628933531, "loss": 0.8138, "step": 185 }, { "epoch": 0.025656941858059175, "grad_norm": 0.702359139919281, "learning_rate": 0.00019997453751111006, "loss": 0.7999, "step": 186 }, { "epoch": 0.025794882405683152, "grad_norm": 0.512363076210022, "learning_rate": 0.00019997421003673305, "loss": 0.8201, "step": 187 }, { "epoch": 0.025932822953307125, "grad_norm": 0.5105271935462952, "learning_rate": 0.00019997388047022897, "loss": 0.3588, "step": 188 }, { "epoch": 0.0260707635009311, "grad_norm": 0.5415822863578796, "learning_rate": 0.0001999735488116047, "loss": 0.4008, "step": 189 }, { "epoch": 0.026208704048555072, "grad_norm": 0.7703432440757751, "learning_rate": 0.00019997321506086714, "loss": 1.1452, "step": 190 }, { "epoch": 0.026346644596179046, "grad_norm": 0.46029290556907654, "learning_rate": 0.0001999728792180233, "loss": 0.5812, "step": 191 }, { "epoch": 0.02648458514380302, "grad_norm": 0.5847637057304382, "learning_rate": 0.0001999725412830803, "loss": 0.5187, "step": 192 }, { "epoch": 0.026622525691426997, "grad_norm": 0.4251965284347534, "learning_rate": 0.0001999722012560451, "loss": 0.5953, "step": 193 }, { "epoch": 0.02676046623905097, "grad_norm": 0.38125622272491455, "learning_rate": 0.0001999718591369248, "loss": 0.3066, "step": 194 }, { "epoch": 0.026898406786674944, "grad_norm": 0.384884774684906, "learning_rate": 0.00019997151492572664, "loss": 0.6125, "step": 195 }, { "epoch": 0.027036347334298917, "grad_norm": 0.499237596988678, "learning_rate": 0.00019997116862245778, "loss": 0.7623, "step": 196 }, { "epoch": 0.02717428788192289, "grad_norm": 0.3985907733440399, "learning_rate": 0.0001999708202271255, "loss": 0.4129, "step": 197 }, { "epoch": 0.027312228429546864, "grad_norm": 0.5830117464065552, "learning_rate": 0.00019997046973973704, "loss": 0.7606, "step": 198 }, { "epoch": 0.027450168977170838, "grad_norm": 0.43591436743736267, "learning_rate": 0.00019997011716029977, "loss": 0.7417, "step": 199 }, { "epoch": 0.027588109524794815, "grad_norm": 0.46609047055244446, "learning_rate": 0.00019996976248882103, "loss": 0.5169, "step": 200 }, { "epoch": 0.027588109524794815, "eval_loss": 0.8889594674110413, "eval_runtime": 23.752, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 200 }, { "epoch": 0.02772605007241879, "grad_norm": 0.567881166934967, "learning_rate": 0.0001999694057253083, "loss": 0.3107, "step": 201 }, { "epoch": 0.027863990620042762, "grad_norm": 0.4620385468006134, "learning_rate": 0.00019996904686976903, "loss": 0.5329, "step": 202 }, { "epoch": 0.028001931167666735, "grad_norm": 0.5823633074760437, "learning_rate": 0.0001999686859222107, "loss": 0.8301, "step": 203 }, { "epoch": 0.02813987171529071, "grad_norm": 0.6678638458251953, "learning_rate": 0.00019996832288264092, "loss": 0.5633, "step": 204 }, { "epoch": 0.028277812262914682, "grad_norm": 0.6890857815742493, "learning_rate": 0.0001999679577510672, "loss": 1.0085, "step": 205 }, { "epoch": 0.02841575281053866, "grad_norm": 1.7328273057937622, "learning_rate": 0.00019996759052749724, "loss": 0.7141, "step": 206 }, { "epoch": 0.028553693358162633, "grad_norm": 0.5034376382827759, "learning_rate": 0.00019996722121193872, "loss": 0.9981, "step": 207 }, { "epoch": 0.028691633905786607, "grad_norm": 0.5651301741600037, "learning_rate": 0.00019996684980439934, "loss": 0.598, "step": 208 }, { "epoch": 0.02882957445341058, "grad_norm": 0.5304937958717346, "learning_rate": 0.0001999664763048869, "loss": 0.695, "step": 209 }, { "epoch": 0.028967515001034554, "grad_norm": 0.49088260531425476, "learning_rate": 0.00019996610071340925, "loss": 0.3266, "step": 210 }, { "epoch": 0.029105455548658527, "grad_norm": 0.6354581117630005, "learning_rate": 0.00019996572302997422, "loss": 0.3793, "step": 211 }, { "epoch": 0.0292433960962825, "grad_norm": 0.8221268057823181, "learning_rate": 0.00019996534325458966, "loss": 0.3813, "step": 212 }, { "epoch": 0.029381336643906478, "grad_norm": 0.5460823178291321, "learning_rate": 0.00019996496138726354, "loss": 0.9557, "step": 213 }, { "epoch": 0.02951927719153045, "grad_norm": 0.9476891756057739, "learning_rate": 0.00019996457742800392, "loss": 1.6475, "step": 214 }, { "epoch": 0.029657217739154425, "grad_norm": 0.5727168917655945, "learning_rate": 0.00019996419137681878, "loss": 0.5071, "step": 215 }, { "epoch": 0.0297951582867784, "grad_norm": 0.8874626159667969, "learning_rate": 0.0001999638032337162, "loss": 0.7107, "step": 216 }, { "epoch": 0.029933098834402372, "grad_norm": 0.5887399911880493, "learning_rate": 0.0001999634129987043, "loss": 0.6336, "step": 217 }, { "epoch": 0.030071039382026345, "grad_norm": 0.7286300659179688, "learning_rate": 0.00019996302067179123, "loss": 1.0568, "step": 218 }, { "epoch": 0.030208979929650322, "grad_norm": 0.5702489614486694, "learning_rate": 0.00019996262625298527, "loss": 0.4868, "step": 219 }, { "epoch": 0.030346920477274296, "grad_norm": 0.5651228427886963, "learning_rate": 0.00019996222974229464, "loss": 0.6292, "step": 220 }, { "epoch": 0.03048486102489827, "grad_norm": 0.8664615750312805, "learning_rate": 0.0001999618311397276, "loss": 0.6027, "step": 221 }, { "epoch": 0.030622801572522243, "grad_norm": 0.5164960622787476, "learning_rate": 0.00019996143044529255, "loss": 0.4432, "step": 222 }, { "epoch": 0.030760742120146217, "grad_norm": 0.6755353808403015, "learning_rate": 0.0001999610276589978, "loss": 0.711, "step": 223 }, { "epoch": 0.03089868266777019, "grad_norm": 0.6317562460899353, "learning_rate": 0.00019996062278085185, "loss": 0.4977, "step": 224 }, { "epoch": 0.031036623215394164, "grad_norm": 0.49837130308151245, "learning_rate": 0.00019996021581086313, "loss": 0.8036, "step": 225 }, { "epoch": 0.03117456376301814, "grad_norm": 0.6384889483451843, "learning_rate": 0.00019995980674904015, "loss": 0.5388, "step": 226 }, { "epoch": 0.031312504310642114, "grad_norm": 0.49564528465270996, "learning_rate": 0.00019995939559539153, "loss": 0.4529, "step": 227 }, { "epoch": 0.031450444858266084, "grad_norm": 0.4562602937221527, "learning_rate": 0.00019995898234992585, "loss": 0.6436, "step": 228 }, { "epoch": 0.03158838540589006, "grad_norm": 0.5857105851173401, "learning_rate": 0.00019995856701265173, "loss": 0.5656, "step": 229 }, { "epoch": 0.03172632595351404, "grad_norm": 0.4887877404689789, "learning_rate": 0.00019995814958357784, "loss": 0.4531, "step": 230 }, { "epoch": 0.03186426650113801, "grad_norm": 0.4763462543487549, "learning_rate": 0.00019995773006271298, "loss": 0.4739, "step": 231 }, { "epoch": 0.032002207048761985, "grad_norm": 2.070396900177002, "learning_rate": 0.0001999573084500659, "loss": 1.0752, "step": 232 }, { "epoch": 0.032140147596385955, "grad_norm": 0.4410666823387146, "learning_rate": 0.00019995688474564542, "loss": 0.6371, "step": 233 }, { "epoch": 0.03227808814400993, "grad_norm": 0.5890915393829346, "learning_rate": 0.00019995645894946043, "loss": 0.7982, "step": 234 }, { "epoch": 0.0324160286916339, "grad_norm": 0.4370487332344055, "learning_rate": 0.00019995603106151978, "loss": 0.5403, "step": 235 }, { "epoch": 0.03255396923925788, "grad_norm": 0.6569862365722656, "learning_rate": 0.0001999556010818325, "loss": 0.6147, "step": 236 }, { "epoch": 0.032691909786881856, "grad_norm": 0.5827431082725525, "learning_rate": 0.00019995516901040754, "loss": 0.7185, "step": 237 }, { "epoch": 0.032829850334505827, "grad_norm": 0.35179388523101807, "learning_rate": 0.00019995473484725395, "loss": 0.5124, "step": 238 }, { "epoch": 0.032967790882129804, "grad_norm": 0.5688872933387756, "learning_rate": 0.00019995429859238085, "loss": 0.7586, "step": 239 }, { "epoch": 0.033105731429753774, "grad_norm": 0.42493298649787903, "learning_rate": 0.00019995386024579729, "loss": 0.3505, "step": 240 }, { "epoch": 0.03324367197737775, "grad_norm": 0.42503833770751953, "learning_rate": 0.0001999534198075125, "loss": 0.5089, "step": 241 }, { "epoch": 0.03338161252500173, "grad_norm": 0.4994729459285736, "learning_rate": 0.00019995297727753574, "loss": 0.4223, "step": 242 }, { "epoch": 0.0335195530726257, "grad_norm": 0.3687988221645355, "learning_rate": 0.00019995253265587618, "loss": 0.6847, "step": 243 }, { "epoch": 0.033657493620249675, "grad_norm": 0.45075079798698425, "learning_rate": 0.0001999520859425432, "loss": 0.5617, "step": 244 }, { "epoch": 0.033795434167873645, "grad_norm": 0.7460540533065796, "learning_rate": 0.00019995163713754606, "loss": 0.5124, "step": 245 }, { "epoch": 0.03393337471549762, "grad_norm": 0.5067691802978516, "learning_rate": 0.0001999511862408942, "loss": 0.6261, "step": 246 }, { "epoch": 0.03407131526312159, "grad_norm": 0.4315679967403412, "learning_rate": 0.00019995073325259713, "loss": 0.3627, "step": 247 }, { "epoch": 0.03420925581074557, "grad_norm": 1.7428961992263794, "learning_rate": 0.00019995027817266422, "loss": 1.4047, "step": 248 }, { "epoch": 0.034347196358369546, "grad_norm": 0.41287699341773987, "learning_rate": 0.000199949821001105, "loss": 0.4793, "step": 249 }, { "epoch": 0.034485136905993516, "grad_norm": 0.5184664726257324, "learning_rate": 0.00019994936173792913, "loss": 0.4743, "step": 250 }, { "epoch": 0.03462307745361749, "grad_norm": 0.45465287566185, "learning_rate": 0.00019994890038314614, "loss": 0.4442, "step": 251 }, { "epoch": 0.03476101800124146, "grad_norm": 0.7160200476646423, "learning_rate": 0.00019994843693676572, "loss": 0.9555, "step": 252 }, { "epoch": 0.03489895854886544, "grad_norm": 0.4697338342666626, "learning_rate": 0.00019994797139879752, "loss": 0.6089, "step": 253 }, { "epoch": 0.03503689909648941, "grad_norm": 0.6399049758911133, "learning_rate": 0.00019994750376925132, "loss": 0.747, "step": 254 }, { "epoch": 0.03517483964411339, "grad_norm": 0.6027230024337769, "learning_rate": 0.00019994703404813694, "loss": 0.8733, "step": 255 }, { "epoch": 0.035312780191737364, "grad_norm": 0.4540584981441498, "learning_rate": 0.00019994656223546416, "loss": 0.9172, "step": 256 }, { "epoch": 0.035450720739361334, "grad_norm": 0.5152713656425476, "learning_rate": 0.00019994608833124285, "loss": 0.8899, "step": 257 }, { "epoch": 0.03558866128698531, "grad_norm": 1.2631217241287231, "learning_rate": 0.00019994561233548295, "loss": 0.4138, "step": 258 }, { "epoch": 0.03572660183460928, "grad_norm": 0.5689836740493774, "learning_rate": 0.00019994513424819439, "loss": 0.601, "step": 259 }, { "epoch": 0.03586454238223326, "grad_norm": 0.3746460974216461, "learning_rate": 0.00019994465406938722, "loss": 0.6366, "step": 260 }, { "epoch": 0.03600248292985723, "grad_norm": 0.49796271324157715, "learning_rate": 0.00019994417179907146, "loss": 0.8401, "step": 261 }, { "epoch": 0.036140423477481205, "grad_norm": 0.5409201383590698, "learning_rate": 0.00019994368743725724, "loss": 0.5549, "step": 262 }, { "epoch": 0.03627836402510518, "grad_norm": 0.4652073085308075, "learning_rate": 0.00019994320098395464, "loss": 0.6272, "step": 263 }, { "epoch": 0.03641630457272915, "grad_norm": 0.4900420904159546, "learning_rate": 0.00019994271243917386, "loss": 0.665, "step": 264 }, { "epoch": 0.03655424512035313, "grad_norm": 0.5045313239097595, "learning_rate": 0.00019994222180292517, "loss": 0.4341, "step": 265 }, { "epoch": 0.0366921856679771, "grad_norm": 0.9025049805641174, "learning_rate": 0.00019994172907521876, "loss": 1.0971, "step": 266 }, { "epoch": 0.036830126215601076, "grad_norm": 0.4778205454349518, "learning_rate": 0.00019994123425606496, "loss": 1.0434, "step": 267 }, { "epoch": 0.03696806676322505, "grad_norm": 0.5192625522613525, "learning_rate": 0.0001999407373454742, "loss": 0.543, "step": 268 }, { "epoch": 0.037106007310849023, "grad_norm": 0.5512039065361023, "learning_rate": 0.00019994023834345677, "loss": 0.6199, "step": 269 }, { "epoch": 0.037243947858473, "grad_norm": 0.6469480395317078, "learning_rate": 0.00019993973725002318, "loss": 0.7352, "step": 270 }, { "epoch": 0.03738188840609697, "grad_norm": 0.6875861883163452, "learning_rate": 0.0001999392340651839, "loss": 1.0221, "step": 271 }, { "epoch": 0.03751982895372095, "grad_norm": 0.5370146632194519, "learning_rate": 0.00019993872878894945, "loss": 0.5557, "step": 272 }, { "epoch": 0.03765776950134492, "grad_norm": 0.5389960408210754, "learning_rate": 0.00019993822142133042, "loss": 0.7089, "step": 273 }, { "epoch": 0.037795710048968895, "grad_norm": 0.5115049481391907, "learning_rate": 0.00019993771196233744, "loss": 0.7516, "step": 274 }, { "epoch": 0.03793365059659287, "grad_norm": 0.6588063836097717, "learning_rate": 0.00019993720041198114, "loss": 0.5807, "step": 275 }, { "epoch": 0.03807159114421684, "grad_norm": 6.3809099197387695, "learning_rate": 0.00019993668677027225, "loss": 1.5417, "step": 276 }, { "epoch": 0.03820953169184082, "grad_norm": 0.5396524667739868, "learning_rate": 0.0001999361710372215, "loss": 0.5337, "step": 277 }, { "epoch": 0.03834747223946479, "grad_norm": 0.5248252749443054, "learning_rate": 0.00019993565321283966, "loss": 0.6824, "step": 278 }, { "epoch": 0.038485412787088766, "grad_norm": 0.7482089996337891, "learning_rate": 0.0001999351332971376, "loss": 0.9686, "step": 279 }, { "epoch": 0.038623353334712736, "grad_norm": 1.299159049987793, "learning_rate": 0.00019993461129012624, "loss": 0.5376, "step": 280 }, { "epoch": 0.03876129388233671, "grad_norm": 0.7115864753723145, "learning_rate": 0.00019993408719181643, "loss": 0.463, "step": 281 }, { "epoch": 0.03889923442996069, "grad_norm": 0.523969829082489, "learning_rate": 0.0001999335610022192, "loss": 0.25, "step": 282 }, { "epoch": 0.03903717497758466, "grad_norm": 0.6485568284988403, "learning_rate": 0.00019993303272134547, "loss": 0.5015, "step": 283 }, { "epoch": 0.03917511552520864, "grad_norm": 0.5250431299209595, "learning_rate": 0.00019993250234920636, "loss": 0.545, "step": 284 }, { "epoch": 0.03931305607283261, "grad_norm": 0.5995900630950928, "learning_rate": 0.000199931969885813, "loss": 0.7626, "step": 285 }, { "epoch": 0.039450996620456584, "grad_norm": 0.45174479484558105, "learning_rate": 0.0001999314353311765, "loss": 0.4052, "step": 286 }, { "epoch": 0.039588937168080554, "grad_norm": 0.5686671137809753, "learning_rate": 0.000199930898685308, "loss": 1.0164, "step": 287 }, { "epoch": 0.03972687771570453, "grad_norm": 0.46925264596939087, "learning_rate": 0.0001999303599482188, "loss": 0.6088, "step": 288 }, { "epoch": 0.03986481826332851, "grad_norm": 0.6501463651657104, "learning_rate": 0.00019992981911992014, "loss": 0.8467, "step": 289 }, { "epoch": 0.04000275881095248, "grad_norm": 0.4021241366863251, "learning_rate": 0.00019992927620042332, "loss": 0.4012, "step": 290 }, { "epoch": 0.040140699358576455, "grad_norm": 0.4685521721839905, "learning_rate": 0.00019992873118973976, "loss": 0.3325, "step": 291 }, { "epoch": 0.040278639906200425, "grad_norm": 0.4316225051879883, "learning_rate": 0.0001999281840878808, "loss": 0.5658, "step": 292 }, { "epoch": 0.0404165804538244, "grad_norm": 0.5590253472328186, "learning_rate": 0.00019992763489485796, "loss": 0.735, "step": 293 }, { "epoch": 0.04055452100144837, "grad_norm": 0.40875402092933655, "learning_rate": 0.00019992708361068267, "loss": 0.5094, "step": 294 }, { "epoch": 0.04069246154907235, "grad_norm": 0.41538792848587036, "learning_rate": 0.00019992653023536647, "loss": 0.4938, "step": 295 }, { "epoch": 0.040830402096696326, "grad_norm": 3.5067529678344727, "learning_rate": 0.00019992597476892096, "loss": 0.7828, "step": 296 }, { "epoch": 0.040968342644320296, "grad_norm": 0.4723116159439087, "learning_rate": 0.00019992541721135777, "loss": 0.657, "step": 297 }, { "epoch": 0.04110628319194427, "grad_norm": 0.4845767021179199, "learning_rate": 0.0001999248575626886, "loss": 0.3398, "step": 298 }, { "epoch": 0.04124422373956824, "grad_norm": 0.4803406894207001, "learning_rate": 0.00019992429582292508, "loss": 0.4603, "step": 299 }, { "epoch": 0.04138216428719222, "grad_norm": 0.5879464149475098, "learning_rate": 0.00019992373199207903, "loss": 0.7187, "step": 300 }, { "epoch": 0.0415201048348162, "grad_norm": 0.45234090089797974, "learning_rate": 0.0001999231660701622, "loss": 0.694, "step": 301 }, { "epoch": 0.04165804538244017, "grad_norm": 0.6118064522743225, "learning_rate": 0.00019992259805718648, "loss": 0.579, "step": 302 }, { "epoch": 0.041795985930064145, "grad_norm": 0.6022830009460449, "learning_rate": 0.00019992202795316374, "loss": 0.4436, "step": 303 }, { "epoch": 0.041933926477688115, "grad_norm": 0.5346994400024414, "learning_rate": 0.0001999214557581059, "loss": 0.4457, "step": 304 }, { "epoch": 0.04207186702531209, "grad_norm": 0.35214391350746155, "learning_rate": 0.00019992088147202493, "loss": 0.3394, "step": 305 }, { "epoch": 0.04220980757293606, "grad_norm": 0.8591717481613159, "learning_rate": 0.00019992030509493288, "loss": 0.9673, "step": 306 }, { "epoch": 0.04234774812056004, "grad_norm": 0.3884176015853882, "learning_rate": 0.00019991972662684177, "loss": 0.255, "step": 307 }, { "epoch": 0.042485688668184016, "grad_norm": 0.549793004989624, "learning_rate": 0.00019991914606776371, "loss": 0.4176, "step": 308 }, { "epoch": 0.042623629215807986, "grad_norm": 0.5462009906768799, "learning_rate": 0.0001999185634177109, "loss": 0.5013, "step": 309 }, { "epoch": 0.04276156976343196, "grad_norm": 0.5183460116386414, "learning_rate": 0.0001999179786766955, "loss": 0.8391, "step": 310 }, { "epoch": 0.04289951031105593, "grad_norm": 0.6750245094299316, "learning_rate": 0.00019991739184472974, "loss": 0.7856, "step": 311 }, { "epoch": 0.04303745085867991, "grad_norm": 0.49737048149108887, "learning_rate": 0.00019991680292182586, "loss": 0.7965, "step": 312 }, { "epoch": 0.04317539140630388, "grad_norm": 0.5741978883743286, "learning_rate": 0.00019991621190799627, "loss": 0.6435, "step": 313 }, { "epoch": 0.04331333195392786, "grad_norm": 0.6549875736236572, "learning_rate": 0.0001999156188032533, "loss": 0.5347, "step": 314 }, { "epoch": 0.043451272501551834, "grad_norm": 0.5544412136077881, "learning_rate": 0.00019991502360760933, "loss": 0.5398, "step": 315 }, { "epoch": 0.043589213049175804, "grad_norm": 0.4168173670768738, "learning_rate": 0.00019991442632107687, "loss": 0.567, "step": 316 }, { "epoch": 0.04372715359679978, "grad_norm": 0.6658691763877869, "learning_rate": 0.0001999138269436684, "loss": 0.8135, "step": 317 }, { "epoch": 0.04386509414442375, "grad_norm": 0.507588803768158, "learning_rate": 0.00019991322547539643, "loss": 0.5389, "step": 318 }, { "epoch": 0.04400303469204773, "grad_norm": 0.7096207737922668, "learning_rate": 0.0001999126219162736, "loss": 0.7837, "step": 319 }, { "epoch": 0.0441409752396717, "grad_norm": 0.7609978318214417, "learning_rate": 0.0001999120162663125, "loss": 0.8759, "step": 320 }, { "epoch": 0.044278915787295675, "grad_norm": 0.599826991558075, "learning_rate": 0.00019991140852552582, "loss": 0.6457, "step": 321 }, { "epoch": 0.04441685633491965, "grad_norm": 0.5656265616416931, "learning_rate": 0.0001999107986939263, "loss": 0.6269, "step": 322 }, { "epoch": 0.04455479688254362, "grad_norm": 0.5724309086799622, "learning_rate": 0.00019991018677152663, "loss": 0.9758, "step": 323 }, { "epoch": 0.0446927374301676, "grad_norm": 2.0058224201202393, "learning_rate": 0.00019990957275833968, "loss": 0.8825, "step": 324 }, { "epoch": 0.04483067797779157, "grad_norm": 0.6560097932815552, "learning_rate": 0.00019990895665437827, "loss": 0.8677, "step": 325 }, { "epoch": 0.044968618525415546, "grad_norm": 0.7454385161399841, "learning_rate": 0.00019990833845965535, "loss": 0.6125, "step": 326 }, { "epoch": 0.04510655907303952, "grad_norm": 0.4617150127887726, "learning_rate": 0.00019990771817418376, "loss": 0.4493, "step": 327 }, { "epoch": 0.04524449962066349, "grad_norm": 0.6617423295974731, "learning_rate": 0.00019990709579797658, "loss": 0.6253, "step": 328 }, { "epoch": 0.04538244016828747, "grad_norm": 0.467551589012146, "learning_rate": 0.00019990647133104675, "loss": 0.5161, "step": 329 }, { "epoch": 0.04552038071591144, "grad_norm": 0.575173020362854, "learning_rate": 0.0001999058447734074, "loss": 0.6497, "step": 330 }, { "epoch": 0.04565832126353542, "grad_norm": 0.5697531700134277, "learning_rate": 0.0001999052161250716, "loss": 0.6676, "step": 331 }, { "epoch": 0.04579626181115939, "grad_norm": 0.6267597079277039, "learning_rate": 0.00019990458538605253, "loss": 0.7716, "step": 332 }, { "epoch": 0.045934202358783364, "grad_norm": 0.7443068027496338, "learning_rate": 0.00019990395255636338, "loss": 0.9048, "step": 333 }, { "epoch": 0.04607214290640734, "grad_norm": 0.5780167579650879, "learning_rate": 0.0001999033176360174, "loss": 0.6191, "step": 334 }, { "epoch": 0.04621008345403131, "grad_norm": 0.4723268449306488, "learning_rate": 0.00019990268062502784, "loss": 0.5335, "step": 335 }, { "epoch": 0.04634802400165529, "grad_norm": 0.7877517938613892, "learning_rate": 0.0001999020415234081, "loss": 0.6372, "step": 336 }, { "epoch": 0.04648596454927926, "grad_norm": 0.6168155670166016, "learning_rate": 0.00019990140033117147, "loss": 0.4169, "step": 337 }, { "epoch": 0.046623905096903236, "grad_norm": 0.5653532147407532, "learning_rate": 0.00019990075704833147, "loss": 0.6975, "step": 338 }, { "epoch": 0.046761845644527206, "grad_norm": 0.40092945098876953, "learning_rate": 0.00019990011167490148, "loss": 0.81, "step": 339 }, { "epoch": 0.04689978619215118, "grad_norm": 0.3615861237049103, "learning_rate": 0.00019989946421089503, "loss": 0.4071, "step": 340 }, { "epoch": 0.04703772673977516, "grad_norm": 0.6372112035751343, "learning_rate": 0.0001998988146563257, "loss": 0.6211, "step": 341 }, { "epoch": 0.04717566728739913, "grad_norm": 0.5781791806221008, "learning_rate": 0.00019989816301120702, "loss": 0.7205, "step": 342 }, { "epoch": 0.04731360783502311, "grad_norm": 0.4111423194408417, "learning_rate": 0.00019989750927555265, "loss": 0.5174, "step": 343 }, { "epoch": 0.04745154838264708, "grad_norm": 0.3542708456516266, "learning_rate": 0.00019989685344937633, "loss": 0.6286, "step": 344 }, { "epoch": 0.047589488930271054, "grad_norm": 0.49733197689056396, "learning_rate": 0.00019989619553269168, "loss": 0.955, "step": 345 }, { "epoch": 0.047727429477895024, "grad_norm": 0.6014874577522278, "learning_rate": 0.00019989553552551254, "loss": 0.7402, "step": 346 }, { "epoch": 0.047865370025519, "grad_norm": 0.6734299659729004, "learning_rate": 0.00019989487342785274, "loss": 1.0639, "step": 347 }, { "epoch": 0.04800331057314298, "grad_norm": 0.6077990531921387, "learning_rate": 0.00019989420923972607, "loss": 0.8076, "step": 348 }, { "epoch": 0.04814125112076695, "grad_norm": 1.3071717023849487, "learning_rate": 0.00019989354296114645, "loss": 0.5172, "step": 349 }, { "epoch": 0.048279191668390925, "grad_norm": 0.4025424122810364, "learning_rate": 0.00019989287459212785, "loss": 0.485, "step": 350 }, { "epoch": 0.048417132216014895, "grad_norm": 0.4533528983592987, "learning_rate": 0.00019989220413268424, "loss": 0.5435, "step": 351 }, { "epoch": 0.04855507276363887, "grad_norm": 0.41913115978240967, "learning_rate": 0.00019989153158282962, "loss": 0.374, "step": 352 }, { "epoch": 0.04869301331126285, "grad_norm": 0.7913991212844849, "learning_rate": 0.0001998908569425781, "loss": 0.5623, "step": 353 }, { "epoch": 0.04883095385888682, "grad_norm": 0.4342532157897949, "learning_rate": 0.00019989018021194382, "loss": 0.5775, "step": 354 }, { "epoch": 0.048968894406510796, "grad_norm": 0.6135227084159851, "learning_rate": 0.00019988950139094092, "loss": 0.9293, "step": 355 }, { "epoch": 0.049106834954134766, "grad_norm": 0.5631657838821411, "learning_rate": 0.00019988882047958356, "loss": 0.493, "step": 356 }, { "epoch": 0.04924477550175874, "grad_norm": 0.8268153667449951, "learning_rate": 0.00019988813747788607, "loss": 0.8949, "step": 357 }, { "epoch": 0.04938271604938271, "grad_norm": 0.5059098601341248, "learning_rate": 0.00019988745238586265, "loss": 0.5139, "step": 358 }, { "epoch": 0.04952065659700669, "grad_norm": 0.5837445855140686, "learning_rate": 0.00019988676520352777, "loss": 0.6238, "step": 359 }, { "epoch": 0.04965859714463067, "grad_norm": 0.9709502458572388, "learning_rate": 0.00019988607593089566, "loss": 0.8823, "step": 360 }, { "epoch": 0.04979653769225464, "grad_norm": 0.6761531233787537, "learning_rate": 0.00019988538456798083, "loss": 0.5401, "step": 361 }, { "epoch": 0.049934478239878614, "grad_norm": 0.6025043725967407, "learning_rate": 0.00019988469111479774, "loss": 0.8026, "step": 362 }, { "epoch": 0.050072418787502584, "grad_norm": 0.4677552878856659, "learning_rate": 0.00019988399557136085, "loss": 0.5568, "step": 363 }, { "epoch": 0.05021035933512656, "grad_norm": 0.4840632379055023, "learning_rate": 0.00019988329793768481, "loss": 0.5436, "step": 364 }, { "epoch": 0.05034829988275053, "grad_norm": 0.5546537041664124, "learning_rate": 0.00019988259821378414, "loss": 0.5336, "step": 365 }, { "epoch": 0.05048624043037451, "grad_norm": 0.5456935167312622, "learning_rate": 0.00019988189639967354, "loss": 0.4862, "step": 366 }, { "epoch": 0.050624180977998486, "grad_norm": 0.5957455039024353, "learning_rate": 0.00019988119249536763, "loss": 0.536, "step": 367 }, { "epoch": 0.050762121525622456, "grad_norm": 0.43449798226356506, "learning_rate": 0.0001998804865008812, "loss": 0.4695, "step": 368 }, { "epoch": 0.05090006207324643, "grad_norm": 0.5384145379066467, "learning_rate": 0.000199879778416229, "loss": 0.7314, "step": 369 }, { "epoch": 0.0510380026208704, "grad_norm": 0.5711908936500549, "learning_rate": 0.0001998790682414258, "loss": 0.9143, "step": 370 }, { "epoch": 0.05117594316849438, "grad_norm": 0.6064605712890625, "learning_rate": 0.00019987835597648656, "loss": 0.3422, "step": 371 }, { "epoch": 0.05131388371611835, "grad_norm": 0.881544828414917, "learning_rate": 0.00019987764162142613, "loss": 1.4164, "step": 372 }, { "epoch": 0.05145182426374233, "grad_norm": 0.529297947883606, "learning_rate": 0.00019987692517625948, "loss": 0.5785, "step": 373 }, { "epoch": 0.051589764811366304, "grad_norm": 0.6973689198493958, "learning_rate": 0.00019987620664100155, "loss": 0.3716, "step": 374 }, { "epoch": 0.051727705358990274, "grad_norm": 0.45007166266441345, "learning_rate": 0.00019987548601566743, "loss": 0.7802, "step": 375 }, { "epoch": 0.05186564590661425, "grad_norm": 0.5009759068489075, "learning_rate": 0.0001998747633002722, "loss": 1.0373, "step": 376 }, { "epoch": 0.05200358645423822, "grad_norm": 0.5319408178329468, "learning_rate": 0.00019987403849483094, "loss": 0.671, "step": 377 }, { "epoch": 0.0521415270018622, "grad_norm": 0.4193972945213318, "learning_rate": 0.00019987331159935888, "loss": 0.6209, "step": 378 }, { "epoch": 0.052279467549486175, "grad_norm": 0.8057774901390076, "learning_rate": 0.00019987258261387116, "loss": 0.6371, "step": 379 }, { "epoch": 0.052417408097110145, "grad_norm": 0.4312836229801178, "learning_rate": 0.0001998718515383831, "loss": 0.8446, "step": 380 }, { "epoch": 0.05255534864473412, "grad_norm": 0.48356157541275024, "learning_rate": 0.00019987111837290998, "loss": 0.6438, "step": 381 }, { "epoch": 0.05269328919235809, "grad_norm": 0.47384414076805115, "learning_rate": 0.00019987038311746708, "loss": 0.5999, "step": 382 }, { "epoch": 0.05283122973998207, "grad_norm": 0.4600420594215393, "learning_rate": 0.0001998696457720699, "loss": 0.7346, "step": 383 }, { "epoch": 0.05296917028760604, "grad_norm": 0.6309019327163696, "learning_rate": 0.0001998689063367338, "loss": 0.9566, "step": 384 }, { "epoch": 0.053107110835230016, "grad_norm": 0.4424024224281311, "learning_rate": 0.00019986816481147425, "loss": 0.5094, "step": 385 }, { "epoch": 0.05324505138285399, "grad_norm": 0.3627796173095703, "learning_rate": 0.00019986742119630676, "loss": 0.223, "step": 386 }, { "epoch": 0.05338299193047796, "grad_norm": 0.4305715262889862, "learning_rate": 0.00019986667549124696, "loss": 0.533, "step": 387 }, { "epoch": 0.05352093247810194, "grad_norm": 0.9336516261100769, "learning_rate": 0.0001998659276963104, "loss": 0.8498, "step": 388 }, { "epoch": 0.05365887302572591, "grad_norm": 0.4217759370803833, "learning_rate": 0.00019986517781151272, "loss": 0.641, "step": 389 }, { "epoch": 0.05379681357334989, "grad_norm": 0.5158895254135132, "learning_rate": 0.00019986442583686963, "loss": 0.3782, "step": 390 }, { "epoch": 0.05393475412097386, "grad_norm": 0.4665093421936035, "learning_rate": 0.00019986367177239687, "loss": 0.5374, "step": 391 }, { "epoch": 0.054072694668597834, "grad_norm": 0.6842906475067139, "learning_rate": 0.00019986291561811023, "loss": 0.7518, "step": 392 }, { "epoch": 0.05421063521622181, "grad_norm": 0.8710291981697083, "learning_rate": 0.00019986215737402552, "loss": 1.1841, "step": 393 }, { "epoch": 0.05434857576384578, "grad_norm": 0.4958980977535248, "learning_rate": 0.0001998613970401586, "loss": 0.4862, "step": 394 }, { "epoch": 0.05448651631146976, "grad_norm": 0.5832077860832214, "learning_rate": 0.0001998606346165254, "loss": 0.4542, "step": 395 }, { "epoch": 0.05462445685909373, "grad_norm": 0.9587634801864624, "learning_rate": 0.00019985987010314184, "loss": 0.6823, "step": 396 }, { "epoch": 0.054762397406717706, "grad_norm": 0.5516191720962524, "learning_rate": 0.00019985910350002394, "loss": 0.6536, "step": 397 }, { "epoch": 0.054900337954341676, "grad_norm": 0.5614568591117859, "learning_rate": 0.00019985833480718778, "loss": 0.7727, "step": 398 }, { "epoch": 0.05503827850196565, "grad_norm": 0.43879610300064087, "learning_rate": 0.0001998575640246494, "loss": 0.3453, "step": 399 }, { "epoch": 0.05517621904958963, "grad_norm": 1.1493773460388184, "learning_rate": 0.00019985679115242495, "loss": 1.0769, "step": 400 }, { "epoch": 0.05517621904958963, "eval_loss": 0.8703657388687134, "eval_runtime": 23.7666, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 400 }, { "epoch": 0.0553141595972136, "grad_norm": 0.7394590377807617, "learning_rate": 0.00019985601619053057, "loss": 0.7617, "step": 401 }, { "epoch": 0.05545210014483758, "grad_norm": 0.7590972185134888, "learning_rate": 0.00019985523913898252, "loss": 1.1592, "step": 402 }, { "epoch": 0.05559004069246155, "grad_norm": 0.5156362056732178, "learning_rate": 0.00019985445999779703, "loss": 0.5136, "step": 403 }, { "epoch": 0.055727981240085524, "grad_norm": 0.45237261056900024, "learning_rate": 0.00019985367876699045, "loss": 0.5057, "step": 404 }, { "epoch": 0.055865921787709494, "grad_norm": 0.7329659461975098, "learning_rate": 0.0001998528954465791, "loss": 0.6923, "step": 405 }, { "epoch": 0.05600386233533347, "grad_norm": 0.4460107684135437, "learning_rate": 0.0001998521100365793, "loss": 0.6953, "step": 406 }, { "epoch": 0.05614180288295745, "grad_norm": 0.5005888342857361, "learning_rate": 0.0001998513225370076, "loss": 0.3755, "step": 407 }, { "epoch": 0.05627974343058142, "grad_norm": 3.537179470062256, "learning_rate": 0.00019985053294788044, "loss": 0.6381, "step": 408 }, { "epoch": 0.056417683978205395, "grad_norm": 0.47960856556892395, "learning_rate": 0.00019984974126921438, "loss": 0.7603, "step": 409 }, { "epoch": 0.056555624525829365, "grad_norm": 0.46081554889678955, "learning_rate": 0.00019984894750102588, "loss": 0.6036, "step": 410 }, { "epoch": 0.05669356507345334, "grad_norm": 0.47110533714294434, "learning_rate": 0.00019984815164333163, "loss": 0.4789, "step": 411 }, { "epoch": 0.05683150562107732, "grad_norm": 0.70445716381073, "learning_rate": 0.00019984735369614828, "loss": 0.6804, "step": 412 }, { "epoch": 0.05696944616870129, "grad_norm": 0.484677255153656, "learning_rate": 0.00019984655365949252, "loss": 0.5013, "step": 413 }, { "epoch": 0.057107386716325266, "grad_norm": 1.0418232679367065, "learning_rate": 0.0001998457515333811, "loss": 0.5937, "step": 414 }, { "epoch": 0.057245327263949236, "grad_norm": 0.512485146522522, "learning_rate": 0.00019984494731783078, "loss": 0.5499, "step": 415 }, { "epoch": 0.05738326781157321, "grad_norm": 4.174819469451904, "learning_rate": 0.00019984414101285842, "loss": 0.5912, "step": 416 }, { "epoch": 0.05752120835919718, "grad_norm": 0.40738022327423096, "learning_rate": 0.00019984333261848089, "loss": 0.2472, "step": 417 }, { "epoch": 0.05765914890682116, "grad_norm": 0.7387332916259766, "learning_rate": 0.0001998425221347151, "loss": 0.9284, "step": 418 }, { "epoch": 0.05779708945444514, "grad_norm": 0.5807563662528992, "learning_rate": 0.00019984170956157798, "loss": 0.6251, "step": 419 }, { "epoch": 0.05793503000206911, "grad_norm": 0.5696909427642822, "learning_rate": 0.00019984089489908657, "loss": 0.374, "step": 420 }, { "epoch": 0.058072970549693084, "grad_norm": 0.6673562526702881, "learning_rate": 0.00019984007814725794, "loss": 0.6269, "step": 421 }, { "epoch": 0.058210911097317054, "grad_norm": 0.9080434441566467, "learning_rate": 0.0001998392593061091, "loss": 0.7718, "step": 422 }, { "epoch": 0.05834885164494103, "grad_norm": 0.6215444803237915, "learning_rate": 0.0001998384383756573, "loss": 0.6963, "step": 423 }, { "epoch": 0.058486792192565, "grad_norm": 2.9790985584259033, "learning_rate": 0.0001998376153559196, "loss": 1.2189, "step": 424 }, { "epoch": 0.05862473274018898, "grad_norm": 0.7409806847572327, "learning_rate": 0.00019983679024691332, "loss": 0.5597, "step": 425 }, { "epoch": 0.058762673287812955, "grad_norm": 0.6343380212783813, "learning_rate": 0.00019983596304865567, "loss": 0.4839, "step": 426 }, { "epoch": 0.058900613835436925, "grad_norm": 0.3927859663963318, "learning_rate": 0.000199835133761164, "loss": 0.4682, "step": 427 }, { "epoch": 0.0590385543830609, "grad_norm": 1.0523737668991089, "learning_rate": 0.00019983430238445563, "loss": 1.2581, "step": 428 }, { "epoch": 0.05917649493068487, "grad_norm": 0.5161516070365906, "learning_rate": 0.00019983346891854796, "loss": 0.3462, "step": 429 }, { "epoch": 0.05931443547830885, "grad_norm": 0.6147312521934509, "learning_rate": 0.00019983263336345846, "loss": 1.1771, "step": 430 }, { "epoch": 0.05945237602593282, "grad_norm": 0.5741944313049316, "learning_rate": 0.0001998317957192046, "loss": 0.5494, "step": 431 }, { "epoch": 0.0595903165735568, "grad_norm": 1.0944523811340332, "learning_rate": 0.00019983095598580388, "loss": 0.4464, "step": 432 }, { "epoch": 0.059728257121180774, "grad_norm": 0.4419618248939514, "learning_rate": 0.0001998301141632739, "loss": 0.5649, "step": 433 }, { "epoch": 0.059866197668804744, "grad_norm": 0.6602014899253845, "learning_rate": 0.00019982927025163233, "loss": 0.7508, "step": 434 }, { "epoch": 0.06000413821642872, "grad_norm": 0.45457127690315247, "learning_rate": 0.00019982842425089671, "loss": 0.4535, "step": 435 }, { "epoch": 0.06014207876405269, "grad_norm": 0.522692084312439, "learning_rate": 0.00019982757616108482, "loss": 0.3318, "step": 436 }, { "epoch": 0.06028001931167667, "grad_norm": 0.5625709891319275, "learning_rate": 0.00019982672598221443, "loss": 0.6074, "step": 437 }, { "epoch": 0.060417959859300645, "grad_norm": 0.7829237580299377, "learning_rate": 0.0001998258737143033, "loss": 0.6148, "step": 438 }, { "epoch": 0.060555900406924615, "grad_norm": 1.0776660442352295, "learning_rate": 0.00019982501935736924, "loss": 0.8393, "step": 439 }, { "epoch": 0.06069384095454859, "grad_norm": 0.5030855536460876, "learning_rate": 0.00019982416291143014, "loss": 0.4331, "step": 440 }, { "epoch": 0.06083178150217256, "grad_norm": 0.4305086135864258, "learning_rate": 0.00019982330437650397, "loss": 0.3484, "step": 441 }, { "epoch": 0.06096972204979654, "grad_norm": 0.4626142680644989, "learning_rate": 0.00019982244375260866, "loss": 0.4165, "step": 442 }, { "epoch": 0.06110766259742051, "grad_norm": 0.5233110785484314, "learning_rate": 0.0001998215810397622, "loss": 0.4617, "step": 443 }, { "epoch": 0.061245603145044486, "grad_norm": 0.7016578316688538, "learning_rate": 0.00019982071623798268, "loss": 0.6823, "step": 444 }, { "epoch": 0.06138354369266846, "grad_norm": 0.7269102334976196, "learning_rate": 0.00019981984934728817, "loss": 0.9527, "step": 445 }, { "epoch": 0.06152148424029243, "grad_norm": 0.532417893409729, "learning_rate": 0.00019981898036769684, "loss": 0.954, "step": 446 }, { "epoch": 0.06165942478791641, "grad_norm": 0.45132485032081604, "learning_rate": 0.00019981810929922686, "loss": 0.5275, "step": 447 }, { "epoch": 0.06179736533554038, "grad_norm": 0.4695104956626892, "learning_rate": 0.00019981723614189646, "loss": 0.6349, "step": 448 }, { "epoch": 0.06193530588316436, "grad_norm": 0.5177847146987915, "learning_rate": 0.00019981636089572388, "loss": 0.6627, "step": 449 }, { "epoch": 0.06207324643078833, "grad_norm": 0.5801154971122742, "learning_rate": 0.0001998154835607275, "loss": 0.7978, "step": 450 }, { "epoch": 0.062211186978412304, "grad_norm": 0.7457065582275391, "learning_rate": 0.00019981460413692563, "loss": 0.7857, "step": 451 }, { "epoch": 0.06234912752603628, "grad_norm": 0.5405957698822021, "learning_rate": 0.00019981372262433672, "loss": 0.2473, "step": 452 }, { "epoch": 0.06248706807366025, "grad_norm": 0.6344797611236572, "learning_rate": 0.00019981283902297914, "loss": 0.6377, "step": 453 }, { "epoch": 0.06262500862128423, "grad_norm": 0.6278123259544373, "learning_rate": 0.00019981195333287143, "loss": 0.7772, "step": 454 }, { "epoch": 0.0627629491689082, "grad_norm": 0.5916573405265808, "learning_rate": 0.00019981106555403216, "loss": 0.4493, "step": 455 }, { "epoch": 0.06290088971653217, "grad_norm": 0.46801498532295227, "learning_rate": 0.00019981017568647985, "loss": 0.4865, "step": 456 }, { "epoch": 0.06303883026415615, "grad_norm": 0.5247326493263245, "learning_rate": 0.00019980928373023314, "loss": 0.5591, "step": 457 }, { "epoch": 0.06317677081178012, "grad_norm": 0.5179060697555542, "learning_rate": 0.00019980838968531068, "loss": 0.6585, "step": 458 }, { "epoch": 0.0633147113594041, "grad_norm": 0.4230126142501831, "learning_rate": 0.0001998074935517312, "loss": 0.6402, "step": 459 }, { "epoch": 0.06345265190702808, "grad_norm": 0.713741660118103, "learning_rate": 0.0001998065953295135, "loss": 0.4387, "step": 460 }, { "epoch": 0.06359059245465204, "grad_norm": 0.5001334547996521, "learning_rate": 0.00019980569501867627, "loss": 0.6783, "step": 461 }, { "epoch": 0.06372853300227602, "grad_norm": 0.48040321469306946, "learning_rate": 0.00019980479261923843, "loss": 0.4563, "step": 462 }, { "epoch": 0.0638664735499, "grad_norm": 0.5295237302780151, "learning_rate": 0.00019980388813121885, "loss": 0.4345, "step": 463 }, { "epoch": 0.06400441409752397, "grad_norm": 1.023322582244873, "learning_rate": 0.0001998029815546364, "loss": 1.8321, "step": 464 }, { "epoch": 0.06414235464514795, "grad_norm": 0.5460634231567383, "learning_rate": 0.00019980207288951016, "loss": 0.6535, "step": 465 }, { "epoch": 0.06428029519277191, "grad_norm": 0.5623580813407898, "learning_rate": 0.0001998011621358591, "loss": 0.6441, "step": 466 }, { "epoch": 0.06441823574039589, "grad_norm": 0.612251877784729, "learning_rate": 0.0001998002492937022, "loss": 0.9532, "step": 467 }, { "epoch": 0.06455617628801986, "grad_norm": 0.393111914396286, "learning_rate": 0.00019979933436305866, "loss": 0.4654, "step": 468 }, { "epoch": 0.06469411683564384, "grad_norm": 0.5091925263404846, "learning_rate": 0.0001997984173439476, "loss": 0.6132, "step": 469 }, { "epoch": 0.0648320573832678, "grad_norm": 0.46031099557876587, "learning_rate": 0.0001997974982363882, "loss": 0.5191, "step": 470 }, { "epoch": 0.06496999793089178, "grad_norm": 0.552535355091095, "learning_rate": 0.0001997965770403997, "loss": 0.3869, "step": 471 }, { "epoch": 0.06510793847851576, "grad_norm": 0.5436117053031921, "learning_rate": 0.00019979565375600138, "loss": 0.6431, "step": 472 }, { "epoch": 0.06524587902613974, "grad_norm": 0.8215335011482239, "learning_rate": 0.00019979472838321254, "loss": 0.3204, "step": 473 }, { "epoch": 0.06538381957376371, "grad_norm": 0.8435911536216736, "learning_rate": 0.0001997938009220526, "loss": 0.5209, "step": 474 }, { "epoch": 0.06552176012138768, "grad_norm": 0.5616580247879028, "learning_rate": 0.00019979287137254087, "loss": 0.8198, "step": 475 }, { "epoch": 0.06565970066901165, "grad_norm": 0.5500820875167847, "learning_rate": 0.00019979193973469693, "loss": 0.51, "step": 476 }, { "epoch": 0.06579764121663563, "grad_norm": 0.6228001713752747, "learning_rate": 0.00019979100600854015, "loss": 0.522, "step": 477 }, { "epoch": 0.06593558176425961, "grad_norm": 0.5467067360877991, "learning_rate": 0.00019979007019409016, "loss": 0.5695, "step": 478 }, { "epoch": 0.06607352231188358, "grad_norm": 0.6926558613777161, "learning_rate": 0.00019978913229136652, "loss": 0.3539, "step": 479 }, { "epoch": 0.06621146285950755, "grad_norm": 0.6375585794448853, "learning_rate": 0.00019978819230038883, "loss": 0.4727, "step": 480 }, { "epoch": 0.06634940340713152, "grad_norm": 0.6646603941917419, "learning_rate": 0.0001997872502211768, "loss": 0.5597, "step": 481 }, { "epoch": 0.0664873439547555, "grad_norm": 0.5541056990623474, "learning_rate": 0.0001997863060537501, "loss": 0.5865, "step": 482 }, { "epoch": 0.06662528450237948, "grad_norm": 0.4391460120677948, "learning_rate": 0.00019978535979812853, "loss": 0.5291, "step": 483 }, { "epoch": 0.06676322505000346, "grad_norm": 4.984373569488525, "learning_rate": 0.0001997844114543319, "loss": 1.3206, "step": 484 }, { "epoch": 0.06690116559762742, "grad_norm": 0.9175629019737244, "learning_rate": 0.00019978346102238, "loss": 0.6788, "step": 485 }, { "epoch": 0.0670391061452514, "grad_norm": 0.46203866600990295, "learning_rate": 0.00019978250850229276, "loss": 0.4348, "step": 486 }, { "epoch": 0.06717704669287537, "grad_norm": 1.3392490148544312, "learning_rate": 0.0001997815538940901, "loss": 0.7456, "step": 487 }, { "epoch": 0.06731498724049935, "grad_norm": 0.9457517266273499, "learning_rate": 0.00019978059719779202, "loss": 0.7232, "step": 488 }, { "epoch": 0.06745292778812331, "grad_norm": 0.713071346282959, "learning_rate": 0.00019977963841341846, "loss": 0.4191, "step": 489 }, { "epoch": 0.06759086833574729, "grad_norm": 0.5800969004631042, "learning_rate": 0.0001997786775409896, "loss": 0.4815, "step": 490 }, { "epoch": 0.06772880888337127, "grad_norm": 0.7276319861412048, "learning_rate": 0.00019977771458052547, "loss": 0.4348, "step": 491 }, { "epoch": 0.06786674943099524, "grad_norm": 0.6077960133552551, "learning_rate": 0.00019977674953204627, "loss": 0.8063, "step": 492 }, { "epoch": 0.06800468997861922, "grad_norm": 1.4135379791259766, "learning_rate": 0.00019977578239557215, "loss": 0.7839, "step": 493 }, { "epoch": 0.06814263052624318, "grad_norm": 0.6413766145706177, "learning_rate": 0.00019977481317112338, "loss": 0.5164, "step": 494 }, { "epoch": 0.06828057107386716, "grad_norm": 0.4834112823009491, "learning_rate": 0.00019977384185872023, "loss": 0.4097, "step": 495 }, { "epoch": 0.06841851162149114, "grad_norm": 0.6751548051834106, "learning_rate": 0.000199772868458383, "loss": 0.8011, "step": 496 }, { "epoch": 0.06855645216911511, "grad_norm": 0.4742380678653717, "learning_rate": 0.00019977189297013212, "loss": 0.5572, "step": 497 }, { "epoch": 0.06869439271673909, "grad_norm": 0.7303953766822815, "learning_rate": 0.00019977091539398798, "loss": 0.7315, "step": 498 }, { "epoch": 0.06883233326436305, "grad_norm": 0.3794839084148407, "learning_rate": 0.00019976993572997102, "loss": 0.308, "step": 499 }, { "epoch": 0.06897027381198703, "grad_norm": 0.4521902799606323, "learning_rate": 0.00019976895397810174, "loss": 0.4479, "step": 500 }, { "epoch": 0.06910821435961101, "grad_norm": 0.7732378244400024, "learning_rate": 0.00019976797013840068, "loss": 0.7395, "step": 501 }, { "epoch": 0.06924615490723499, "grad_norm": 1.1078095436096191, "learning_rate": 0.00019976698421088847, "loss": 0.5569, "step": 502 }, { "epoch": 0.06938409545485895, "grad_norm": 0.6252771019935608, "learning_rate": 0.00019976599619558568, "loss": 0.6724, "step": 503 }, { "epoch": 0.06952203600248293, "grad_norm": 0.6018311381340027, "learning_rate": 0.00019976500609251307, "loss": 0.5194, "step": 504 }, { "epoch": 0.0696599765501069, "grad_norm": 0.8232583403587341, "learning_rate": 0.00019976401390169127, "loss": 0.8651, "step": 505 }, { "epoch": 0.06979791709773088, "grad_norm": 0.5815649032592773, "learning_rate": 0.00019976301962314113, "loss": 0.6046, "step": 506 }, { "epoch": 0.06993585764535486, "grad_norm": 0.9798987507820129, "learning_rate": 0.00019976202325688337, "loss": 0.9387, "step": 507 }, { "epoch": 0.07007379819297882, "grad_norm": 0.6313984990119934, "learning_rate": 0.0001997610248029389, "loss": 0.5486, "step": 508 }, { "epoch": 0.0702117387406028, "grad_norm": 0.479829341173172, "learning_rate": 0.0001997600242613286, "loss": 0.4614, "step": 509 }, { "epoch": 0.07034967928822677, "grad_norm": 0.4298437833786011, "learning_rate": 0.0001997590216320734, "loss": 0.3145, "step": 510 }, { "epoch": 0.07048761983585075, "grad_norm": 0.6168974041938782, "learning_rate": 0.00019975801691519426, "loss": 0.5537, "step": 511 }, { "epoch": 0.07062556038347473, "grad_norm": 0.6348540782928467, "learning_rate": 0.00019975701011071227, "loss": 0.8361, "step": 512 }, { "epoch": 0.07076350093109869, "grad_norm": 0.5376604795455933, "learning_rate": 0.00019975600121864844, "loss": 0.5901, "step": 513 }, { "epoch": 0.07090144147872267, "grad_norm": 0.7549625039100647, "learning_rate": 0.00019975499023902388, "loss": 0.464, "step": 514 }, { "epoch": 0.07103938202634665, "grad_norm": 0.6116847991943359, "learning_rate": 0.0001997539771718598, "loss": 0.5585, "step": 515 }, { "epoch": 0.07117732257397062, "grad_norm": 0.6240098476409912, "learning_rate": 0.00019975296201717738, "loss": 0.7202, "step": 516 }, { "epoch": 0.0713152631215946, "grad_norm": 0.6997739672660828, "learning_rate": 0.00019975194477499784, "loss": 0.4296, "step": 517 }, { "epoch": 0.07145320366921856, "grad_norm": 0.5400416851043701, "learning_rate": 0.00019975092544534246, "loss": 0.4749, "step": 518 }, { "epoch": 0.07159114421684254, "grad_norm": 0.5933730602264404, "learning_rate": 0.00019974990402823263, "loss": 0.7171, "step": 519 }, { "epoch": 0.07172908476446652, "grad_norm": 0.554554283618927, "learning_rate": 0.00019974888052368966, "loss": 0.4776, "step": 520 }, { "epoch": 0.0718670253120905, "grad_norm": 0.45907062292099, "learning_rate": 0.000199747854931735, "loss": 0.451, "step": 521 }, { "epoch": 0.07200496585971446, "grad_norm": 0.5568957328796387, "learning_rate": 0.00019974682725239012, "loss": 0.6005, "step": 522 }, { "epoch": 0.07214290640733843, "grad_norm": 4.979952335357666, "learning_rate": 0.0001997457974856765, "loss": 0.5554, "step": 523 }, { "epoch": 0.07228084695496241, "grad_norm": 0.6133274435997009, "learning_rate": 0.00019974476563161571, "loss": 0.4279, "step": 524 }, { "epoch": 0.07241878750258639, "grad_norm": 1.0026589632034302, "learning_rate": 0.00019974373169022932, "loss": 0.3212, "step": 525 }, { "epoch": 0.07255672805021036, "grad_norm": 0.40984684228897095, "learning_rate": 0.000199742695661539, "loss": 0.2718, "step": 526 }, { "epoch": 0.07269466859783433, "grad_norm": 0.9331387281417847, "learning_rate": 0.00019974165754556641, "loss": 0.4044, "step": 527 }, { "epoch": 0.0728326091454583, "grad_norm": 0.46805036067962646, "learning_rate": 0.0001997406173423333, "loss": 0.2967, "step": 528 }, { "epoch": 0.07297054969308228, "grad_norm": 0.9887330532073975, "learning_rate": 0.00019973957505186137, "loss": 0.9084, "step": 529 }, { "epoch": 0.07310849024070626, "grad_norm": 0.5215080976486206, "learning_rate": 0.0001997385306741725, "loss": 0.4278, "step": 530 }, { "epoch": 0.07324643078833024, "grad_norm": 0.8102514147758484, "learning_rate": 0.00019973748420928854, "loss": 0.4712, "step": 531 }, { "epoch": 0.0733843713359542, "grad_norm": 0.676751971244812, "learning_rate": 0.00019973643565723137, "loss": 0.6074, "step": 532 }, { "epoch": 0.07352231188357818, "grad_norm": 0.9786521196365356, "learning_rate": 0.0001997353850180229, "loss": 0.7998, "step": 533 }, { "epoch": 0.07366025243120215, "grad_norm": 0.5815269351005554, "learning_rate": 0.00019973433229168518, "loss": 0.3798, "step": 534 }, { "epoch": 0.07379819297882613, "grad_norm": 1.0060030221939087, "learning_rate": 0.0001997332774782402, "loss": 0.6771, "step": 535 }, { "epoch": 0.0739361335264501, "grad_norm": 0.5525199174880981, "learning_rate": 0.00019973222057771006, "loss": 0.5958, "step": 536 }, { "epoch": 0.07407407407407407, "grad_norm": 0.46688395738601685, "learning_rate": 0.00019973116159011683, "loss": 0.7773, "step": 537 }, { "epoch": 0.07421201462169805, "grad_norm": 0.7055450081825256, "learning_rate": 0.00019973010051548275, "loss": 0.7769, "step": 538 }, { "epoch": 0.07434995516932202, "grad_norm": 2.1164026260375977, "learning_rate": 0.00019972903735382996, "loss": 0.6389, "step": 539 }, { "epoch": 0.074487895716946, "grad_norm": 0.4807243049144745, "learning_rate": 0.00019972797210518072, "loss": 0.5949, "step": 540 }, { "epoch": 0.07462583626456996, "grad_norm": 0.5901051163673401, "learning_rate": 0.00019972690476955732, "loss": 0.7589, "step": 541 }, { "epoch": 0.07476377681219394, "grad_norm": 0.7955170273780823, "learning_rate": 0.00019972583534698211, "loss": 0.7414, "step": 542 }, { "epoch": 0.07490171735981792, "grad_norm": 0.4119940996170044, "learning_rate": 0.00019972476383747748, "loss": 0.2528, "step": 543 }, { "epoch": 0.0750396579074419, "grad_norm": 0.5379517078399658, "learning_rate": 0.00019972369024106584, "loss": 0.8701, "step": 544 }, { "epoch": 0.07517759845506587, "grad_norm": 0.560516357421875, "learning_rate": 0.0001997226145577696, "loss": 0.4306, "step": 545 }, { "epoch": 0.07531553900268984, "grad_norm": 0.5738652348518372, "learning_rate": 0.00019972153678761138, "loss": 0.7237, "step": 546 }, { "epoch": 0.07545347955031381, "grad_norm": 0.5861878395080566, "learning_rate": 0.00019972045693061367, "loss": 0.9988, "step": 547 }, { "epoch": 0.07559142009793779, "grad_norm": 0.524368405342102, "learning_rate": 0.00019971937498679908, "loss": 0.9187, "step": 548 }, { "epoch": 0.07572936064556177, "grad_norm": 0.46548980474472046, "learning_rate": 0.00019971829095619023, "loss": 0.4057, "step": 549 }, { "epoch": 0.07586730119318574, "grad_norm": 0.4150684177875519, "learning_rate": 0.00019971720483880984, "loss": 0.4457, "step": 550 }, { "epoch": 0.0760052417408097, "grad_norm": 0.6656027436256409, "learning_rate": 0.0001997161166346806, "loss": 0.6603, "step": 551 }, { "epoch": 0.07614318228843368, "grad_norm": 0.6574470400810242, "learning_rate": 0.00019971502634382534, "loss": 0.4266, "step": 552 }, { "epoch": 0.07628112283605766, "grad_norm": 1.108852505683899, "learning_rate": 0.0001997139339662668, "loss": 0.8485, "step": 553 }, { "epoch": 0.07641906338368164, "grad_norm": 0.7075757384300232, "learning_rate": 0.0001997128395020279, "loss": 0.8539, "step": 554 }, { "epoch": 0.0765570039313056, "grad_norm": 0.4994277060031891, "learning_rate": 0.00019971174295113154, "loss": 0.4206, "step": 555 }, { "epoch": 0.07669494447892958, "grad_norm": 0.7874775528907776, "learning_rate": 0.00019971064431360063, "loss": 0.4535, "step": 556 }, { "epoch": 0.07683288502655355, "grad_norm": 0.48713603615760803, "learning_rate": 0.00019970954358945818, "loss": 0.6906, "step": 557 }, { "epoch": 0.07697082557417753, "grad_norm": 0.5980125069618225, "learning_rate": 0.0001997084407787272, "loss": 0.7195, "step": 558 }, { "epoch": 0.07710876612180151, "grad_norm": 0.6836782097816467, "learning_rate": 0.00019970733588143085, "loss": 0.6231, "step": 559 }, { "epoch": 0.07724670666942547, "grad_norm": 0.577415943145752, "learning_rate": 0.00019970622889759217, "loss": 0.5613, "step": 560 }, { "epoch": 0.07738464721704945, "grad_norm": 0.7536243200302124, "learning_rate": 0.00019970511982723434, "loss": 0.802, "step": 561 }, { "epoch": 0.07752258776467343, "grad_norm": 0.4730790853500366, "learning_rate": 0.00019970400867038061, "loss": 0.3753, "step": 562 }, { "epoch": 0.0776605283122974, "grad_norm": 0.7478041648864746, "learning_rate": 0.00019970289542705418, "loss": 0.8585, "step": 563 }, { "epoch": 0.07779846885992138, "grad_norm": 0.7362046241760254, "learning_rate": 0.00019970178009727838, "loss": 0.5089, "step": 564 }, { "epoch": 0.07793640940754534, "grad_norm": 0.547188401222229, "learning_rate": 0.00019970066268107655, "loss": 0.7786, "step": 565 }, { "epoch": 0.07807434995516932, "grad_norm": 0.5870764851570129, "learning_rate": 0.00019969954317847205, "loss": 0.3834, "step": 566 }, { "epoch": 0.0782122905027933, "grad_norm": 0.8039633631706238, "learning_rate": 0.00019969842158948833, "loss": 0.4987, "step": 567 }, { "epoch": 0.07835023105041727, "grad_norm": 0.448749840259552, "learning_rate": 0.00019969729791414884, "loss": 0.3257, "step": 568 }, { "epoch": 0.07848817159804125, "grad_norm": 0.8656350374221802, "learning_rate": 0.0001996961721524771, "loss": 0.6265, "step": 569 }, { "epoch": 0.07862611214566521, "grad_norm": 0.5044237375259399, "learning_rate": 0.0001996950443044967, "loss": 0.5335, "step": 570 }, { "epoch": 0.07876405269328919, "grad_norm": 0.5112646222114563, "learning_rate": 0.00019969391437023122, "loss": 0.52, "step": 571 }, { "epoch": 0.07890199324091317, "grad_norm": 0.5807754993438721, "learning_rate": 0.0001996927823497043, "loss": 0.2772, "step": 572 }, { "epoch": 0.07903993378853715, "grad_norm": 0.24601247906684875, "learning_rate": 0.00019969164824293963, "loss": 0.0507, "step": 573 }, { "epoch": 0.07917787433616111, "grad_norm": 0.4788058400154114, "learning_rate": 0.00019969051204996093, "loss": 0.6902, "step": 574 }, { "epoch": 0.07931581488378509, "grad_norm": 0.686082661151886, "learning_rate": 0.000199689373770792, "loss": 0.4271, "step": 575 }, { "epoch": 0.07945375543140906, "grad_norm": 0.5949164628982544, "learning_rate": 0.00019968823340545667, "loss": 0.7189, "step": 576 }, { "epoch": 0.07959169597903304, "grad_norm": 0.4743265211582184, "learning_rate": 0.00019968709095397874, "loss": 0.7042, "step": 577 }, { "epoch": 0.07972963652665702, "grad_norm": 0.547727108001709, "learning_rate": 0.00019968594641638223, "loss": 0.6304, "step": 578 }, { "epoch": 0.07986757707428098, "grad_norm": 0.5755452513694763, "learning_rate": 0.000199684799792691, "loss": 0.5862, "step": 579 }, { "epoch": 0.08000551762190496, "grad_norm": 0.93170565366745, "learning_rate": 0.00019968365108292906, "loss": 0.3989, "step": 580 }, { "epoch": 0.08014345816952893, "grad_norm": 0.7697615623474121, "learning_rate": 0.00019968250028712048, "loss": 1.0362, "step": 581 }, { "epoch": 0.08028139871715291, "grad_norm": 0.6355724930763245, "learning_rate": 0.0001996813474052893, "loss": 0.7472, "step": 582 }, { "epoch": 0.08041933926477689, "grad_norm": 0.6851533651351929, "learning_rate": 0.0001996801924374597, "loss": 0.4039, "step": 583 }, { "epoch": 0.08055727981240085, "grad_norm": 0.6417593955993652, "learning_rate": 0.00019967903538365577, "loss": 0.5847, "step": 584 }, { "epoch": 0.08069522036002483, "grad_norm": 0.5994210243225098, "learning_rate": 0.0001996778762439018, "loss": 0.7387, "step": 585 }, { "epoch": 0.0808331609076488, "grad_norm": 0.5934234261512756, "learning_rate": 0.00019967671501822202, "loss": 0.471, "step": 586 }, { "epoch": 0.08097110145527278, "grad_norm": 0.6684454083442688, "learning_rate": 0.00019967555170664073, "loss": 0.4216, "step": 587 }, { "epoch": 0.08110904200289674, "grad_norm": 0.6558318734169006, "learning_rate": 0.0001996743863091823, "loss": 1.1186, "step": 588 }, { "epoch": 0.08124698255052072, "grad_norm": 0.7587666511535645, "learning_rate": 0.00019967321882587104, "loss": 0.8318, "step": 589 }, { "epoch": 0.0813849230981447, "grad_norm": 0.5224142074584961, "learning_rate": 0.00019967204925673145, "loss": 0.8441, "step": 590 }, { "epoch": 0.08152286364576868, "grad_norm": 0.682381272315979, "learning_rate": 0.000199670877601788, "loss": 0.5282, "step": 591 }, { "epoch": 0.08166080419339265, "grad_norm": 0.7016668319702148, "learning_rate": 0.0001996697038610652, "loss": 0.8494, "step": 592 }, { "epoch": 0.08179874474101662, "grad_norm": 0.47111374139785767, "learning_rate": 0.00019966852803458759, "loss": 0.3833, "step": 593 }, { "epoch": 0.08193668528864059, "grad_norm": 0.9120432138442993, "learning_rate": 0.00019966735012237982, "loss": 0.7583, "step": 594 }, { "epoch": 0.08207462583626457, "grad_norm": 0.5179333090782166, "learning_rate": 0.00019966617012446647, "loss": 0.4687, "step": 595 }, { "epoch": 0.08221256638388855, "grad_norm": 0.6437809467315674, "learning_rate": 0.0001996649880408723, "loss": 0.874, "step": 596 }, { "epoch": 0.08235050693151252, "grad_norm": 1.0903011560440063, "learning_rate": 0.00019966380387162203, "loss": 0.9305, "step": 597 }, { "epoch": 0.08248844747913649, "grad_norm": 0.8891686797142029, "learning_rate": 0.00019966261761674044, "loss": 0.6303, "step": 598 }, { "epoch": 0.08262638802676046, "grad_norm": 0.4703718423843384, "learning_rate": 0.00019966142927625236, "loss": 0.7474, "step": 599 }, { "epoch": 0.08276432857438444, "grad_norm": 0.46095022559165955, "learning_rate": 0.00019966023885018263, "loss": 0.4427, "step": 600 }, { "epoch": 0.08276432857438444, "eval_loss": 0.6994988322257996, "eval_runtime": 23.7685, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 600 }, { "epoch": 0.08290226912200842, "grad_norm": 0.6329456567764282, "learning_rate": 0.00019965904633855618, "loss": 0.6795, "step": 601 }, { "epoch": 0.0830402096696324, "grad_norm": 0.5478696227073669, "learning_rate": 0.00019965785174139798, "loss": 0.6493, "step": 602 }, { "epoch": 0.08317815021725636, "grad_norm": 0.5811643600463867, "learning_rate": 0.00019965665505873302, "loss": 0.3661, "step": 603 }, { "epoch": 0.08331609076488034, "grad_norm": 0.7568562030792236, "learning_rate": 0.00019965545629058631, "loss": 1.1299, "step": 604 }, { "epoch": 0.08345403131250431, "grad_norm": 0.7818085551261902, "learning_rate": 0.000199654255436983, "loss": 0.6477, "step": 605 }, { "epoch": 0.08359197186012829, "grad_norm": 0.6207999587059021, "learning_rate": 0.00019965305249794815, "loss": 0.6089, "step": 606 }, { "epoch": 0.08372991240775225, "grad_norm": 0.6402393579483032, "learning_rate": 0.00019965184747350697, "loss": 0.3813, "step": 607 }, { "epoch": 0.08386785295537623, "grad_norm": 0.4845639169216156, "learning_rate": 0.0001996506403636847, "loss": 0.5147, "step": 608 }, { "epoch": 0.0840057935030002, "grad_norm": 0.6269810795783997, "learning_rate": 0.0001996494311685065, "loss": 0.7288, "step": 609 }, { "epoch": 0.08414373405062418, "grad_norm": 0.6568842530250549, "learning_rate": 0.00019964821988799783, "loss": 0.8139, "step": 610 }, { "epoch": 0.08428167459824816, "grad_norm": 0.643670916557312, "learning_rate": 0.00019964700652218395, "loss": 1.0303, "step": 611 }, { "epoch": 0.08441961514587212, "grad_norm": 0.594838559627533, "learning_rate": 0.00019964579107109023, "loss": 0.5529, "step": 612 }, { "epoch": 0.0845575556934961, "grad_norm": 0.5024222135543823, "learning_rate": 0.00019964457353474212, "loss": 0.4661, "step": 613 }, { "epoch": 0.08469549624112008, "grad_norm": 0.5264707803726196, "learning_rate": 0.00019964335391316516, "loss": 0.3273, "step": 614 }, { "epoch": 0.08483343678874405, "grad_norm": 0.6307628750801086, "learning_rate": 0.0001996421322063848, "loss": 0.4593, "step": 615 }, { "epoch": 0.08497137733636803, "grad_norm": 1.9060884714126587, "learning_rate": 0.00019964090841442664, "loss": 0.8541, "step": 616 }, { "epoch": 0.085109317883992, "grad_norm": 0.6357777714729309, "learning_rate": 0.00019963968253731626, "loss": 0.5706, "step": 617 }, { "epoch": 0.08524725843161597, "grad_norm": 0.5257875323295593, "learning_rate": 0.00019963845457507938, "loss": 0.717, "step": 618 }, { "epoch": 0.08538519897923995, "grad_norm": 0.7160320281982422, "learning_rate": 0.0001996372245277416, "loss": 0.5284, "step": 619 }, { "epoch": 0.08552313952686393, "grad_norm": 0.6436088681221008, "learning_rate": 0.00019963599239532876, "loss": 0.6883, "step": 620 }, { "epoch": 0.0856610800744879, "grad_norm": 0.6516035199165344, "learning_rate": 0.00019963475817786656, "loss": 0.5014, "step": 621 }, { "epoch": 0.08579902062211187, "grad_norm": 0.636218786239624, "learning_rate": 0.00019963352187538086, "loss": 0.7013, "step": 622 }, { "epoch": 0.08593696116973584, "grad_norm": 0.5847918391227722, "learning_rate": 0.0001996322834878976, "loss": 0.3843, "step": 623 }, { "epoch": 0.08607490171735982, "grad_norm": 0.48539549112319946, "learning_rate": 0.00019963104301544258, "loss": 0.4288, "step": 624 }, { "epoch": 0.0862128422649838, "grad_norm": 0.5698838233947754, "learning_rate": 0.00019962980045804183, "loss": 0.5235, "step": 625 }, { "epoch": 0.08635078281260776, "grad_norm": 1.0381183624267578, "learning_rate": 0.00019962855581572134, "loss": 0.7357, "step": 626 }, { "epoch": 0.08648872336023174, "grad_norm": 0.7514489889144897, "learning_rate": 0.0001996273090885071, "loss": 0.4777, "step": 627 }, { "epoch": 0.08662666390785571, "grad_norm": 0.7849544882774353, "learning_rate": 0.00019962606027642532, "loss": 0.6248, "step": 628 }, { "epoch": 0.08676460445547969, "grad_norm": 1.4124609231948853, "learning_rate": 0.00019962480937950203, "loss": 0.7421, "step": 629 }, { "epoch": 0.08690254500310367, "grad_norm": 0.6680216789245605, "learning_rate": 0.00019962355639776346, "loss": 0.4358, "step": 630 }, { "epoch": 0.08704048555072763, "grad_norm": 0.5260584950447083, "learning_rate": 0.00019962230133123576, "loss": 0.831, "step": 631 }, { "epoch": 0.08717842609835161, "grad_norm": 0.518754780292511, "learning_rate": 0.00019962104417994527, "loss": 0.464, "step": 632 }, { "epoch": 0.08731636664597558, "grad_norm": 0.4825124144554138, "learning_rate": 0.0001996197849439183, "loss": 0.4555, "step": 633 }, { "epoch": 0.08745430719359956, "grad_norm": 0.5619780421257019, "learning_rate": 0.00019961852362318118, "loss": 0.7752, "step": 634 }, { "epoch": 0.08759224774122354, "grad_norm": 0.7256078124046326, "learning_rate": 0.00019961726021776026, "loss": 0.7239, "step": 635 }, { "epoch": 0.0877301882888475, "grad_norm": 0.6695348620414734, "learning_rate": 0.00019961599472768205, "loss": 0.4609, "step": 636 }, { "epoch": 0.08786812883647148, "grad_norm": 0.8457784056663513, "learning_rate": 0.000199614727152973, "loss": 0.873, "step": 637 }, { "epoch": 0.08800606938409546, "grad_norm": 0.7318458557128906, "learning_rate": 0.00019961345749365962, "loss": 0.9221, "step": 638 }, { "epoch": 0.08814400993171943, "grad_norm": 0.6614384055137634, "learning_rate": 0.0001996121857497685, "loss": 0.5968, "step": 639 }, { "epoch": 0.0882819504793434, "grad_norm": 0.5121036171913147, "learning_rate": 0.00019961091192132626, "loss": 0.5198, "step": 640 }, { "epoch": 0.08841989102696737, "grad_norm": 0.6493807435035706, "learning_rate": 0.00019960963600835956, "loss": 0.6223, "step": 641 }, { "epoch": 0.08855783157459135, "grad_norm": 0.46382856369018555, "learning_rate": 0.0001996083580108951, "loss": 0.4491, "step": 642 }, { "epoch": 0.08869577212221533, "grad_norm": 0.5297259092330933, "learning_rate": 0.00019960707792895957, "loss": 0.4547, "step": 643 }, { "epoch": 0.0888337126698393, "grad_norm": 0.9048398733139038, "learning_rate": 0.00019960579576257982, "loss": 0.8354, "step": 644 }, { "epoch": 0.08897165321746327, "grad_norm": 0.4620378315448761, "learning_rate": 0.00019960451151178267, "loss": 0.4789, "step": 645 }, { "epoch": 0.08910959376508724, "grad_norm": 0.4356841742992401, "learning_rate": 0.000199603225176595, "loss": 0.508, "step": 646 }, { "epoch": 0.08924753431271122, "grad_norm": 0.5129390954971313, "learning_rate": 0.0001996019367570437, "loss": 0.4617, "step": 647 }, { "epoch": 0.0893854748603352, "grad_norm": 0.4920646846294403, "learning_rate": 0.00019960064625315575, "loss": 0.4944, "step": 648 }, { "epoch": 0.08952341540795918, "grad_norm": 0.5648272037506104, "learning_rate": 0.00019959935366495814, "loss": 0.3804, "step": 649 }, { "epoch": 0.08966135595558314, "grad_norm": 0.847991943359375, "learning_rate": 0.00019959805899247797, "loss": 0.7792, "step": 650 }, { "epoch": 0.08979929650320712, "grad_norm": 1.1991604566574097, "learning_rate": 0.00019959676223574226, "loss": 0.6559, "step": 651 }, { "epoch": 0.08993723705083109, "grad_norm": 0.6075736880302429, "learning_rate": 0.00019959546339477823, "loss": 0.6457, "step": 652 }, { "epoch": 0.09007517759845507, "grad_norm": 0.619528591632843, "learning_rate": 0.00019959416246961295, "loss": 0.5797, "step": 653 }, { "epoch": 0.09021311814607905, "grad_norm": 0.607652485370636, "learning_rate": 0.00019959285946027374, "loss": 0.7165, "step": 654 }, { "epoch": 0.09035105869370301, "grad_norm": 1.0300213098526, "learning_rate": 0.00019959155436678785, "loss": 0.5351, "step": 655 }, { "epoch": 0.09048899924132699, "grad_norm": 0.7099342942237854, "learning_rate": 0.00019959024718918257, "loss": 1.1064, "step": 656 }, { "epoch": 0.09062693978895096, "grad_norm": 0.512968122959137, "learning_rate": 0.00019958893792748524, "loss": 0.5042, "step": 657 }, { "epoch": 0.09076488033657494, "grad_norm": 0.6866976022720337, "learning_rate": 0.0001995876265817233, "loss": 0.3854, "step": 658 }, { "epoch": 0.0909028208841989, "grad_norm": 0.8152676224708557, "learning_rate": 0.00019958631315192417, "loss": 0.5973, "step": 659 }, { "epoch": 0.09104076143182288, "grad_norm": 0.6425837278366089, "learning_rate": 0.00019958499763811537, "loss": 0.5432, "step": 660 }, { "epoch": 0.09117870197944686, "grad_norm": 0.7320981025695801, "learning_rate": 0.00019958368004032435, "loss": 0.6684, "step": 661 }, { "epoch": 0.09131664252707083, "grad_norm": 0.5170556902885437, "learning_rate": 0.00019958236035857877, "loss": 0.5425, "step": 662 }, { "epoch": 0.09145458307469481, "grad_norm": 0.5608139634132385, "learning_rate": 0.00019958103859290619, "loss": 0.8382, "step": 663 }, { "epoch": 0.09159252362231877, "grad_norm": 0.809233546257019, "learning_rate": 0.00019957971474333428, "loss": 0.8924, "step": 664 }, { "epoch": 0.09173046416994275, "grad_norm": 0.7292246222496033, "learning_rate": 0.00019957838880989078, "loss": 0.5633, "step": 665 }, { "epoch": 0.09186840471756673, "grad_norm": 0.4839702546596527, "learning_rate": 0.00019957706079260336, "loss": 0.3983, "step": 666 }, { "epoch": 0.0920063452651907, "grad_norm": 0.5444965958595276, "learning_rate": 0.0001995757306914999, "loss": 0.7614, "step": 667 }, { "epoch": 0.09214428581281468, "grad_norm": 0.59327232837677, "learning_rate": 0.00019957439850660816, "loss": 0.3671, "step": 668 }, { "epoch": 0.09228222636043865, "grad_norm": 0.7113454937934875, "learning_rate": 0.00019957306423795608, "loss": 0.5764, "step": 669 }, { "epoch": 0.09242016690806262, "grad_norm": 0.5695451498031616, "learning_rate": 0.00019957172788557154, "loss": 0.7481, "step": 670 }, { "epoch": 0.0925581074556866, "grad_norm": 0.6515781879425049, "learning_rate": 0.00019957038944948255, "loss": 0.6533, "step": 671 }, { "epoch": 0.09269604800331058, "grad_norm": 0.4510314166545868, "learning_rate": 0.00019956904892971705, "loss": 0.5409, "step": 672 }, { "epoch": 0.09283398855093454, "grad_norm": 0.540710985660553, "learning_rate": 0.00019956770632630316, "loss": 0.7009, "step": 673 }, { "epoch": 0.09297192909855852, "grad_norm": 0.5610421895980835, "learning_rate": 0.00019956636163926894, "loss": 0.6575, "step": 674 }, { "epoch": 0.0931098696461825, "grad_norm": 0.6776841282844543, "learning_rate": 0.00019956501486864255, "loss": 0.6708, "step": 675 }, { "epoch": 0.09324781019380647, "grad_norm": 0.5023597478866577, "learning_rate": 0.0001995636660144521, "loss": 0.5779, "step": 676 }, { "epoch": 0.09338575074143045, "grad_norm": 0.5431024432182312, "learning_rate": 0.00019956231507672594, "loss": 0.6545, "step": 677 }, { "epoch": 0.09352369128905441, "grad_norm": 0.7699187397956848, "learning_rate": 0.00019956096205549226, "loss": 0.6455, "step": 678 }, { "epoch": 0.09366163183667839, "grad_norm": 0.8377233147621155, "learning_rate": 0.0001995596069507794, "loss": 0.5763, "step": 679 }, { "epoch": 0.09379957238430237, "grad_norm": 0.4526061713695526, "learning_rate": 0.0001995582497626157, "loss": 0.3688, "step": 680 }, { "epoch": 0.09393751293192634, "grad_norm": 0.7141347527503967, "learning_rate": 0.00019955689049102956, "loss": 0.5253, "step": 681 }, { "epoch": 0.09407545347955032, "grad_norm": 0.6976096034049988, "learning_rate": 0.00019955552913604948, "loss": 0.3001, "step": 682 }, { "epoch": 0.09421339402717428, "grad_norm": 0.9586352109909058, "learning_rate": 0.00019955416569770386, "loss": 0.6465, "step": 683 }, { "epoch": 0.09435133457479826, "grad_norm": 0.6387444138526917, "learning_rate": 0.0001995528001760213, "loss": 0.6699, "step": 684 }, { "epoch": 0.09448927512242224, "grad_norm": 0.5329609513282776, "learning_rate": 0.00019955143257103036, "loss": 0.4904, "step": 685 }, { "epoch": 0.09462721567004621, "grad_norm": 0.6259768009185791, "learning_rate": 0.00019955006288275963, "loss": 0.5922, "step": 686 }, { "epoch": 0.09476515621767019, "grad_norm": 1.0014195442199707, "learning_rate": 0.0001995486911112378, "loss": 0.5604, "step": 687 }, { "epoch": 0.09490309676529415, "grad_norm": 0.675841748714447, "learning_rate": 0.0001995473172564936, "loss": 0.4573, "step": 688 }, { "epoch": 0.09504103731291813, "grad_norm": 0.7129027247428894, "learning_rate": 0.00019954594131855574, "loss": 0.6872, "step": 689 }, { "epoch": 0.09517897786054211, "grad_norm": 0.5275905132293701, "learning_rate": 0.00019954456329745302, "loss": 0.6498, "step": 690 }, { "epoch": 0.09531691840816608, "grad_norm": 0.5839846730232239, "learning_rate": 0.00019954318319321428, "loss": 0.3989, "step": 691 }, { "epoch": 0.09545485895579005, "grad_norm": 0.8275391459465027, "learning_rate": 0.00019954180100586842, "loss": 1.0373, "step": 692 }, { "epoch": 0.09559279950341402, "grad_norm": 0.5609612464904785, "learning_rate": 0.00019954041673544433, "loss": 0.7015, "step": 693 }, { "epoch": 0.095730740051038, "grad_norm": 0.5435266494750977, "learning_rate": 0.000199539030381971, "loss": 0.7207, "step": 694 }, { "epoch": 0.09586868059866198, "grad_norm": 0.4195777475833893, "learning_rate": 0.00019953764194547747, "loss": 0.4839, "step": 695 }, { "epoch": 0.09600662114628596, "grad_norm": 0.6146819591522217, "learning_rate": 0.00019953625142599273, "loss": 0.7853, "step": 696 }, { "epoch": 0.09614456169390992, "grad_norm": 0.6549080610275269, "learning_rate": 0.00019953485882354593, "loss": 0.3945, "step": 697 }, { "epoch": 0.0962825022415339, "grad_norm": 0.6978781223297119, "learning_rate": 0.0001995334641381662, "loss": 0.4962, "step": 698 }, { "epoch": 0.09642044278915787, "grad_norm": 0.7769024968147278, "learning_rate": 0.0001995320673698827, "loss": 0.6696, "step": 699 }, { "epoch": 0.09655838333678185, "grad_norm": 0.5703161954879761, "learning_rate": 0.0001995306685187247, "loss": 0.8526, "step": 700 }, { "epoch": 0.09669632388440583, "grad_norm": 0.6974921822547913, "learning_rate": 0.00019952926758472146, "loss": 0.7957, "step": 701 }, { "epoch": 0.09683426443202979, "grad_norm": 0.6350487470626831, "learning_rate": 0.00019952786456790226, "loss": 0.3942, "step": 702 }, { "epoch": 0.09697220497965377, "grad_norm": 0.5960195660591125, "learning_rate": 0.00019952645946829653, "loss": 0.6674, "step": 703 }, { "epoch": 0.09711014552727774, "grad_norm": 0.8359782695770264, "learning_rate": 0.00019952505228593362, "loss": 0.8576, "step": 704 }, { "epoch": 0.09724808607490172, "grad_norm": 0.5339675545692444, "learning_rate": 0.00019952364302084301, "loss": 0.512, "step": 705 }, { "epoch": 0.0973860266225257, "grad_norm": 0.7097964286804199, "learning_rate": 0.00019952223167305416, "loss": 1.008, "step": 706 }, { "epoch": 0.09752396717014966, "grad_norm": 0.5932406783103943, "learning_rate": 0.00019952081824259663, "loss": 0.2798, "step": 707 }, { "epoch": 0.09766190771777364, "grad_norm": 0.7194920182228088, "learning_rate": 0.00019951940272949997, "loss": 0.8895, "step": 708 }, { "epoch": 0.09779984826539762, "grad_norm": 0.873084306716919, "learning_rate": 0.00019951798513379384, "loss": 0.7076, "step": 709 }, { "epoch": 0.09793778881302159, "grad_norm": 0.6830382943153381, "learning_rate": 0.0001995165654555079, "loss": 0.4155, "step": 710 }, { "epoch": 0.09807572936064556, "grad_norm": 0.6346055269241333, "learning_rate": 0.00019951514369467185, "loss": 0.8283, "step": 711 }, { "epoch": 0.09821366990826953, "grad_norm": 0.9484327435493469, "learning_rate": 0.0001995137198513154, "loss": 0.462, "step": 712 }, { "epoch": 0.09835161045589351, "grad_norm": 0.6650841236114502, "learning_rate": 0.0001995122939254684, "loss": 0.4688, "step": 713 }, { "epoch": 0.09848955100351749, "grad_norm": 0.6398036479949951, "learning_rate": 0.0001995108659171607, "loss": 0.8078, "step": 714 }, { "epoch": 0.09862749155114146, "grad_norm": 0.6665511131286621, "learning_rate": 0.0001995094358264221, "loss": 0.8413, "step": 715 }, { "epoch": 0.09876543209876543, "grad_norm": 1.0712066888809204, "learning_rate": 0.0001995080036532826, "loss": 1.0082, "step": 716 }, { "epoch": 0.0989033726463894, "grad_norm": 0.8648816347122192, "learning_rate": 0.0001995065693977722, "loss": 1.1701, "step": 717 }, { "epoch": 0.09904131319401338, "grad_norm": 0.47819796204566956, "learning_rate": 0.00019950513305992082, "loss": 0.4831, "step": 718 }, { "epoch": 0.09917925374163736, "grad_norm": 0.5053466558456421, "learning_rate": 0.0001995036946397586, "loss": 0.7266, "step": 719 }, { "epoch": 0.09931719428926133, "grad_norm": 0.5433461666107178, "learning_rate": 0.00019950225413731563, "loss": 0.5563, "step": 720 }, { "epoch": 0.0994551348368853, "grad_norm": 0.47349217534065247, "learning_rate": 0.00019950081155262198, "loss": 0.6306, "step": 721 }, { "epoch": 0.09959307538450927, "grad_norm": 0.4916892349720001, "learning_rate": 0.0001994993668857079, "loss": 0.3809, "step": 722 }, { "epoch": 0.09973101593213325, "grad_norm": 0.494800329208374, "learning_rate": 0.00019949792013660366, "loss": 0.3842, "step": 723 }, { "epoch": 0.09986895647975723, "grad_norm": 0.7297869920730591, "learning_rate": 0.00019949647130533947, "loss": 0.8015, "step": 724 }, { "epoch": 0.10000689702738119, "grad_norm": 0.48482975363731384, "learning_rate": 0.00019949502039194562, "loss": 0.4649, "step": 725 }, { "epoch": 0.10014483757500517, "grad_norm": 0.458311527967453, "learning_rate": 0.0001994935673964526, "loss": 0.4227, "step": 726 }, { "epoch": 0.10028277812262915, "grad_norm": 0.7237085700035095, "learning_rate": 0.00019949211231889069, "loss": 0.9086, "step": 727 }, { "epoch": 0.10042071867025312, "grad_norm": 0.42740538716316223, "learning_rate": 0.0001994906551592904, "loss": 0.4184, "step": 728 }, { "epoch": 0.1005586592178771, "grad_norm": 0.6918229460716248, "learning_rate": 0.0001994891959176822, "loss": 0.6799, "step": 729 }, { "epoch": 0.10069659976550106, "grad_norm": 0.4926545321941376, "learning_rate": 0.00019948773459409663, "loss": 0.4966, "step": 730 }, { "epoch": 0.10083454031312504, "grad_norm": 0.434721976518631, "learning_rate": 0.00019948627118856433, "loss": 0.4296, "step": 731 }, { "epoch": 0.10097248086074902, "grad_norm": 0.5751129984855652, "learning_rate": 0.00019948480570111584, "loss": 0.8211, "step": 732 }, { "epoch": 0.101110421408373, "grad_norm": 0.514031708240509, "learning_rate": 0.00019948333813178185, "loss": 0.4787, "step": 733 }, { "epoch": 0.10124836195599697, "grad_norm": 0.7144253253936768, "learning_rate": 0.0001994818684805931, "loss": 0.7398, "step": 734 }, { "epoch": 0.10138630250362093, "grad_norm": 0.8652357459068298, "learning_rate": 0.00019948039674758033, "loss": 0.6524, "step": 735 }, { "epoch": 0.10152424305124491, "grad_norm": 1.019052267074585, "learning_rate": 0.00019947892293277434, "loss": 0.3777, "step": 736 }, { "epoch": 0.10166218359886889, "grad_norm": 0.4818892776966095, "learning_rate": 0.00019947744703620597, "loss": 0.4218, "step": 737 }, { "epoch": 0.10180012414649287, "grad_norm": 0.48969703912734985, "learning_rate": 0.0001994759690579061, "loss": 0.3985, "step": 738 }, { "epoch": 0.10193806469411684, "grad_norm": 0.9150455594062805, "learning_rate": 0.0001994744889979056, "loss": 0.6028, "step": 739 }, { "epoch": 0.1020760052417408, "grad_norm": 0.9380991458892822, "learning_rate": 0.0001994730068562356, "loss": 0.6335, "step": 740 }, { "epoch": 0.10221394578936478, "grad_norm": 0.6422870755195618, "learning_rate": 0.00019947152263292698, "loss": 0.3667, "step": 741 }, { "epoch": 0.10235188633698876, "grad_norm": 0.6221140623092651, "learning_rate": 0.00019947003632801084, "loss": 0.4239, "step": 742 }, { "epoch": 0.10248982688461274, "grad_norm": 0.7663617134094238, "learning_rate": 0.00019946854794151828, "loss": 0.4966, "step": 743 }, { "epoch": 0.1026277674322367, "grad_norm": 0.4023009240627289, "learning_rate": 0.00019946705747348048, "loss": 0.3837, "step": 744 }, { "epoch": 0.10276570797986068, "grad_norm": 0.6791490316390991, "learning_rate": 0.0001994655649239286, "loss": 0.7697, "step": 745 }, { "epoch": 0.10290364852748465, "grad_norm": 0.7459297776222229, "learning_rate": 0.00019946407029289385, "loss": 0.6906, "step": 746 }, { "epoch": 0.10304158907510863, "grad_norm": 0.437961220741272, "learning_rate": 0.00019946257358040753, "loss": 0.4495, "step": 747 }, { "epoch": 0.10317952962273261, "grad_norm": 0.551419198513031, "learning_rate": 0.00019946107478650097, "loss": 0.6946, "step": 748 }, { "epoch": 0.10331747017035657, "grad_norm": 0.7225035429000854, "learning_rate": 0.00019945957391120556, "loss": 0.8352, "step": 749 }, { "epoch": 0.10345541071798055, "grad_norm": 0.6412903070449829, "learning_rate": 0.00019945807095455266, "loss": 1.0359, "step": 750 }, { "epoch": 0.10359335126560452, "grad_norm": 1.015834927558899, "learning_rate": 0.00019945656591657373, "loss": 1.0846, "step": 751 }, { "epoch": 0.1037312918132285, "grad_norm": 0.5166991353034973, "learning_rate": 0.0001994550587973003, "loss": 0.9909, "step": 752 }, { "epoch": 0.10386923236085248, "grad_norm": 0.37953466176986694, "learning_rate": 0.00019945354959676387, "loss": 0.5363, "step": 753 }, { "epoch": 0.10400717290847644, "grad_norm": 0.4988449215888977, "learning_rate": 0.00019945203831499603, "loss": 0.3587, "step": 754 }, { "epoch": 0.10414511345610042, "grad_norm": 0.4564686417579651, "learning_rate": 0.00019945052495202843, "loss": 0.359, "step": 755 }, { "epoch": 0.1042830540037244, "grad_norm": 0.4701521098613739, "learning_rate": 0.00019944900950789274, "loss": 0.5442, "step": 756 }, { "epoch": 0.10442099455134837, "grad_norm": 0.5886171460151672, "learning_rate": 0.00019944749198262063, "loss": 0.8095, "step": 757 }, { "epoch": 0.10455893509897235, "grad_norm": 0.652115523815155, "learning_rate": 0.0001994459723762439, "loss": 0.5434, "step": 758 }, { "epoch": 0.10469687564659631, "grad_norm": 0.48053914308547974, "learning_rate": 0.0001994444506887943, "loss": 0.4875, "step": 759 }, { "epoch": 0.10483481619422029, "grad_norm": 0.5411897301673889, "learning_rate": 0.00019944292692030373, "loss": 0.6873, "step": 760 }, { "epoch": 0.10497275674184427, "grad_norm": 0.4819433093070984, "learning_rate": 0.00019944140107080408, "loss": 0.6266, "step": 761 }, { "epoch": 0.10511069728946824, "grad_norm": 0.48322534561157227, "learning_rate": 0.00019943987314032722, "loss": 0.5062, "step": 762 }, { "epoch": 0.10524863783709221, "grad_norm": 0.4658261835575104, "learning_rate": 0.00019943834312890517, "loss": 0.339, "step": 763 }, { "epoch": 0.10538657838471618, "grad_norm": 0.8437706828117371, "learning_rate": 0.00019943681103656993, "loss": 0.8745, "step": 764 }, { "epoch": 0.10552451893234016, "grad_norm": 0.6944304704666138, "learning_rate": 0.00019943527686335356, "loss": 0.9149, "step": 765 }, { "epoch": 0.10566245947996414, "grad_norm": 0.5543552041053772, "learning_rate": 0.0001994337406092882, "loss": 0.3785, "step": 766 }, { "epoch": 0.10580040002758812, "grad_norm": 0.5634193420410156, "learning_rate": 0.00019943220227440595, "loss": 0.5484, "step": 767 }, { "epoch": 0.10593834057521208, "grad_norm": 0.5241608023643494, "learning_rate": 0.00019943066185873907, "loss": 0.5313, "step": 768 }, { "epoch": 0.10607628112283606, "grad_norm": 0.9513688683509827, "learning_rate": 0.00019942911936231972, "loss": 0.572, "step": 769 }, { "epoch": 0.10621422167046003, "grad_norm": 0.7142233848571777, "learning_rate": 0.00019942757478518019, "loss": 0.8572, "step": 770 }, { "epoch": 0.10635216221808401, "grad_norm": 0.9635162353515625, "learning_rate": 0.00019942602812735288, "loss": 0.654, "step": 771 }, { "epoch": 0.10649010276570799, "grad_norm": 0.7087379097938538, "learning_rate": 0.00019942447938887007, "loss": 1.1718, "step": 772 }, { "epoch": 0.10662804331333195, "grad_norm": 0.8098393082618713, "learning_rate": 0.0001994229285697642, "loss": 0.7998, "step": 773 }, { "epoch": 0.10676598386095593, "grad_norm": 0.5133201479911804, "learning_rate": 0.0001994213756700677, "loss": 0.6008, "step": 774 }, { "epoch": 0.1069039244085799, "grad_norm": 0.5162947177886963, "learning_rate": 0.00019941982068981311, "loss": 0.493, "step": 775 }, { "epoch": 0.10704186495620388, "grad_norm": 0.6204620003700256, "learning_rate": 0.00019941826362903297, "loss": 0.8284, "step": 776 }, { "epoch": 0.10717980550382784, "grad_norm": 0.5684283375740051, "learning_rate": 0.00019941670448775983, "loss": 0.6248, "step": 777 }, { "epoch": 0.10731774605145182, "grad_norm": 0.597649097442627, "learning_rate": 0.00019941514326602634, "loss": 0.9311, "step": 778 }, { "epoch": 0.1074556865990758, "grad_norm": 0.5404520034790039, "learning_rate": 0.00019941357996386514, "loss": 0.5261, "step": 779 }, { "epoch": 0.10759362714669977, "grad_norm": 0.59906405210495, "learning_rate": 0.00019941201458130897, "loss": 0.886, "step": 780 }, { "epoch": 0.10773156769432375, "grad_norm": 1.2685493230819702, "learning_rate": 0.00019941044711839064, "loss": 1.3075, "step": 781 }, { "epoch": 0.10786950824194771, "grad_norm": 1.1808830499649048, "learning_rate": 0.00019940887757514283, "loss": 0.6042, "step": 782 }, { "epoch": 0.10800744878957169, "grad_norm": 0.704940676689148, "learning_rate": 0.0001994073059515985, "loss": 0.5839, "step": 783 }, { "epoch": 0.10814538933719567, "grad_norm": 0.48852789402008057, "learning_rate": 0.00019940573224779047, "loss": 0.4642, "step": 784 }, { "epoch": 0.10828332988481965, "grad_norm": 0.670227587223053, "learning_rate": 0.0001994041564637517, "loss": 0.739, "step": 785 }, { "epoch": 0.10842127043244362, "grad_norm": 0.37136662006378174, "learning_rate": 0.0001994025785995152, "loss": 0.274, "step": 786 }, { "epoch": 0.10855921098006759, "grad_norm": 0.5912745594978333, "learning_rate": 0.00019940099865511387, "loss": 0.4663, "step": 787 }, { "epoch": 0.10869715152769156, "grad_norm": 0.5499754548072815, "learning_rate": 0.0001993994166305809, "loss": 0.7147, "step": 788 }, { "epoch": 0.10883509207531554, "grad_norm": 0.655288577079773, "learning_rate": 0.00019939783252594936, "loss": 0.5379, "step": 789 }, { "epoch": 0.10897303262293952, "grad_norm": 0.4208991229534149, "learning_rate": 0.00019939624634125237, "loss": 0.5169, "step": 790 }, { "epoch": 0.1091109731705635, "grad_norm": 0.6119178533554077, "learning_rate": 0.0001993946580765232, "loss": 0.7345, "step": 791 }, { "epoch": 0.10924891371818746, "grad_norm": 0.5834212303161621, "learning_rate": 0.00019939306773179497, "loss": 0.427, "step": 792 }, { "epoch": 0.10938685426581143, "grad_norm": 0.7387834191322327, "learning_rate": 0.00019939147530710104, "loss": 0.541, "step": 793 }, { "epoch": 0.10952479481343541, "grad_norm": 0.6745947599411011, "learning_rate": 0.0001993898808024747, "loss": 0.8625, "step": 794 }, { "epoch": 0.10966273536105939, "grad_norm": 0.6173790693283081, "learning_rate": 0.00019938828421794936, "loss": 0.3403, "step": 795 }, { "epoch": 0.10980067590868335, "grad_norm": 1.0587728023529053, "learning_rate": 0.00019938668555355838, "loss": 0.7851, "step": 796 }, { "epoch": 0.10993861645630733, "grad_norm": 0.5665553212165833, "learning_rate": 0.00019938508480933523, "loss": 0.3259, "step": 797 }, { "epoch": 0.1100765570039313, "grad_norm": 0.5360016822814941, "learning_rate": 0.00019938348198531345, "loss": 0.8205, "step": 798 }, { "epoch": 0.11021449755155528, "grad_norm": 0.6615592837333679, "learning_rate": 0.00019938187708152651, "loss": 0.3743, "step": 799 }, { "epoch": 0.11035243809917926, "grad_norm": 0.7461992502212524, "learning_rate": 0.0001993802700980081, "loss": 0.4257, "step": 800 }, { "epoch": 0.11035243809917926, "eval_loss": 0.72956782579422, "eval_runtime": 23.7526, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 800 }, { "epoch": 0.11049037864680322, "grad_norm": 0.5562842488288879, "learning_rate": 0.00019937866103479174, "loss": 0.618, "step": 801 }, { "epoch": 0.1106283191944272, "grad_norm": 0.7992231845855713, "learning_rate": 0.00019937704989191112, "loss": 0.9125, "step": 802 }, { "epoch": 0.11076625974205118, "grad_norm": 0.7528833150863647, "learning_rate": 0.00019937543666940003, "loss": 0.8585, "step": 803 }, { "epoch": 0.11090420028967515, "grad_norm": 0.5760588049888611, "learning_rate": 0.00019937382136729216, "loss": 0.6163, "step": 804 }, { "epoch": 0.11104214083729913, "grad_norm": 0.9151485562324524, "learning_rate": 0.00019937220398562132, "loss": 0.4289, "step": 805 }, { "epoch": 0.1111800813849231, "grad_norm": 0.7043417096138, "learning_rate": 0.0001993705845244214, "loss": 0.923, "step": 806 }, { "epoch": 0.11131802193254707, "grad_norm": 0.4929838478565216, "learning_rate": 0.00019936896298372621, "loss": 0.4696, "step": 807 }, { "epoch": 0.11145596248017105, "grad_norm": 0.7298506498336792, "learning_rate": 0.0001993673393635698, "loss": 0.8528, "step": 808 }, { "epoch": 0.11159390302779502, "grad_norm": 0.5436805486679077, "learning_rate": 0.00019936571366398605, "loss": 0.4578, "step": 809 }, { "epoch": 0.11173184357541899, "grad_norm": 0.7715519070625305, "learning_rate": 0.00019936408588500898, "loss": 0.5356, "step": 810 }, { "epoch": 0.11186978412304296, "grad_norm": 0.600720226764679, "learning_rate": 0.00019936245602667274, "loss": 0.4458, "step": 811 }, { "epoch": 0.11200772467066694, "grad_norm": 0.8915072679519653, "learning_rate": 0.00019936082408901135, "loss": 0.6377, "step": 812 }, { "epoch": 0.11214566521829092, "grad_norm": 0.6954822540283203, "learning_rate": 0.000199359190072059, "loss": 0.4618, "step": 813 }, { "epoch": 0.1122836057659149, "grad_norm": 0.7498542070388794, "learning_rate": 0.0001993575539758499, "loss": 0.5229, "step": 814 }, { "epoch": 0.11242154631353886, "grad_norm": 0.5488598942756653, "learning_rate": 0.00019935591580041824, "loss": 0.721, "step": 815 }, { "epoch": 0.11255948686116284, "grad_norm": 0.5599109530448914, "learning_rate": 0.0001993542755457983, "loss": 0.6374, "step": 816 }, { "epoch": 0.11269742740878681, "grad_norm": 0.9349027276039124, "learning_rate": 0.00019935263321202448, "loss": 0.5607, "step": 817 }, { "epoch": 0.11283536795641079, "grad_norm": 0.6541851758956909, "learning_rate": 0.0001993509887991311, "loss": 0.9875, "step": 818 }, { "epoch": 0.11297330850403477, "grad_norm": 0.512224555015564, "learning_rate": 0.00019934934230715254, "loss": 0.6612, "step": 819 }, { "epoch": 0.11311124905165873, "grad_norm": 0.8283054232597351, "learning_rate": 0.00019934769373612332, "loss": 0.9835, "step": 820 }, { "epoch": 0.1132491895992827, "grad_norm": 0.9326686263084412, "learning_rate": 0.0001993460430860779, "loss": 0.8147, "step": 821 }, { "epoch": 0.11338713014690668, "grad_norm": 0.6854963898658752, "learning_rate": 0.00019934439035705084, "loss": 0.2893, "step": 822 }, { "epoch": 0.11352507069453066, "grad_norm": 0.8152254223823547, "learning_rate": 0.0001993427355490767, "loss": 0.6379, "step": 823 }, { "epoch": 0.11366301124215464, "grad_norm": 1.24638032913208, "learning_rate": 0.00019934107866219014, "loss": 0.6338, "step": 824 }, { "epoch": 0.1138009517897786, "grad_norm": 0.4197789132595062, "learning_rate": 0.0001993394196964258, "loss": 0.2971, "step": 825 }, { "epoch": 0.11393889233740258, "grad_norm": 0.8748552203178406, "learning_rate": 0.0001993377586518184, "loss": 0.5474, "step": 826 }, { "epoch": 0.11407683288502656, "grad_norm": 0.5217332243919373, "learning_rate": 0.00019933609552840275, "loss": 0.7427, "step": 827 }, { "epoch": 0.11421477343265053, "grad_norm": 0.55879807472229, "learning_rate": 0.00019933443032621364, "loss": 0.336, "step": 828 }, { "epoch": 0.1143527139802745, "grad_norm": 0.6472502946853638, "learning_rate": 0.00019933276304528585, "loss": 0.9251, "step": 829 }, { "epoch": 0.11449065452789847, "grad_norm": 0.7320190668106079, "learning_rate": 0.00019933109368565433, "loss": 0.635, "step": 830 }, { "epoch": 0.11462859507552245, "grad_norm": 0.7303676605224609, "learning_rate": 0.000199329422247354, "loss": 0.8298, "step": 831 }, { "epoch": 0.11476653562314643, "grad_norm": 0.5881328582763672, "learning_rate": 0.00019932774873041985, "loss": 0.7584, "step": 832 }, { "epoch": 0.1149044761707704, "grad_norm": 0.7314452528953552, "learning_rate": 0.0001993260731348869, "loss": 0.7519, "step": 833 }, { "epoch": 0.11504241671839437, "grad_norm": 0.6046438217163086, "learning_rate": 0.0001993243954607902, "loss": 0.6759, "step": 834 }, { "epoch": 0.11518035726601834, "grad_norm": 0.541251540184021, "learning_rate": 0.00019932271570816487, "loss": 0.5828, "step": 835 }, { "epoch": 0.11531829781364232, "grad_norm": 0.545982837677002, "learning_rate": 0.00019932103387704602, "loss": 0.686, "step": 836 }, { "epoch": 0.1154562383612663, "grad_norm": 0.7743253707885742, "learning_rate": 0.0001993193499674689, "loss": 0.9118, "step": 837 }, { "epoch": 0.11559417890889027, "grad_norm": 0.5599899888038635, "learning_rate": 0.00019931766397946873, "loss": 0.7246, "step": 838 }, { "epoch": 0.11573211945651424, "grad_norm": 0.6533938646316528, "learning_rate": 0.0001993159759130808, "loss": 0.3831, "step": 839 }, { "epoch": 0.11587006000413821, "grad_norm": 0.6783917546272278, "learning_rate": 0.0001993142857683404, "loss": 0.6821, "step": 840 }, { "epoch": 0.11600800055176219, "grad_norm": 0.5326493978500366, "learning_rate": 0.00019931259354528296, "loss": 0.3043, "step": 841 }, { "epoch": 0.11614594109938617, "grad_norm": 0.7620488405227661, "learning_rate": 0.00019931089924394384, "loss": 0.6876, "step": 842 }, { "epoch": 0.11628388164701015, "grad_norm": 0.33520621061325073, "learning_rate": 0.00019930920286435853, "loss": 0.28, "step": 843 }, { "epoch": 0.11642182219463411, "grad_norm": 0.6359095573425293, "learning_rate": 0.00019930750440656247, "loss": 0.7847, "step": 844 }, { "epoch": 0.11655976274225809, "grad_norm": 0.42402467131614685, "learning_rate": 0.00019930580387059128, "loss": 0.4056, "step": 845 }, { "epoch": 0.11669770328988206, "grad_norm": 0.5360769033432007, "learning_rate": 0.00019930410125648052, "loss": 0.7712, "step": 846 }, { "epoch": 0.11683564383750604, "grad_norm": 1.0035808086395264, "learning_rate": 0.0001993023965642658, "loss": 0.6776, "step": 847 }, { "epoch": 0.11697358438513, "grad_norm": 0.63447505235672, "learning_rate": 0.00019930068979398283, "loss": 0.3934, "step": 848 }, { "epoch": 0.11711152493275398, "grad_norm": 0.5662627816200256, "learning_rate": 0.00019929898094566727, "loss": 0.7684, "step": 849 }, { "epoch": 0.11724946548037796, "grad_norm": 0.8410472869873047, "learning_rate": 0.0001992972700193549, "loss": 0.8659, "step": 850 }, { "epoch": 0.11738740602800193, "grad_norm": 0.8104351162910461, "learning_rate": 0.00019929555701508157, "loss": 0.6195, "step": 851 }, { "epoch": 0.11752534657562591, "grad_norm": 0.5049868226051331, "learning_rate": 0.0001992938419328831, "loss": 0.3345, "step": 852 }, { "epoch": 0.11766328712324987, "grad_norm": 0.7743363380432129, "learning_rate": 0.00019929212477279537, "loss": 1.0902, "step": 853 }, { "epoch": 0.11780122767087385, "grad_norm": 0.5063313841819763, "learning_rate": 0.00019929040553485434, "loss": 0.6214, "step": 854 }, { "epoch": 0.11793916821849783, "grad_norm": 0.433849960565567, "learning_rate": 0.00019928868421909593, "loss": 0.4598, "step": 855 }, { "epoch": 0.1180771087661218, "grad_norm": 0.5227097272872925, "learning_rate": 0.00019928696082555623, "loss": 0.7462, "step": 856 }, { "epoch": 0.11821504931374578, "grad_norm": 0.4988686144351959, "learning_rate": 0.00019928523535427126, "loss": 0.4738, "step": 857 }, { "epoch": 0.11835298986136975, "grad_norm": 1.0886865854263306, "learning_rate": 0.00019928350780527713, "loss": 0.7828, "step": 858 }, { "epoch": 0.11849093040899372, "grad_norm": 0.5777763724327087, "learning_rate": 0.00019928177817861006, "loss": 0.5497, "step": 859 }, { "epoch": 0.1186288709566177, "grad_norm": 0.7343029975891113, "learning_rate": 0.00019928004647430615, "loss": 0.8655, "step": 860 }, { "epoch": 0.11876681150424168, "grad_norm": 0.6810327768325806, "learning_rate": 0.0001992783126924017, "loss": 0.8245, "step": 861 }, { "epoch": 0.11890475205186564, "grad_norm": 0.4897983968257904, "learning_rate": 0.00019927657683293295, "loss": 0.5277, "step": 862 }, { "epoch": 0.11904269259948962, "grad_norm": 0.7019502520561218, "learning_rate": 0.00019927483889593624, "loss": 0.7431, "step": 863 }, { "epoch": 0.1191806331471136, "grad_norm": 0.6201822757720947, "learning_rate": 0.00019927309888144795, "loss": 0.4851, "step": 864 }, { "epoch": 0.11931857369473757, "grad_norm": 0.543670117855072, "learning_rate": 0.0001992713567895045, "loss": 0.4167, "step": 865 }, { "epoch": 0.11945651424236155, "grad_norm": 0.6636750102043152, "learning_rate": 0.00019926961262014238, "loss": 0.9595, "step": 866 }, { "epoch": 0.11959445478998551, "grad_norm": 0.8131356835365295, "learning_rate": 0.00019926786637339795, "loss": 0.8473, "step": 867 }, { "epoch": 0.11973239533760949, "grad_norm": 0.6473090052604675, "learning_rate": 0.00019926611804930793, "loss": 0.6085, "step": 868 }, { "epoch": 0.11987033588523346, "grad_norm": 0.5095342993736267, "learning_rate": 0.0001992643676479088, "loss": 0.5239, "step": 869 }, { "epoch": 0.12000827643285744, "grad_norm": 0.492301344871521, "learning_rate": 0.0001992626151692372, "loss": 0.3738, "step": 870 }, { "epoch": 0.12014621698048142, "grad_norm": 0.4516502916812897, "learning_rate": 0.00019926086061332985, "loss": 0.5932, "step": 871 }, { "epoch": 0.12028415752810538, "grad_norm": 0.6153187155723572, "learning_rate": 0.0001992591039802234, "loss": 0.4441, "step": 872 }, { "epoch": 0.12042209807572936, "grad_norm": 0.78718101978302, "learning_rate": 0.00019925734526995468, "loss": 0.8676, "step": 873 }, { "epoch": 0.12056003862335334, "grad_norm": 0.5705772042274475, "learning_rate": 0.00019925558448256046, "loss": 0.5242, "step": 874 }, { "epoch": 0.12069797917097731, "grad_norm": 0.5079905986785889, "learning_rate": 0.00019925382161807754, "loss": 0.6117, "step": 875 }, { "epoch": 0.12083591971860129, "grad_norm": 0.9623110294342041, "learning_rate": 0.0001992520566765429, "loss": 0.2973, "step": 876 }, { "epoch": 0.12097386026622525, "grad_norm": 0.8648927211761475, "learning_rate": 0.00019925028965799344, "loss": 0.986, "step": 877 }, { "epoch": 0.12111180081384923, "grad_norm": 0.4438282549381256, "learning_rate": 0.0001992485205624661, "loss": 0.6697, "step": 878 }, { "epoch": 0.1212497413614732, "grad_norm": 0.8015272617340088, "learning_rate": 0.00019924674938999796, "loss": 0.4598, "step": 879 }, { "epoch": 0.12138768190909718, "grad_norm": 0.5760028958320618, "learning_rate": 0.00019924497614062602, "loss": 0.4522, "step": 880 }, { "epoch": 0.12152562245672115, "grad_norm": 0.5543778538703918, "learning_rate": 0.00019924320081438746, "loss": 0.5178, "step": 881 }, { "epoch": 0.12166356300434512, "grad_norm": 0.6130492687225342, "learning_rate": 0.0001992414234113194, "loss": 0.4862, "step": 882 }, { "epoch": 0.1218015035519691, "grad_norm": 0.5187296271324158, "learning_rate": 0.000199239643931459, "loss": 0.4425, "step": 883 }, { "epoch": 0.12193944409959308, "grad_norm": 0.5691360235214233, "learning_rate": 0.00019923786237484358, "loss": 0.579, "step": 884 }, { "epoch": 0.12207738464721705, "grad_norm": 0.5792080163955688, "learning_rate": 0.00019923607874151032, "loss": 0.3572, "step": 885 }, { "epoch": 0.12221532519484102, "grad_norm": 0.4903048574924469, "learning_rate": 0.00019923429303149662, "loss": 0.452, "step": 886 }, { "epoch": 0.122353265742465, "grad_norm": 0.6367040276527405, "learning_rate": 0.00019923250524483984, "loss": 0.7433, "step": 887 }, { "epoch": 0.12249120629008897, "grad_norm": 0.5987071394920349, "learning_rate": 0.00019923071538157733, "loss": 0.457, "step": 888 }, { "epoch": 0.12262914683771295, "grad_norm": 0.5882094502449036, "learning_rate": 0.00019922892344174665, "loss": 0.6746, "step": 889 }, { "epoch": 0.12276708738533693, "grad_norm": 0.8648632764816284, "learning_rate": 0.0001992271294253852, "loss": 0.598, "step": 890 }, { "epoch": 0.12290502793296089, "grad_norm": 0.47741732001304626, "learning_rate": 0.00019922533333253058, "loss": 0.4759, "step": 891 }, { "epoch": 0.12304296848058487, "grad_norm": 0.5298755764961243, "learning_rate": 0.00019922353516322038, "loss": 0.4357, "step": 892 }, { "epoch": 0.12318090902820884, "grad_norm": 0.8141801953315735, "learning_rate": 0.00019922173491749223, "loss": 0.7693, "step": 893 }, { "epoch": 0.12331884957583282, "grad_norm": 0.5574595928192139, "learning_rate": 0.00019921993259538378, "loss": 0.5226, "step": 894 }, { "epoch": 0.12345679012345678, "grad_norm": 0.5708062052726746, "learning_rate": 0.0001992181281969327, "loss": 0.4868, "step": 895 }, { "epoch": 0.12359473067108076, "grad_norm": 0.8587258458137512, "learning_rate": 0.00019921632172217686, "loss": 0.7201, "step": 896 }, { "epoch": 0.12373267121870474, "grad_norm": 0.6692728996276855, "learning_rate": 0.00019921451317115397, "loss": 0.8659, "step": 897 }, { "epoch": 0.12387061176632871, "grad_norm": 0.7045578956604004, "learning_rate": 0.00019921270254390195, "loss": 0.6286, "step": 898 }, { "epoch": 0.12400855231395269, "grad_norm": 0.581339418888092, "learning_rate": 0.00019921088984045863, "loss": 0.5383, "step": 899 }, { "epoch": 0.12414649286157665, "grad_norm": 0.6217530965805054, "learning_rate": 0.00019920907506086198, "loss": 0.4864, "step": 900 }, { "epoch": 0.12428443340920063, "grad_norm": 0.6814983487129211, "learning_rate": 0.00019920725820514993, "loss": 0.825, "step": 901 }, { "epoch": 0.12442237395682461, "grad_norm": 0.6613954901695251, "learning_rate": 0.00019920543927336057, "loss": 0.8532, "step": 902 }, { "epoch": 0.12456031450444859, "grad_norm": 0.6362931132316589, "learning_rate": 0.00019920361826553193, "loss": 0.8427, "step": 903 }, { "epoch": 0.12469825505207256, "grad_norm": 0.7602605819702148, "learning_rate": 0.0001992017951817021, "loss": 0.5785, "step": 904 }, { "epoch": 0.12483619559969653, "grad_norm": 0.7023992538452148, "learning_rate": 0.0001991999700219093, "loss": 0.7264, "step": 905 }, { "epoch": 0.1249741361473205, "grad_norm": 0.4894685745239258, "learning_rate": 0.00019919814278619163, "loss": 0.5062, "step": 906 }, { "epoch": 0.12511207669494448, "grad_norm": 0.6051757335662842, "learning_rate": 0.00019919631347458733, "loss": 0.6501, "step": 907 }, { "epoch": 0.12525001724256846, "grad_norm": 0.9823623895645142, "learning_rate": 0.00019919448208713478, "loss": 0.4309, "step": 908 }, { "epoch": 0.12538795779019243, "grad_norm": 0.5565893650054932, "learning_rate": 0.00019919264862387222, "loss": 0.6234, "step": 909 }, { "epoch": 0.1255258983378164, "grad_norm": 0.6466925740242004, "learning_rate": 0.00019919081308483806, "loss": 0.5459, "step": 910 }, { "epoch": 0.1256638388854404, "grad_norm": 0.5692632794380188, "learning_rate": 0.0001991889754700707, "loss": 0.6509, "step": 911 }, { "epoch": 0.12580177943306434, "grad_norm": 0.5603244304656982, "learning_rate": 0.0001991871357796086, "loss": 0.5062, "step": 912 }, { "epoch": 0.1259397199806883, "grad_norm": 1.371334195137024, "learning_rate": 0.00019918529401349026, "loss": 0.8327, "step": 913 }, { "epoch": 0.1260776605283123, "grad_norm": 0.5690975785255432, "learning_rate": 0.00019918345017175417, "loss": 0.7654, "step": 914 }, { "epoch": 0.12621560107593627, "grad_norm": 0.7009579539299011, "learning_rate": 0.00019918160425443898, "loss": 0.6546, "step": 915 }, { "epoch": 0.12635354162356024, "grad_norm": 0.6453429460525513, "learning_rate": 0.00019917975626158332, "loss": 0.4413, "step": 916 }, { "epoch": 0.12649148217118422, "grad_norm": 0.6604295969009399, "learning_rate": 0.00019917790619322577, "loss": 0.5524, "step": 917 }, { "epoch": 0.1266294227188082, "grad_norm": 0.7455960512161255, "learning_rate": 0.00019917605404940522, "loss": 0.7734, "step": 918 }, { "epoch": 0.12676736326643218, "grad_norm": 0.7169288992881775, "learning_rate": 0.00019917419983016025, "loss": 0.5976, "step": 919 }, { "epoch": 0.12690530381405615, "grad_norm": 0.6484135389328003, "learning_rate": 0.00019917234353552976, "loss": 0.6162, "step": 920 }, { "epoch": 0.1270432443616801, "grad_norm": 0.9628545045852661, "learning_rate": 0.00019917048516555257, "loss": 0.2702, "step": 921 }, { "epoch": 0.12718118490930408, "grad_norm": 0.7649837732315063, "learning_rate": 0.00019916862472026757, "loss": 0.6825, "step": 922 }, { "epoch": 0.12731912545692806, "grad_norm": 0.486932635307312, "learning_rate": 0.0001991667621997137, "loss": 0.3469, "step": 923 }, { "epoch": 0.12745706600455203, "grad_norm": 1.0600336790084839, "learning_rate": 0.00019916489760392992, "loss": 0.8961, "step": 924 }, { "epoch": 0.127595006552176, "grad_norm": 0.7716004252433777, "learning_rate": 0.00019916303093295526, "loss": 0.9294, "step": 925 }, { "epoch": 0.1277329470998, "grad_norm": 1.2135393619537354, "learning_rate": 0.0001991611621868288, "loss": 0.8881, "step": 926 }, { "epoch": 0.12787088764742396, "grad_norm": 0.7027686834335327, "learning_rate": 0.0001991592913655896, "loss": 0.4266, "step": 927 }, { "epoch": 0.12800882819504794, "grad_norm": 0.8562885522842407, "learning_rate": 0.0001991574184692769, "loss": 0.5674, "step": 928 }, { "epoch": 0.12814676874267192, "grad_norm": 0.6562439799308777, "learning_rate": 0.0001991555434979298, "loss": 0.8152, "step": 929 }, { "epoch": 0.1282847092902959, "grad_norm": 0.9852779507637024, "learning_rate": 0.00019915366645158758, "loss": 1.2481, "step": 930 }, { "epoch": 0.12842264983791984, "grad_norm": 0.5373440384864807, "learning_rate": 0.0001991517873302895, "loss": 0.5952, "step": 931 }, { "epoch": 0.12856059038554382, "grad_norm": 0.5123794674873352, "learning_rate": 0.00019914990613407492, "loss": 0.7772, "step": 932 }, { "epoch": 0.1286985309331678, "grad_norm": 0.6559303998947144, "learning_rate": 0.00019914802286298317, "loss": 0.3859, "step": 933 }, { "epoch": 0.12883647148079178, "grad_norm": 0.6393718123435974, "learning_rate": 0.0001991461375170537, "loss": 0.5565, "step": 934 }, { "epoch": 0.12897441202841575, "grad_norm": 1.1857366561889648, "learning_rate": 0.0001991442500963259, "loss": 0.9502, "step": 935 }, { "epoch": 0.12911235257603973, "grad_norm": 1.1064331531524658, "learning_rate": 0.0001991423606008393, "loss": 0.7295, "step": 936 }, { "epoch": 0.1292502931236637, "grad_norm": 2.1012537479400635, "learning_rate": 0.00019914046903063347, "loss": 0.9118, "step": 937 }, { "epoch": 0.12938823367128768, "grad_norm": 0.6332145929336548, "learning_rate": 0.000199138575385748, "loss": 0.6456, "step": 938 }, { "epoch": 0.12952617421891166, "grad_norm": 0.5807342529296875, "learning_rate": 0.00019913667966622243, "loss": 0.3909, "step": 939 }, { "epoch": 0.1296641147665356, "grad_norm": 0.6866700053215027, "learning_rate": 0.0001991347818720965, "loss": 0.7402, "step": 940 }, { "epoch": 0.1298020553141596, "grad_norm": 0.5732684135437012, "learning_rate": 0.00019913288200340993, "loss": 0.6631, "step": 941 }, { "epoch": 0.12993999586178356, "grad_norm": 0.6961843371391296, "learning_rate": 0.00019913098006020245, "loss": 0.5532, "step": 942 }, { "epoch": 0.13007793640940754, "grad_norm": 0.5015955567359924, "learning_rate": 0.0001991290760425139, "loss": 0.6612, "step": 943 }, { "epoch": 0.13021587695703152, "grad_norm": 0.5921115279197693, "learning_rate": 0.00019912716995038407, "loss": 0.6177, "step": 944 }, { "epoch": 0.1303538175046555, "grad_norm": 0.6619390249252319, "learning_rate": 0.0001991252617838529, "loss": 0.2306, "step": 945 }, { "epoch": 0.13049175805227947, "grad_norm": 2.814476490020752, "learning_rate": 0.0001991233515429603, "loss": 0.6729, "step": 946 }, { "epoch": 0.13062969859990345, "grad_norm": 0.7949285507202148, "learning_rate": 0.0001991214392277462, "loss": 0.6715, "step": 947 }, { "epoch": 0.13076763914752743, "grad_norm": 0.7350752949714661, "learning_rate": 0.0001991195248382507, "loss": 0.6896, "step": 948 }, { "epoch": 0.1309055796951514, "grad_norm": 0.5575198531150818, "learning_rate": 0.00019911760837451378, "loss": 0.616, "step": 949 }, { "epoch": 0.13104352024277535, "grad_norm": 0.9318765997886658, "learning_rate": 0.00019911568983657562, "loss": 0.8401, "step": 950 }, { "epoch": 0.13118146079039933, "grad_norm": 0.7584351301193237, "learning_rate": 0.00019911376922447635, "loss": 0.7064, "step": 951 }, { "epoch": 0.1313194013380233, "grad_norm": 1.1349750757217407, "learning_rate": 0.00019911184653825613, "loss": 0.9523, "step": 952 }, { "epoch": 0.13145734188564728, "grad_norm": 0.5599570274353027, "learning_rate": 0.00019910992177795523, "loss": 0.4488, "step": 953 }, { "epoch": 0.13159528243327126, "grad_norm": 0.6333723664283752, "learning_rate": 0.00019910799494361387, "loss": 0.5216, "step": 954 }, { "epoch": 0.13173322298089524, "grad_norm": 0.5742828845977783, "learning_rate": 0.00019910606603527247, "loss": 0.3357, "step": 955 }, { "epoch": 0.13187116352851921, "grad_norm": 0.6496573686599731, "learning_rate": 0.0001991041350529713, "loss": 0.8959, "step": 956 }, { "epoch": 0.1320091040761432, "grad_norm": 0.6439496874809265, "learning_rate": 0.00019910220199675083, "loss": 0.5147, "step": 957 }, { "epoch": 0.13214704462376717, "grad_norm": 0.803395688533783, "learning_rate": 0.00019910026686665149, "loss": 0.8305, "step": 958 }, { "epoch": 0.13228498517139112, "grad_norm": 0.8443595767021179, "learning_rate": 0.00019909832966271378, "loss": 0.5471, "step": 959 }, { "epoch": 0.1324229257190151, "grad_norm": 0.5295692682266235, "learning_rate": 0.00019909639038497823, "loss": 0.6649, "step": 960 }, { "epoch": 0.13256086626663907, "grad_norm": 0.7029833793640137, "learning_rate": 0.00019909444903348545, "loss": 0.6606, "step": 961 }, { "epoch": 0.13269880681426305, "grad_norm": 0.4842614531517029, "learning_rate": 0.000199092505608276, "loss": 0.4145, "step": 962 }, { "epoch": 0.13283674736188703, "grad_norm": 0.8168975114822388, "learning_rate": 0.00019909056010939064, "loss": 0.7523, "step": 963 }, { "epoch": 0.132974687909511, "grad_norm": 0.6924858689308167, "learning_rate": 0.00019908861253687, "loss": 0.3966, "step": 964 }, { "epoch": 0.13311262845713498, "grad_norm": 0.607775092124939, "learning_rate": 0.0001990866628907549, "loss": 0.5342, "step": 965 }, { "epoch": 0.13325056900475896, "grad_norm": 0.5263320207595825, "learning_rate": 0.0001990847111710861, "loss": 0.4501, "step": 966 }, { "epoch": 0.13338850955238293, "grad_norm": 0.6196678280830383, "learning_rate": 0.00019908275737790448, "loss": 0.4111, "step": 967 }, { "epoch": 0.1335264501000069, "grad_norm": 0.756392240524292, "learning_rate": 0.0001990808015112509, "loss": 1.2263, "step": 968 }, { "epoch": 0.13366439064763086, "grad_norm": 0.8430196046829224, "learning_rate": 0.00019907884357116626, "loss": 0.8274, "step": 969 }, { "epoch": 0.13380233119525484, "grad_norm": 0.751544713973999, "learning_rate": 0.00019907688355769157, "loss": 0.669, "step": 970 }, { "epoch": 0.1339402717428788, "grad_norm": 0.5267643332481384, "learning_rate": 0.0001990749214708679, "loss": 0.4388, "step": 971 }, { "epoch": 0.1340782122905028, "grad_norm": 0.5386031866073608, "learning_rate": 0.0001990729573107362, "loss": 0.3993, "step": 972 }, { "epoch": 0.13421615283812677, "grad_norm": 0.6708813905715942, "learning_rate": 0.00019907099107733763, "loss": 0.5899, "step": 973 }, { "epoch": 0.13435409338575074, "grad_norm": 1.0802927017211914, "learning_rate": 0.00019906902277071336, "loss": 0.7474, "step": 974 }, { "epoch": 0.13449203393337472, "grad_norm": 0.4827090799808502, "learning_rate": 0.00019906705239090453, "loss": 0.4944, "step": 975 }, { "epoch": 0.1346299744809987, "grad_norm": 0.5705152153968811, "learning_rate": 0.0001990650799379524, "loss": 0.803, "step": 976 }, { "epoch": 0.13476791502862268, "grad_norm": 0.5141153335571289, "learning_rate": 0.00019906310541189823, "loss": 0.6533, "step": 977 }, { "epoch": 0.13490585557624662, "grad_norm": 0.5709956288337708, "learning_rate": 0.00019906112881278338, "loss": 0.4779, "step": 978 }, { "epoch": 0.1350437961238706, "grad_norm": 0.71205735206604, "learning_rate": 0.00019905915014064917, "loss": 0.5385, "step": 979 }, { "epoch": 0.13518173667149458, "grad_norm": 0.6685202717781067, "learning_rate": 0.00019905716939553703, "loss": 0.7742, "step": 980 }, { "epoch": 0.13531967721911856, "grad_norm": 0.7648952007293701, "learning_rate": 0.00019905518657748841, "loss": 0.4701, "step": 981 }, { "epoch": 0.13545761776674253, "grad_norm": 0.6259269714355469, "learning_rate": 0.00019905320168654478, "loss": 0.4423, "step": 982 }, { "epoch": 0.1355955583143665, "grad_norm": 1.0309432744979858, "learning_rate": 0.00019905121472274772, "loss": 0.9491, "step": 983 }, { "epoch": 0.1357334988619905, "grad_norm": 0.9954834580421448, "learning_rate": 0.00019904922568613877, "loss": 0.7951, "step": 984 }, { "epoch": 0.13587143940961446, "grad_norm": 0.5893868207931519, "learning_rate": 0.00019904723457675957, "loss": 0.4945, "step": 985 }, { "epoch": 0.13600937995723844, "grad_norm": 0.7338295578956604, "learning_rate": 0.0001990452413946518, "loss": 0.8869, "step": 986 }, { "epoch": 0.13614732050486242, "grad_norm": 0.5506496429443359, "learning_rate": 0.00019904324613985712, "loss": 0.5334, "step": 987 }, { "epoch": 0.13628526105248637, "grad_norm": 0.6891979575157166, "learning_rate": 0.00019904124881241737, "loss": 0.5732, "step": 988 }, { "epoch": 0.13642320160011034, "grad_norm": 1.0241178274154663, "learning_rate": 0.00019903924941237427, "loss": 0.5747, "step": 989 }, { "epoch": 0.13656114214773432, "grad_norm": 0.9838054776191711, "learning_rate": 0.0001990372479397697, "loss": 0.6857, "step": 990 }, { "epoch": 0.1366990826953583, "grad_norm": 0.8340192437171936, "learning_rate": 0.00019903524439464555, "loss": 0.7007, "step": 991 }, { "epoch": 0.13683702324298228, "grad_norm": 0.6701921820640564, "learning_rate": 0.0001990332387770437, "loss": 0.5101, "step": 992 }, { "epoch": 0.13697496379060625, "grad_norm": 0.8436320424079895, "learning_rate": 0.00019903123108700615, "loss": 0.9108, "step": 993 }, { "epoch": 0.13711290433823023, "grad_norm": 0.8240708708763123, "learning_rate": 0.00019902922132457494, "loss": 0.5379, "step": 994 }, { "epoch": 0.1372508448858542, "grad_norm": 0.8546607494354248, "learning_rate": 0.0001990272094897921, "loss": 0.7445, "step": 995 }, { "epoch": 0.13738878543347818, "grad_norm": 0.6038249731063843, "learning_rate": 0.00019902519558269975, "loss": 0.3701, "step": 996 }, { "epoch": 0.13752672598110213, "grad_norm": 0.9398077130317688, "learning_rate": 0.00019902317960333998, "loss": 0.6717, "step": 997 }, { "epoch": 0.1376646665287261, "grad_norm": 0.5250031352043152, "learning_rate": 0.00019902116155175504, "loss": 0.5007, "step": 998 }, { "epoch": 0.1378026070763501, "grad_norm": 0.4816218912601471, "learning_rate": 0.00019901914142798715, "loss": 0.3877, "step": 999 }, { "epoch": 0.13794054762397406, "grad_norm": 0.526080310344696, "learning_rate": 0.00019901711923207857, "loss": 0.6477, "step": 1000 }, { "epoch": 0.13794054762397406, "eval_loss": 0.6941623091697693, "eval_runtime": 23.4728, "eval_samples_per_second": 2.514, "eval_steps_per_second": 2.514, "step": 1000 }, { "epoch": 0.13807848817159804, "grad_norm": 1.398197889328003, "learning_rate": 0.00019901509496407158, "loss": 1.289, "step": 1001 }, { "epoch": 0.13821642871922202, "grad_norm": 0.5432860255241394, "learning_rate": 0.00019901306862400863, "loss": 0.7642, "step": 1002 }, { "epoch": 0.138354369266846, "grad_norm": 0.561249852180481, "learning_rate": 0.00019901104021193206, "loss": 0.3437, "step": 1003 }, { "epoch": 0.13849230981446997, "grad_norm": 0.9974400997161865, "learning_rate": 0.00019900900972788432, "loss": 0.7149, "step": 1004 }, { "epoch": 0.13863025036209395, "grad_norm": 0.45983076095581055, "learning_rate": 0.00019900697717190795, "loss": 0.541, "step": 1005 }, { "epoch": 0.1387681909097179, "grad_norm": 0.9060397148132324, "learning_rate": 0.00019900494254404544, "loss": 0.626, "step": 1006 }, { "epoch": 0.13890613145734187, "grad_norm": 0.8451761603355408, "learning_rate": 0.00019900290584433936, "loss": 0.5893, "step": 1007 }, { "epoch": 0.13904407200496585, "grad_norm": 1.0828523635864258, "learning_rate": 0.00019900086707283236, "loss": 0.5828, "step": 1008 }, { "epoch": 0.13918201255258983, "grad_norm": 0.5632979869842529, "learning_rate": 0.00019899882622956708, "loss": 0.6301, "step": 1009 }, { "epoch": 0.1393199531002138, "grad_norm": 0.8237811326980591, "learning_rate": 0.00019899678331458627, "loss": 0.8342, "step": 1010 }, { "epoch": 0.13945789364783778, "grad_norm": 0.5707277059555054, "learning_rate": 0.00019899473832793265, "loss": 0.4903, "step": 1011 }, { "epoch": 0.13959583419546176, "grad_norm": 0.6229791641235352, "learning_rate": 0.00019899269126964902, "loss": 0.6022, "step": 1012 }, { "epoch": 0.13973377474308574, "grad_norm": 0.680052638053894, "learning_rate": 0.0001989906421397782, "loss": 0.4722, "step": 1013 }, { "epoch": 0.1398717152907097, "grad_norm": 0.7106280326843262, "learning_rate": 0.0001989885909383631, "loss": 0.7087, "step": 1014 }, { "epoch": 0.1400096558383337, "grad_norm": 0.544165313243866, "learning_rate": 0.0001989865376654466, "loss": 0.8087, "step": 1015 }, { "epoch": 0.14014759638595764, "grad_norm": 1.1418883800506592, "learning_rate": 0.00019898448232107173, "loss": 0.6598, "step": 1016 }, { "epoch": 0.14028553693358162, "grad_norm": 0.5702589750289917, "learning_rate": 0.00019898242490528148, "loss": 0.3596, "step": 1017 }, { "epoch": 0.1404234774812056, "grad_norm": 0.5326302647590637, "learning_rate": 0.00019898036541811888, "loss": 0.7046, "step": 1018 }, { "epoch": 0.14056141802882957, "grad_norm": 0.8221389055252075, "learning_rate": 0.00019897830385962707, "loss": 0.4593, "step": 1019 }, { "epoch": 0.14069935857645355, "grad_norm": 0.5502816438674927, "learning_rate": 0.00019897624022984913, "loss": 0.8344, "step": 1020 }, { "epoch": 0.14083729912407753, "grad_norm": 0.5726627707481384, "learning_rate": 0.00019897417452882833, "loss": 0.5595, "step": 1021 }, { "epoch": 0.1409752396717015, "grad_norm": 0.6620048880577087, "learning_rate": 0.0001989721067566078, "loss": 0.8188, "step": 1022 }, { "epoch": 0.14111318021932548, "grad_norm": 0.49105188250541687, "learning_rate": 0.00019897003691323088, "loss": 0.4105, "step": 1023 }, { "epoch": 0.14125112076694946, "grad_norm": 0.7146965861320496, "learning_rate": 0.0001989679649987409, "loss": 0.7198, "step": 1024 }, { "epoch": 0.1413890613145734, "grad_norm": 0.5202399492263794, "learning_rate": 0.00019896589101318113, "loss": 0.1911, "step": 1025 }, { "epoch": 0.14152700186219738, "grad_norm": 0.6405767202377319, "learning_rate": 0.00019896381495659507, "loss": 0.4308, "step": 1026 }, { "epoch": 0.14166494240982136, "grad_norm": 0.7755346298217773, "learning_rate": 0.0001989617368290261, "loss": 0.461, "step": 1027 }, { "epoch": 0.14180288295744534, "grad_norm": 0.6936823129653931, "learning_rate": 0.00019895965663051774, "loss": 0.7364, "step": 1028 }, { "epoch": 0.1419408235050693, "grad_norm": 0.4888296127319336, "learning_rate": 0.0001989575743611135, "loss": 0.4525, "step": 1029 }, { "epoch": 0.1420787640526933, "grad_norm": 0.48920688033103943, "learning_rate": 0.000198955490020857, "loss": 0.3753, "step": 1030 }, { "epoch": 0.14221670460031727, "grad_norm": 0.6072588562965393, "learning_rate": 0.00019895340360979178, "loss": 0.4575, "step": 1031 }, { "epoch": 0.14235464514794124, "grad_norm": 0.9503058195114136, "learning_rate": 0.00019895131512796157, "loss": 0.9761, "step": 1032 }, { "epoch": 0.14249258569556522, "grad_norm": 0.9930344820022583, "learning_rate": 0.00019894922457541005, "loss": 0.637, "step": 1033 }, { "epoch": 0.1426305262431892, "grad_norm": 0.7928357124328613, "learning_rate": 0.00019894713195218094, "loss": 0.7327, "step": 1034 }, { "epoch": 0.14276846679081315, "grad_norm": 0.6444113254547119, "learning_rate": 0.0001989450372583181, "loss": 0.5983, "step": 1035 }, { "epoch": 0.14290640733843712, "grad_norm": 0.719067394733429, "learning_rate": 0.00019894294049386532, "loss": 0.7405, "step": 1036 }, { "epoch": 0.1430443478860611, "grad_norm": 0.8210891485214233, "learning_rate": 0.00019894084165886648, "loss": 0.8187, "step": 1037 }, { "epoch": 0.14318228843368508, "grad_norm": 0.5367610454559326, "learning_rate": 0.0001989387407533655, "loss": 0.5754, "step": 1038 }, { "epoch": 0.14332022898130906, "grad_norm": 0.7345871329307556, "learning_rate": 0.00019893663777740636, "loss": 0.569, "step": 1039 }, { "epoch": 0.14345816952893303, "grad_norm": 0.7352200150489807, "learning_rate": 0.00019893453273103306, "loss": 0.6986, "step": 1040 }, { "epoch": 0.143596110076557, "grad_norm": 0.5269816517829895, "learning_rate": 0.00019893242561428966, "loss": 0.553, "step": 1041 }, { "epoch": 0.143734050624181, "grad_norm": 0.5359818935394287, "learning_rate": 0.00019893031642722022, "loss": 0.2862, "step": 1042 }, { "epoch": 0.14387199117180496, "grad_norm": 0.5108398795127869, "learning_rate": 0.00019892820516986893, "loss": 0.3082, "step": 1043 }, { "epoch": 0.1440099317194289, "grad_norm": 0.6995589733123779, "learning_rate": 0.00019892609184227994, "loss": 0.5536, "step": 1044 }, { "epoch": 0.1441478722670529, "grad_norm": 0.7048972845077515, "learning_rate": 0.00019892397644449746, "loss": 0.6272, "step": 1045 }, { "epoch": 0.14428581281467687, "grad_norm": 0.8321963548660278, "learning_rate": 0.00019892185897656578, "loss": 0.4383, "step": 1046 }, { "epoch": 0.14442375336230084, "grad_norm": 0.872501790523529, "learning_rate": 0.00019891973943852924, "loss": 0.91, "step": 1047 }, { "epoch": 0.14456169390992482, "grad_norm": 0.5986654162406921, "learning_rate": 0.00019891761783043217, "loss": 0.5851, "step": 1048 }, { "epoch": 0.1446996344575488, "grad_norm": 0.8885194659233093, "learning_rate": 0.00019891549415231895, "loss": 0.6983, "step": 1049 }, { "epoch": 0.14483757500517278, "grad_norm": 0.718977153301239, "learning_rate": 0.000198913368404234, "loss": 0.9496, "step": 1050 }, { "epoch": 0.14497551555279675, "grad_norm": 0.4697357416152954, "learning_rate": 0.0001989112405862219, "loss": 0.4848, "step": 1051 }, { "epoch": 0.14511345610042073, "grad_norm": 0.6707521080970764, "learning_rate": 0.00019890911069832707, "loss": 0.8293, "step": 1052 }, { "epoch": 0.1452513966480447, "grad_norm": 0.7777612209320068, "learning_rate": 0.00019890697874059415, "loss": 0.5272, "step": 1053 }, { "epoch": 0.14538933719566866, "grad_norm": 0.7452659010887146, "learning_rate": 0.00019890484471306774, "loss": 0.6659, "step": 1054 }, { "epoch": 0.14552727774329263, "grad_norm": 0.6287086606025696, "learning_rate": 0.00019890270861579247, "loss": 0.6635, "step": 1055 }, { "epoch": 0.1456652182909166, "grad_norm": 0.5724009275436401, "learning_rate": 0.00019890057044881306, "loss": 0.567, "step": 1056 }, { "epoch": 0.1458031588385406, "grad_norm": 0.5259819626808167, "learning_rate": 0.00019889843021217432, "loss": 0.5194, "step": 1057 }, { "epoch": 0.14594109938616456, "grad_norm": 0.6209348440170288, "learning_rate": 0.0001988962879059209, "loss": 0.4594, "step": 1058 }, { "epoch": 0.14607903993378854, "grad_norm": 0.8408915400505066, "learning_rate": 0.00019889414353009777, "loss": 1.084, "step": 1059 }, { "epoch": 0.14621698048141252, "grad_norm": 0.6033788919448853, "learning_rate": 0.00019889199708474968, "loss": 0.652, "step": 1060 }, { "epoch": 0.1463549210290365, "grad_norm": 0.7292734384536743, "learning_rate": 0.00019888984856992166, "loss": 1.0806, "step": 1061 }, { "epoch": 0.14649286157666047, "grad_norm": 0.6448029279708862, "learning_rate": 0.0001988876979856586, "loss": 0.4945, "step": 1062 }, { "epoch": 0.14663080212428442, "grad_norm": 0.643459677696228, "learning_rate": 0.00019888554533200554, "loss": 0.3989, "step": 1063 }, { "epoch": 0.1467687426719084, "grad_norm": 0.9197812676429749, "learning_rate": 0.0001988833906090075, "loss": 0.7581, "step": 1064 }, { "epoch": 0.14690668321953237, "grad_norm": 0.5030396580696106, "learning_rate": 0.0001988812338167096, "loss": 0.5135, "step": 1065 }, { "epoch": 0.14704462376715635, "grad_norm": 0.5288397669792175, "learning_rate": 0.00019887907495515694, "loss": 0.4682, "step": 1066 }, { "epoch": 0.14718256431478033, "grad_norm": 0.8954821228981018, "learning_rate": 0.00019887691402439473, "loss": 0.8933, "step": 1067 }, { "epoch": 0.1473205048624043, "grad_norm": 0.737835705280304, "learning_rate": 0.00019887475102446818, "loss": 0.7471, "step": 1068 }, { "epoch": 0.14745844541002828, "grad_norm": 0.6842964291572571, "learning_rate": 0.00019887258595542256, "loss": 0.5195, "step": 1069 }, { "epoch": 0.14759638595765226, "grad_norm": 0.548966646194458, "learning_rate": 0.00019887041881730316, "loss": 0.7025, "step": 1070 }, { "epoch": 0.14773432650527624, "grad_norm": 1.2203510999679565, "learning_rate": 0.00019886824961015532, "loss": 0.6191, "step": 1071 }, { "epoch": 0.1478722670529002, "grad_norm": 0.5693142414093018, "learning_rate": 0.0001988660783340245, "loss": 0.514, "step": 1072 }, { "epoch": 0.14801020760052416, "grad_norm": 0.7296965718269348, "learning_rate": 0.00019886390498895606, "loss": 0.6241, "step": 1073 }, { "epoch": 0.14814814814814814, "grad_norm": 0.5822553634643555, "learning_rate": 0.00019886172957499554, "loss": 0.4982, "step": 1074 }, { "epoch": 0.14828608869577212, "grad_norm": 0.5920300483703613, "learning_rate": 0.00019885955209218841, "loss": 0.5262, "step": 1075 }, { "epoch": 0.1484240292433961, "grad_norm": 0.5377691984176636, "learning_rate": 0.0001988573725405803, "loss": 0.4467, "step": 1076 }, { "epoch": 0.14856196979102007, "grad_norm": 1.061208963394165, "learning_rate": 0.00019885519092021676, "loss": 1.1714, "step": 1077 }, { "epoch": 0.14869991033864405, "grad_norm": 0.9259366989135742, "learning_rate": 0.00019885300723114347, "loss": 0.994, "step": 1078 }, { "epoch": 0.14883785088626802, "grad_norm": 0.7091456651687622, "learning_rate": 0.00019885082147340615, "loss": 0.6296, "step": 1079 }, { "epoch": 0.148975791433892, "grad_norm": 1.0205203294754028, "learning_rate": 0.0001988486336470505, "loss": 0.7989, "step": 1080 }, { "epoch": 0.14911373198151598, "grad_norm": 0.9037676453590393, "learning_rate": 0.00019884644375212234, "loss": 0.7736, "step": 1081 }, { "epoch": 0.14925167252913993, "grad_norm": 0.5947265028953552, "learning_rate": 0.00019884425178866747, "loss": 0.3562, "step": 1082 }, { "epoch": 0.1493896130767639, "grad_norm": 0.6127558350563049, "learning_rate": 0.00019884205775673176, "loss": 0.8421, "step": 1083 }, { "epoch": 0.14952755362438788, "grad_norm": 0.48429784178733826, "learning_rate": 0.00019883986165636114, "loss": 0.477, "step": 1084 }, { "epoch": 0.14966549417201186, "grad_norm": 0.8586926460266113, "learning_rate": 0.00019883766348760158, "loss": 0.63, "step": 1085 }, { "epoch": 0.14980343471963584, "grad_norm": 0.5277228951454163, "learning_rate": 0.00019883546325049904, "loss": 0.4597, "step": 1086 }, { "epoch": 0.1499413752672598, "grad_norm": 0.5512163043022156, "learning_rate": 0.0001988332609450996, "loss": 0.4724, "step": 1087 }, { "epoch": 0.1500793158148838, "grad_norm": 0.7132527232170105, "learning_rate": 0.0001988310565714493, "loss": 0.9453, "step": 1088 }, { "epoch": 0.15021725636250777, "grad_norm": 0.5154200196266174, "learning_rate": 0.00019882885012959434, "loss": 0.5259, "step": 1089 }, { "epoch": 0.15035519691013174, "grad_norm": 0.8650720715522766, "learning_rate": 0.00019882664161958085, "loss": 0.6371, "step": 1090 }, { "epoch": 0.1504931374577557, "grad_norm": 0.5647125840187073, "learning_rate": 0.00019882443104145506, "loss": 0.5616, "step": 1091 }, { "epoch": 0.15063107800537967, "grad_norm": 0.5622859001159668, "learning_rate": 0.0001988222183952632, "loss": 0.4064, "step": 1092 }, { "epoch": 0.15076901855300365, "grad_norm": 0.6190979480743408, "learning_rate": 0.0001988200036810516, "loss": 0.6327, "step": 1093 }, { "epoch": 0.15090695910062762, "grad_norm": 0.7699036598205566, "learning_rate": 0.00019881778689886657, "loss": 0.5536, "step": 1094 }, { "epoch": 0.1510448996482516, "grad_norm": 0.6469339728355408, "learning_rate": 0.00019881556804875457, "loss": 0.3551, "step": 1095 }, { "epoch": 0.15118284019587558, "grad_norm": 1.0487968921661377, "learning_rate": 0.00019881334713076204, "loss": 0.9006, "step": 1096 }, { "epoch": 0.15132078074349956, "grad_norm": 0.670295000076294, "learning_rate": 0.00019881112414493534, "loss": 0.7408, "step": 1097 }, { "epoch": 0.15145872129112353, "grad_norm": 0.657656729221344, "learning_rate": 0.00019880889909132109, "loss": 0.5069, "step": 1098 }, { "epoch": 0.1515966618387475, "grad_norm": 0.7504099011421204, "learning_rate": 0.00019880667196996583, "loss": 0.6002, "step": 1099 }, { "epoch": 0.1517346023863715, "grad_norm": 0.7372997403144836, "learning_rate": 0.00019880444278091614, "loss": 0.6913, "step": 1100 }, { "epoch": 0.15187254293399544, "grad_norm": 0.8303672075271606, "learning_rate": 0.00019880221152421873, "loss": 0.7843, "step": 1101 }, { "epoch": 0.1520104834816194, "grad_norm": 0.8954166769981384, "learning_rate": 0.00019879997819992023, "loss": 0.5839, "step": 1102 }, { "epoch": 0.1521484240292434, "grad_norm": 1.0840219259262085, "learning_rate": 0.00019879774280806738, "loss": 0.6029, "step": 1103 }, { "epoch": 0.15228636457686737, "grad_norm": 0.6892568469047546, "learning_rate": 0.00019879550534870698, "loss": 0.4332, "step": 1104 }, { "epoch": 0.15242430512449134, "grad_norm": 0.9044513702392578, "learning_rate": 0.00019879326582188588, "loss": 0.6318, "step": 1105 }, { "epoch": 0.15256224567211532, "grad_norm": 0.5749319791793823, "learning_rate": 0.0001987910242276509, "loss": 0.6552, "step": 1106 }, { "epoch": 0.1527001862197393, "grad_norm": 0.40898945927619934, "learning_rate": 0.00019878878056604898, "loss": 0.303, "step": 1107 }, { "epoch": 0.15283812676736327, "grad_norm": 0.6765570640563965, "learning_rate": 0.00019878653483712704, "loss": 0.8492, "step": 1108 }, { "epoch": 0.15297606731498725, "grad_norm": 0.6239065527915955, "learning_rate": 0.0001987842870409321, "loss": 0.7018, "step": 1109 }, { "epoch": 0.1531140078626112, "grad_norm": 0.5025203227996826, "learning_rate": 0.00019878203717751117, "loss": 0.4241, "step": 1110 }, { "epoch": 0.15325194841023518, "grad_norm": 0.7260491251945496, "learning_rate": 0.00019877978524691141, "loss": 0.7082, "step": 1111 }, { "epoch": 0.15338988895785916, "grad_norm": 0.6543614864349365, "learning_rate": 0.00019877753124917984, "loss": 0.7759, "step": 1112 }, { "epoch": 0.15352782950548313, "grad_norm": 0.540330171585083, "learning_rate": 0.0001987752751843637, "loss": 0.5454, "step": 1113 }, { "epoch": 0.1536657700531071, "grad_norm": 0.5570470094680786, "learning_rate": 0.00019877301705251016, "loss": 0.4424, "step": 1114 }, { "epoch": 0.1538037106007311, "grad_norm": 0.5284799933433533, "learning_rate": 0.00019877075685366654, "loss": 0.2687, "step": 1115 }, { "epoch": 0.15394165114835506, "grad_norm": 0.5984044671058655, "learning_rate": 0.00019876849458788008, "loss": 0.3418, "step": 1116 }, { "epoch": 0.15407959169597904, "grad_norm": 0.622887372970581, "learning_rate": 0.00019876623025519811, "loss": 0.6005, "step": 1117 }, { "epoch": 0.15421753224360302, "grad_norm": 0.7443903088569641, "learning_rate": 0.00019876396385566807, "loss": 0.9561, "step": 1118 }, { "epoch": 0.154355472791227, "grad_norm": 0.6838943958282471, "learning_rate": 0.00019876169538933733, "loss": 0.5958, "step": 1119 }, { "epoch": 0.15449341333885094, "grad_norm": 0.4985027611255646, "learning_rate": 0.0001987594248562534, "loss": 0.5444, "step": 1120 }, { "epoch": 0.15463135388647492, "grad_norm": 0.8119350075721741, "learning_rate": 0.00019875715225646382, "loss": 0.4419, "step": 1121 }, { "epoch": 0.1547692944340989, "grad_norm": 0.6677895188331604, "learning_rate": 0.00019875487759001605, "loss": 0.8143, "step": 1122 }, { "epoch": 0.15490723498172287, "grad_norm": 0.7385493516921997, "learning_rate": 0.0001987526008569578, "loss": 0.5564, "step": 1123 }, { "epoch": 0.15504517552934685, "grad_norm": 0.6388130784034729, "learning_rate": 0.00019875032205733667, "loss": 0.608, "step": 1124 }, { "epoch": 0.15518311607697083, "grad_norm": 0.722061812877655, "learning_rate": 0.00019874804119120033, "loss": 0.5137, "step": 1125 }, { "epoch": 0.1553210566245948, "grad_norm": 0.6539713144302368, "learning_rate": 0.00019874575825859655, "loss": 0.5288, "step": 1126 }, { "epoch": 0.15545899717221878, "grad_norm": 0.5178963541984558, "learning_rate": 0.00019874347325957306, "loss": 0.4996, "step": 1127 }, { "epoch": 0.15559693771984276, "grad_norm": 1.919731855392456, "learning_rate": 0.0001987411861941777, "loss": 0.6895, "step": 1128 }, { "epoch": 0.1557348782674667, "grad_norm": 1.0255264043807983, "learning_rate": 0.0001987388970624583, "loss": 0.8685, "step": 1129 }, { "epoch": 0.15587281881509069, "grad_norm": 0.557336151599884, "learning_rate": 0.00019873660586446285, "loss": 0.3655, "step": 1130 }, { "epoch": 0.15601075936271466, "grad_norm": 0.6181217432022095, "learning_rate": 0.00019873431260023923, "loss": 0.5749, "step": 1131 }, { "epoch": 0.15614869991033864, "grad_norm": 0.9874257445335388, "learning_rate": 0.00019873201726983543, "loss": 0.632, "step": 1132 }, { "epoch": 0.15628664045796262, "grad_norm": 0.8962194323539734, "learning_rate": 0.00019872971987329948, "loss": 0.6092, "step": 1133 }, { "epoch": 0.1564245810055866, "grad_norm": 0.6445840001106262, "learning_rate": 0.00019872742041067952, "loss": 0.5592, "step": 1134 }, { "epoch": 0.15656252155321057, "grad_norm": 0.7233692407608032, "learning_rate": 0.00019872511888202358, "loss": 0.6566, "step": 1135 }, { "epoch": 0.15670046210083455, "grad_norm": 0.5418415665626526, "learning_rate": 0.00019872281528737987, "loss": 0.411, "step": 1136 }, { "epoch": 0.15683840264845852, "grad_norm": 0.651696503162384, "learning_rate": 0.0001987205096267966, "loss": 0.4241, "step": 1137 }, { "epoch": 0.1569763431960825, "grad_norm": 0.8026519417762756, "learning_rate": 0.00019871820190032203, "loss": 0.4847, "step": 1138 }, { "epoch": 0.15711428374370645, "grad_norm": 0.5497678518295288, "learning_rate": 0.00019871589210800438, "loss": 0.37, "step": 1139 }, { "epoch": 0.15725222429133043, "grad_norm": 0.5791033506393433, "learning_rate": 0.00019871358024989207, "loss": 0.6614, "step": 1140 }, { "epoch": 0.1573901648389544, "grad_norm": 2.8463683128356934, "learning_rate": 0.00019871126632603345, "loss": 0.9519, "step": 1141 }, { "epoch": 0.15752810538657838, "grad_norm": 0.757117509841919, "learning_rate": 0.00019870895033647696, "loss": 0.5509, "step": 1142 }, { "epoch": 0.15766604593420236, "grad_norm": 0.5873023867607117, "learning_rate": 0.00019870663228127103, "loss": 0.358, "step": 1143 }, { "epoch": 0.15780398648182634, "grad_norm": 0.637850284576416, "learning_rate": 0.00019870431216046419, "loss": 0.8534, "step": 1144 }, { "epoch": 0.1579419270294503, "grad_norm": 0.5257639288902283, "learning_rate": 0.00019870198997410502, "loss": 0.4937, "step": 1145 }, { "epoch": 0.1580798675770743, "grad_norm": 1.0933887958526611, "learning_rate": 0.00019869966572224205, "loss": 0.8645, "step": 1146 }, { "epoch": 0.15821780812469827, "grad_norm": 0.8901828527450562, "learning_rate": 0.00019869733940492396, "loss": 0.7593, "step": 1147 }, { "epoch": 0.15835574867232222, "grad_norm": 0.7828998565673828, "learning_rate": 0.00019869501102219946, "loss": 0.4997, "step": 1148 }, { "epoch": 0.1584936892199462, "grad_norm": 0.684245765209198, "learning_rate": 0.0001986926805741172, "loss": 0.6882, "step": 1149 }, { "epoch": 0.15863162976757017, "grad_norm": 0.7516327500343323, "learning_rate": 0.000198690348060726, "loss": 0.8402, "step": 1150 }, { "epoch": 0.15876957031519415, "grad_norm": 0.667762279510498, "learning_rate": 0.00019868801348207467, "loss": 0.7318, "step": 1151 }, { "epoch": 0.15890751086281812, "grad_norm": 0.5685754418373108, "learning_rate": 0.00019868567683821206, "loss": 0.3791, "step": 1152 }, { "epoch": 0.1590454514104421, "grad_norm": 1.2174463272094727, "learning_rate": 0.00019868333812918704, "loss": 1.3797, "step": 1153 }, { "epoch": 0.15918339195806608, "grad_norm": 1.0083547830581665, "learning_rate": 0.0001986809973550486, "loss": 0.6277, "step": 1154 }, { "epoch": 0.15932133250569006, "grad_norm": 0.5130175948143005, "learning_rate": 0.00019867865451584572, "loss": 0.4544, "step": 1155 }, { "epoch": 0.15945927305331403, "grad_norm": 0.46207770705223083, "learning_rate": 0.00019867630961162739, "loss": 0.3214, "step": 1156 }, { "epoch": 0.159597213600938, "grad_norm": 0.6137313842773438, "learning_rate": 0.00019867396264244271, "loss": 0.6259, "step": 1157 }, { "epoch": 0.15973515414856196, "grad_norm": 0.5760210752487183, "learning_rate": 0.00019867161360834077, "loss": 0.3862, "step": 1158 }, { "epoch": 0.15987309469618594, "grad_norm": 0.6963263750076294, "learning_rate": 0.00019866926250937075, "loss": 0.747, "step": 1159 }, { "epoch": 0.1600110352438099, "grad_norm": 0.5522167682647705, "learning_rate": 0.00019866690934558184, "loss": 0.3872, "step": 1160 }, { "epoch": 0.1601489757914339, "grad_norm": 0.6380563974380493, "learning_rate": 0.0001986645541170233, "loss": 0.4821, "step": 1161 }, { "epoch": 0.16028691633905787, "grad_norm": 0.8671901226043701, "learning_rate": 0.00019866219682374437, "loss": 0.8727, "step": 1162 }, { "epoch": 0.16042485688668184, "grad_norm": 0.7571714520454407, "learning_rate": 0.00019865983746579445, "loss": 0.4456, "step": 1163 }, { "epoch": 0.16056279743430582, "grad_norm": 0.6641622185707092, "learning_rate": 0.00019865747604322283, "loss": 0.6605, "step": 1164 }, { "epoch": 0.1607007379819298, "grad_norm": 0.5024450421333313, "learning_rate": 0.00019865511255607901, "loss": 0.6206, "step": 1165 }, { "epoch": 0.16083867852955377, "grad_norm": 0.5311720967292786, "learning_rate": 0.00019865274700441244, "loss": 0.5197, "step": 1166 }, { "epoch": 0.16097661907717772, "grad_norm": 0.5018906593322754, "learning_rate": 0.00019865037938827256, "loss": 0.5966, "step": 1167 }, { "epoch": 0.1611145596248017, "grad_norm": 0.5599327683448792, "learning_rate": 0.00019864800970770897, "loss": 0.3931, "step": 1168 }, { "epoch": 0.16125250017242568, "grad_norm": 0.5333655476570129, "learning_rate": 0.00019864563796277124, "loss": 0.4852, "step": 1169 }, { "epoch": 0.16139044072004965, "grad_norm": 0.708154022693634, "learning_rate": 0.000198643264153509, "loss": 0.4032, "step": 1170 }, { "epoch": 0.16152838126767363, "grad_norm": 0.6808910965919495, "learning_rate": 0.00019864088827997194, "loss": 0.7821, "step": 1171 }, { "epoch": 0.1616663218152976, "grad_norm": 3.688157081604004, "learning_rate": 0.00019863851034220977, "loss": 0.611, "step": 1172 }, { "epoch": 0.16180426236292159, "grad_norm": 0.5643519163131714, "learning_rate": 0.00019863613034027224, "loss": 0.4735, "step": 1173 }, { "epoch": 0.16194220291054556, "grad_norm": 1.0255013704299927, "learning_rate": 0.0001986337482742092, "loss": 1.0907, "step": 1174 }, { "epoch": 0.16208014345816954, "grad_norm": 0.7241511940956116, "learning_rate": 0.00019863136414407042, "loss": 1.0426, "step": 1175 }, { "epoch": 0.1622180840057935, "grad_norm": 0.4938182532787323, "learning_rate": 0.00019862897794990588, "loss": 0.3653, "step": 1176 }, { "epoch": 0.16235602455341747, "grad_norm": 0.8098915219306946, "learning_rate": 0.00019862658969176548, "loss": 0.5998, "step": 1177 }, { "epoch": 0.16249396510104144, "grad_norm": 0.45475316047668457, "learning_rate": 0.00019862419936969916, "loss": 0.1607, "step": 1178 }, { "epoch": 0.16263190564866542, "grad_norm": 0.662933349609375, "learning_rate": 0.00019862180698375695, "loss": 0.4803, "step": 1179 }, { "epoch": 0.1627698461962894, "grad_norm": 0.9009214639663696, "learning_rate": 0.000198619412533989, "loss": 0.5197, "step": 1180 }, { "epoch": 0.16290778674391337, "grad_norm": 0.6184802055358887, "learning_rate": 0.00019861701602044533, "loss": 0.6945, "step": 1181 }, { "epoch": 0.16304572729153735, "grad_norm": 0.7415738105773926, "learning_rate": 0.00019861461744317613, "loss": 0.5511, "step": 1182 }, { "epoch": 0.16318366783916133, "grad_norm": 0.8678780794143677, "learning_rate": 0.00019861221680223158, "loss": 0.4388, "step": 1183 }, { "epoch": 0.1633216083867853, "grad_norm": 0.8106904625892639, "learning_rate": 0.0001986098140976619, "loss": 0.8894, "step": 1184 }, { "epoch": 0.16345954893440928, "grad_norm": 0.62105393409729, "learning_rate": 0.00019860740932951742, "loss": 0.339, "step": 1185 }, { "epoch": 0.16359748948203323, "grad_norm": 0.7359816431999207, "learning_rate": 0.00019860500249784842, "loss": 0.619, "step": 1186 }, { "epoch": 0.1637354300296572, "grad_norm": 0.5324639678001404, "learning_rate": 0.00019860259360270525, "loss": 0.5086, "step": 1187 }, { "epoch": 0.16387337057728119, "grad_norm": 0.5897564888000488, "learning_rate": 0.0001986001826441384, "loss": 0.6535, "step": 1188 }, { "epoch": 0.16401131112490516, "grad_norm": 1.0819058418273926, "learning_rate": 0.00019859776962219826, "loss": 0.7402, "step": 1189 }, { "epoch": 0.16414925167252914, "grad_norm": 0.6753519773483276, "learning_rate": 0.00019859535453693532, "loss": 0.9195, "step": 1190 }, { "epoch": 0.16428719222015312, "grad_norm": 0.6425819993019104, "learning_rate": 0.00019859293738840018, "loss": 0.4202, "step": 1191 }, { "epoch": 0.1644251327677771, "grad_norm": 0.6733855605125427, "learning_rate": 0.00019859051817664334, "loss": 0.5485, "step": 1192 }, { "epoch": 0.16456307331540107, "grad_norm": 0.604741096496582, "learning_rate": 0.0001985880969017155, "loss": 0.8566, "step": 1193 }, { "epoch": 0.16470101386302505, "grad_norm": 0.7113268971443176, "learning_rate": 0.00019858567356366725, "loss": 0.4957, "step": 1194 }, { "epoch": 0.164838954410649, "grad_norm": 0.6288824081420898, "learning_rate": 0.0001985832481625494, "loss": 0.4495, "step": 1195 }, { "epoch": 0.16497689495827297, "grad_norm": 0.5524368286132812, "learning_rate": 0.00019858082069841267, "loss": 0.2739, "step": 1196 }, { "epoch": 0.16511483550589695, "grad_norm": 0.5463117361068726, "learning_rate": 0.0001985783911713078, "loss": 0.5338, "step": 1197 }, { "epoch": 0.16525277605352093, "grad_norm": 0.7307978868484497, "learning_rate": 0.00019857595958128573, "loss": 0.4713, "step": 1198 }, { "epoch": 0.1653907166011449, "grad_norm": 0.6012675762176514, "learning_rate": 0.00019857352592839723, "loss": 0.4267, "step": 1199 }, { "epoch": 0.16552865714876888, "grad_norm": 0.6487913727760315, "learning_rate": 0.00019857109021269333, "loss": 0.5291, "step": 1200 }, { "epoch": 0.16552865714876888, "eval_loss": 0.6941049098968506, "eval_runtime": 23.4931, "eval_samples_per_second": 2.511, "eval_steps_per_second": 2.511, "step": 1200 }, { "epoch": 0.16566659769639286, "grad_norm": 0.7660422325134277, "learning_rate": 0.00019856865243422496, "loss": 0.4538, "step": 1201 }, { "epoch": 0.16580453824401684, "grad_norm": 0.5843665599822998, "learning_rate": 0.00019856621259304311, "loss": 0.5072, "step": 1202 }, { "epoch": 0.1659424787916408, "grad_norm": 0.8580763339996338, "learning_rate": 0.0001985637706891989, "loss": 0.8842, "step": 1203 }, { "epoch": 0.1660804193392648, "grad_norm": 0.576759397983551, "learning_rate": 0.00019856132672274338, "loss": 0.3074, "step": 1204 }, { "epoch": 0.16621835988688874, "grad_norm": 0.7838415503501892, "learning_rate": 0.0001985588806937277, "loss": 0.8171, "step": 1205 }, { "epoch": 0.16635630043451272, "grad_norm": 0.5142651200294495, "learning_rate": 0.00019855643260220304, "loss": 0.522, "step": 1206 }, { "epoch": 0.1664942409821367, "grad_norm": 0.4573938548564911, "learning_rate": 0.00019855398244822067, "loss": 0.4331, "step": 1207 }, { "epoch": 0.16663218152976067, "grad_norm": 0.9972742795944214, "learning_rate": 0.00019855153023183187, "loss": 0.6818, "step": 1208 }, { "epoch": 0.16677012207738465, "grad_norm": 0.9345471858978271, "learning_rate": 0.00019854907595308788, "loss": 0.8203, "step": 1209 }, { "epoch": 0.16690806262500862, "grad_norm": 1.159678339958191, "learning_rate": 0.00019854661961204016, "loss": 0.8935, "step": 1210 }, { "epoch": 0.1670460031726326, "grad_norm": 0.6871662735939026, "learning_rate": 0.00019854416120874, "loss": 0.7125, "step": 1211 }, { "epoch": 0.16718394372025658, "grad_norm": 0.6230341792106628, "learning_rate": 0.00019854170074323896, "loss": 0.7967, "step": 1212 }, { "epoch": 0.16732188426788056, "grad_norm": 0.6269369125366211, "learning_rate": 0.0001985392382155884, "loss": 0.6406, "step": 1213 }, { "epoch": 0.1674598248155045, "grad_norm": 0.7250765562057495, "learning_rate": 0.00019853677362584, "loss": 0.3399, "step": 1214 }, { "epoch": 0.16759776536312848, "grad_norm": 0.4939933121204376, "learning_rate": 0.00019853430697404524, "loss": 0.5202, "step": 1215 }, { "epoch": 0.16773570591075246, "grad_norm": 0.7530661821365356, "learning_rate": 0.00019853183826025575, "loss": 0.7126, "step": 1216 }, { "epoch": 0.16787364645837644, "grad_norm": 0.7523278594017029, "learning_rate": 0.00019852936748452322, "loss": 1.2054, "step": 1217 }, { "epoch": 0.1680115870060004, "grad_norm": 0.6227564811706543, "learning_rate": 0.00019852689464689935, "loss": 0.5264, "step": 1218 }, { "epoch": 0.1681495275536244, "grad_norm": 0.6861101388931274, "learning_rate": 0.00019852441974743586, "loss": 0.846, "step": 1219 }, { "epoch": 0.16828746810124837, "grad_norm": 0.7924994230270386, "learning_rate": 0.00019852194278618458, "loss": 0.8325, "step": 1220 }, { "epoch": 0.16842540864887234, "grad_norm": 0.6716384291648865, "learning_rate": 0.0001985194637631973, "loss": 0.8085, "step": 1221 }, { "epoch": 0.16856334919649632, "grad_norm": 0.7923401594161987, "learning_rate": 0.00019851698267852593, "loss": 0.7989, "step": 1222 }, { "epoch": 0.1687012897441203, "grad_norm": 0.47927549481391907, "learning_rate": 0.0001985144995322224, "loss": 0.5192, "step": 1223 }, { "epoch": 0.16883923029174425, "grad_norm": 1.2143771648406982, "learning_rate": 0.00019851201432433862, "loss": 0.8591, "step": 1224 }, { "epoch": 0.16897717083936822, "grad_norm": 0.7667599320411682, "learning_rate": 0.00019850952705492665, "loss": 1.102, "step": 1225 }, { "epoch": 0.1691151113869922, "grad_norm": 0.8590033054351807, "learning_rate": 0.00019850703772403852, "loss": 0.5396, "step": 1226 }, { "epoch": 0.16925305193461618, "grad_norm": 0.5094299912452698, "learning_rate": 0.00019850454633172631, "loss": 0.4253, "step": 1227 }, { "epoch": 0.16939099248224015, "grad_norm": 0.7882555723190308, "learning_rate": 0.00019850205287804223, "loss": 0.9953, "step": 1228 }, { "epoch": 0.16952893302986413, "grad_norm": 0.6991984844207764, "learning_rate": 0.00019849955736303838, "loss": 0.6115, "step": 1229 }, { "epoch": 0.1696668735774881, "grad_norm": 0.5561696887016296, "learning_rate": 0.000198497059786767, "loss": 0.4541, "step": 1230 }, { "epoch": 0.16980481412511209, "grad_norm": 0.6001817584037781, "learning_rate": 0.00019849456014928033, "loss": 0.3223, "step": 1231 }, { "epoch": 0.16994275467273606, "grad_norm": 0.9406478404998779, "learning_rate": 0.00019849205845063077, "loss": 0.6149, "step": 1232 }, { "epoch": 0.17008069522036, "grad_norm": 0.7759897708892822, "learning_rate": 0.0001984895546908706, "loss": 0.6282, "step": 1233 }, { "epoch": 0.170218635767984, "grad_norm": 1.1730425357818604, "learning_rate": 0.0001984870488700522, "loss": 0.9585, "step": 1234 }, { "epoch": 0.17035657631560797, "grad_norm": 0.8819521069526672, "learning_rate": 0.00019848454098822803, "loss": 0.6042, "step": 1235 }, { "epoch": 0.17049451686323194, "grad_norm": 0.6303958892822266, "learning_rate": 0.0001984820310454506, "loss": 0.4283, "step": 1236 }, { "epoch": 0.17063245741085592, "grad_norm": 0.6245417594909668, "learning_rate": 0.0001984795190417724, "loss": 0.4636, "step": 1237 }, { "epoch": 0.1707703979584799, "grad_norm": 0.591786801815033, "learning_rate": 0.00019847700497724604, "loss": 0.5676, "step": 1238 }, { "epoch": 0.17090833850610387, "grad_norm": 0.8272263407707214, "learning_rate": 0.00019847448885192408, "loss": 0.5375, "step": 1239 }, { "epoch": 0.17104627905372785, "grad_norm": 0.5363914370536804, "learning_rate": 0.00019847197066585922, "loss": 0.3348, "step": 1240 }, { "epoch": 0.17118421960135183, "grad_norm": 0.6910378932952881, "learning_rate": 0.0001984694504191041, "loss": 0.5404, "step": 1241 }, { "epoch": 0.1713221601489758, "grad_norm": 0.8528624176979065, "learning_rate": 0.00019846692811171151, "loss": 0.6671, "step": 1242 }, { "epoch": 0.17146010069659975, "grad_norm": 0.7034489512443542, "learning_rate": 0.0001984644037437342, "loss": 0.4098, "step": 1243 }, { "epoch": 0.17159804124422373, "grad_norm": 0.821880578994751, "learning_rate": 0.00019846187731522506, "loss": 0.485, "step": 1244 }, { "epoch": 0.1717359817918477, "grad_norm": 0.6154242753982544, "learning_rate": 0.00019845934882623686, "loss": 0.4786, "step": 1245 }, { "epoch": 0.17187392233947169, "grad_norm": 0.499807208776474, "learning_rate": 0.00019845681827682264, "loss": 0.4803, "step": 1246 }, { "epoch": 0.17201186288709566, "grad_norm": 0.5233286619186401, "learning_rate": 0.0001984542856670352, "loss": 0.3539, "step": 1247 }, { "epoch": 0.17214980343471964, "grad_norm": 0.771405816078186, "learning_rate": 0.00019845175099692766, "loss": 0.7544, "step": 1248 }, { "epoch": 0.17228774398234362, "grad_norm": 0.6272239089012146, "learning_rate": 0.00019844921426655305, "loss": 0.6721, "step": 1249 }, { "epoch": 0.1724256845299676, "grad_norm": 0.6721327900886536, "learning_rate": 0.00019844667547596438, "loss": 0.6322, "step": 1250 }, { "epoch": 0.17256362507759157, "grad_norm": 0.7286562323570251, "learning_rate": 0.00019844413462521488, "loss": 0.4269, "step": 1251 }, { "epoch": 0.17270156562521552, "grad_norm": 0.7433802485466003, "learning_rate": 0.0001984415917143576, "loss": 0.833, "step": 1252 }, { "epoch": 0.1728395061728395, "grad_norm": 0.7148652076721191, "learning_rate": 0.00019843904674344585, "loss": 0.6804, "step": 1253 }, { "epoch": 0.17297744672046347, "grad_norm": 0.7036007642745972, "learning_rate": 0.0001984364997125329, "loss": 0.5319, "step": 1254 }, { "epoch": 0.17311538726808745, "grad_norm": 0.6977048516273499, "learning_rate": 0.000198433950621672, "loss": 0.9111, "step": 1255 }, { "epoch": 0.17325332781571143, "grad_norm": 0.888334333896637, "learning_rate": 0.0001984313994709165, "loss": 0.7865, "step": 1256 }, { "epoch": 0.1733912683633354, "grad_norm": 0.6975079774856567, "learning_rate": 0.0001984288462603198, "loss": 0.6823, "step": 1257 }, { "epoch": 0.17352920891095938, "grad_norm": 0.7903949022293091, "learning_rate": 0.0001984262909899353, "loss": 0.6823, "step": 1258 }, { "epoch": 0.17366714945858336, "grad_norm": 0.8779317140579224, "learning_rate": 0.0001984237336598165, "loss": 1.1926, "step": 1259 }, { "epoch": 0.17380509000620734, "grad_norm": 0.6393109560012817, "learning_rate": 0.00019842117427001693, "loss": 0.4333, "step": 1260 }, { "epoch": 0.17394303055383128, "grad_norm": 0.6505012512207031, "learning_rate": 0.00019841861282059013, "loss": 0.59, "step": 1261 }, { "epoch": 0.17408097110145526, "grad_norm": 0.6244116425514221, "learning_rate": 0.0001984160493115897, "loss": 0.607, "step": 1262 }, { "epoch": 0.17421891164907924, "grad_norm": 0.6466581225395203, "learning_rate": 0.0001984134837430693, "loss": 0.5512, "step": 1263 }, { "epoch": 0.17435685219670322, "grad_norm": 0.700474202632904, "learning_rate": 0.0001984109161150826, "loss": 0.8231, "step": 1264 }, { "epoch": 0.1744947927443272, "grad_norm": 0.5180341601371765, "learning_rate": 0.00019840834642768335, "loss": 0.4056, "step": 1265 }, { "epoch": 0.17463273329195117, "grad_norm": 0.5920750498771667, "learning_rate": 0.0001984057746809253, "loss": 0.7128, "step": 1266 }, { "epoch": 0.17477067383957515, "grad_norm": 0.6344928741455078, "learning_rate": 0.0001984032008748623, "loss": 0.3764, "step": 1267 }, { "epoch": 0.17490861438719912, "grad_norm": 0.894112765789032, "learning_rate": 0.0001984006250095482, "loss": 0.4984, "step": 1268 }, { "epoch": 0.1750465549348231, "grad_norm": 0.6044466495513916, "learning_rate": 0.0001983980470850369, "loss": 0.385, "step": 1269 }, { "epoch": 0.17518449548244708, "grad_norm": 0.5050377249717712, "learning_rate": 0.00019839546710138234, "loss": 0.4992, "step": 1270 }, { "epoch": 0.17532243603007103, "grad_norm": 0.6368976831436157, "learning_rate": 0.00019839288505863853, "loss": 0.471, "step": 1271 }, { "epoch": 0.175460376577695, "grad_norm": 0.7522995471954346, "learning_rate": 0.00019839030095685948, "loss": 0.9362, "step": 1272 }, { "epoch": 0.17559831712531898, "grad_norm": 0.5082296133041382, "learning_rate": 0.0001983877147960993, "loss": 0.4645, "step": 1273 }, { "epoch": 0.17573625767294296, "grad_norm": 0.7700505256652832, "learning_rate": 0.00019838512657641206, "loss": 0.4923, "step": 1274 }, { "epoch": 0.17587419822056694, "grad_norm": 0.6577361226081848, "learning_rate": 0.00019838253629785198, "loss": 0.5454, "step": 1275 }, { "epoch": 0.1760121387681909, "grad_norm": 1.0001535415649414, "learning_rate": 0.0001983799439604732, "loss": 0.331, "step": 1276 }, { "epoch": 0.1761500793158149, "grad_norm": 0.8388822078704834, "learning_rate": 0.00019837734956433003, "loss": 0.6103, "step": 1277 }, { "epoch": 0.17628801986343887, "grad_norm": 0.6725074052810669, "learning_rate": 0.00019837475310947674, "loss": 0.4451, "step": 1278 }, { "epoch": 0.17642596041106284, "grad_norm": 0.5467301607131958, "learning_rate": 0.00019837215459596766, "loss": 0.4478, "step": 1279 }, { "epoch": 0.1765639009586868, "grad_norm": 0.47652316093444824, "learning_rate": 0.00019836955402385717, "loss": 0.4242, "step": 1280 }, { "epoch": 0.17670184150631077, "grad_norm": 1.4824885129928589, "learning_rate": 0.0001983669513931997, "loss": 0.9623, "step": 1281 }, { "epoch": 0.17683978205393475, "grad_norm": 0.5727549195289612, "learning_rate": 0.0001983643467040497, "loss": 0.5369, "step": 1282 }, { "epoch": 0.17697772260155872, "grad_norm": 1.101233959197998, "learning_rate": 0.00019836173995646167, "loss": 0.8185, "step": 1283 }, { "epoch": 0.1771156631491827, "grad_norm": 0.6880276203155518, "learning_rate": 0.0001983591311504902, "loss": 0.4434, "step": 1284 }, { "epoch": 0.17725360369680668, "grad_norm": 0.827042818069458, "learning_rate": 0.00019835652028618985, "loss": 0.5137, "step": 1285 }, { "epoch": 0.17739154424443065, "grad_norm": 0.5843849182128906, "learning_rate": 0.00019835390736361525, "loss": 0.5191, "step": 1286 }, { "epoch": 0.17752948479205463, "grad_norm": 0.571264386177063, "learning_rate": 0.00019835129238282112, "loss": 0.7086, "step": 1287 }, { "epoch": 0.1776674253396786, "grad_norm": 0.6771243214607239, "learning_rate": 0.00019834867534386215, "loss": 0.6968, "step": 1288 }, { "epoch": 0.17780536588730259, "grad_norm": 0.6978261470794678, "learning_rate": 0.0001983460562467931, "loss": 0.5242, "step": 1289 }, { "epoch": 0.17794330643492653, "grad_norm": 0.7202770113945007, "learning_rate": 0.00019834343509166882, "loss": 0.6071, "step": 1290 }, { "epoch": 0.1780812469825505, "grad_norm": 0.9898126125335693, "learning_rate": 0.00019834081187854413, "loss": 0.4903, "step": 1291 }, { "epoch": 0.1782191875301745, "grad_norm": 0.8469486236572266, "learning_rate": 0.0001983381866074739, "loss": 0.6258, "step": 1292 }, { "epoch": 0.17835712807779847, "grad_norm": 0.9509524703025818, "learning_rate": 0.00019833555927851313, "loss": 0.668, "step": 1293 }, { "epoch": 0.17849506862542244, "grad_norm": 0.5267693400382996, "learning_rate": 0.00019833292989171675, "loss": 0.3385, "step": 1294 }, { "epoch": 0.17863300917304642, "grad_norm": 1.0337375402450562, "learning_rate": 0.00019833029844713984, "loss": 0.5466, "step": 1295 }, { "epoch": 0.1787709497206704, "grad_norm": 0.7010806798934937, "learning_rate": 0.00019832766494483738, "loss": 0.3554, "step": 1296 }, { "epoch": 0.17890889026829437, "grad_norm": 0.7125682234764099, "learning_rate": 0.00019832502938486456, "loss": 0.5388, "step": 1297 }, { "epoch": 0.17904683081591835, "grad_norm": 0.8770589828491211, "learning_rate": 0.00019832239176727651, "loss": 1.0312, "step": 1298 }, { "epoch": 0.1791847713635423, "grad_norm": 0.7130751013755798, "learning_rate": 0.0001983197520921284, "loss": 0.5928, "step": 1299 }, { "epoch": 0.17932271191116628, "grad_norm": 0.5972537994384766, "learning_rate": 0.0001983171103594755, "loss": 0.4878, "step": 1300 }, { "epoch": 0.17946065245879025, "grad_norm": 0.8252794146537781, "learning_rate": 0.00019831446656937311, "loss": 0.6034, "step": 1301 }, { "epoch": 0.17959859300641423, "grad_norm": 1.9833587408065796, "learning_rate": 0.0001983118207218765, "loss": 0.8991, "step": 1302 }, { "epoch": 0.1797365335540382, "grad_norm": 0.69463711977005, "learning_rate": 0.00019830917281704106, "loss": 0.5663, "step": 1303 }, { "epoch": 0.17987447410166219, "grad_norm": 0.9654061794281006, "learning_rate": 0.00019830652285492226, "loss": 0.9713, "step": 1304 }, { "epoch": 0.18001241464928616, "grad_norm": 1.0621752738952637, "learning_rate": 0.00019830387083557546, "loss": 0.7269, "step": 1305 }, { "epoch": 0.18015035519691014, "grad_norm": 0.7390806674957275, "learning_rate": 0.00019830121675905622, "loss": 0.8111, "step": 1306 }, { "epoch": 0.18028829574453412, "grad_norm": 0.6989424824714661, "learning_rate": 0.00019829856062542005, "loss": 0.6935, "step": 1307 }, { "epoch": 0.1804262362921581, "grad_norm": 0.6748825907707214, "learning_rate": 0.00019829590243472257, "loss": 0.8181, "step": 1308 }, { "epoch": 0.18056417683978204, "grad_norm": 0.8862558007240295, "learning_rate": 0.00019829324218701937, "loss": 0.7099, "step": 1309 }, { "epoch": 0.18070211738740602, "grad_norm": 0.6579875349998474, "learning_rate": 0.0001982905798823661, "loss": 0.4928, "step": 1310 }, { "epoch": 0.18084005793503, "grad_norm": 0.7875022292137146, "learning_rate": 0.00019828791552081861, "loss": 1.074, "step": 1311 }, { "epoch": 0.18097799848265397, "grad_norm": 0.6504039764404297, "learning_rate": 0.00019828524910243248, "loss": 0.6877, "step": 1312 }, { "epoch": 0.18111593903027795, "grad_norm": 0.6015701293945312, "learning_rate": 0.0001982825806272636, "loss": 0.5485, "step": 1313 }, { "epoch": 0.18125387957790193, "grad_norm": 0.5628058314323425, "learning_rate": 0.0001982799100953678, "loss": 0.4862, "step": 1314 }, { "epoch": 0.1813918201255259, "grad_norm": 0.6889582276344299, "learning_rate": 0.00019827723750680094, "loss": 0.4997, "step": 1315 }, { "epoch": 0.18152976067314988, "grad_norm": 0.6245821118354797, "learning_rate": 0.000198274562861619, "loss": 0.3727, "step": 1316 }, { "epoch": 0.18166770122077386, "grad_norm": 0.7458206415176392, "learning_rate": 0.00019827188615987787, "loss": 0.7899, "step": 1317 }, { "epoch": 0.1818056417683978, "grad_norm": 0.7906347513198853, "learning_rate": 0.00019826920740163365, "loss": 0.578, "step": 1318 }, { "epoch": 0.18194358231602178, "grad_norm": 0.596684992313385, "learning_rate": 0.00019826652658694237, "loss": 0.4991, "step": 1319 }, { "epoch": 0.18208152286364576, "grad_norm": 0.6883934736251831, "learning_rate": 0.00019826384371586013, "loss": 0.5329, "step": 1320 }, { "epoch": 0.18221946341126974, "grad_norm": 0.6976962685585022, "learning_rate": 0.00019826115878844304, "loss": 0.3356, "step": 1321 }, { "epoch": 0.18235740395889372, "grad_norm": 0.7330277562141418, "learning_rate": 0.00019825847180474735, "loss": 0.7215, "step": 1322 }, { "epoch": 0.1824953445065177, "grad_norm": 1.3039255142211914, "learning_rate": 0.00019825578276482924, "loss": 0.7614, "step": 1323 }, { "epoch": 0.18263328505414167, "grad_norm": 0.6663621068000793, "learning_rate": 0.00019825309166874498, "loss": 0.4895, "step": 1324 }, { "epoch": 0.18277122560176565, "grad_norm": 0.5757015347480774, "learning_rate": 0.00019825039851655092, "loss": 0.4318, "step": 1325 }, { "epoch": 0.18290916614938962, "grad_norm": 0.6393371820449829, "learning_rate": 0.0001982477033083034, "loss": 0.6978, "step": 1326 }, { "epoch": 0.1830471066970136, "grad_norm": 1.4632211923599243, "learning_rate": 0.0001982450060440588, "loss": 1.2002, "step": 1327 }, { "epoch": 0.18318504724463755, "grad_norm": 0.7976914644241333, "learning_rate": 0.0001982423067238736, "loss": 0.477, "step": 1328 }, { "epoch": 0.18332298779226153, "grad_norm": 0.9316898584365845, "learning_rate": 0.00019823960534780428, "loss": 1.0479, "step": 1329 }, { "epoch": 0.1834609283398855, "grad_norm": 0.5720082521438599, "learning_rate": 0.00019823690191590735, "loss": 0.5106, "step": 1330 }, { "epoch": 0.18359886888750948, "grad_norm": 0.6023032665252686, "learning_rate": 0.00019823419642823944, "loss": 0.6278, "step": 1331 }, { "epoch": 0.18373680943513346, "grad_norm": 0.7243465781211853, "learning_rate": 0.0001982314888848571, "loss": 0.7179, "step": 1332 }, { "epoch": 0.18387474998275743, "grad_norm": 0.5361267924308777, "learning_rate": 0.000198228779285817, "loss": 0.2759, "step": 1333 }, { "epoch": 0.1840126905303814, "grad_norm": 0.7113077044487, "learning_rate": 0.00019822606763117588, "loss": 0.6184, "step": 1334 }, { "epoch": 0.1841506310780054, "grad_norm": 0.5451545119285583, "learning_rate": 0.00019822335392099044, "loss": 0.553, "step": 1335 }, { "epoch": 0.18428857162562937, "grad_norm": 0.5552999973297119, "learning_rate": 0.00019822063815531753, "loss": 0.4488, "step": 1336 }, { "epoch": 0.18442651217325332, "grad_norm": 0.6058651804924011, "learning_rate": 0.0001982179203342139, "loss": 0.5958, "step": 1337 }, { "epoch": 0.1845644527208773, "grad_norm": 0.7197383046150208, "learning_rate": 0.00019821520045773655, "loss": 0.6075, "step": 1338 }, { "epoch": 0.18470239326850127, "grad_norm": 0.8067410588264465, "learning_rate": 0.00019821247852594225, "loss": 0.5428, "step": 1339 }, { "epoch": 0.18484033381612525, "grad_norm": 0.8334001898765564, "learning_rate": 0.00019820975453888805, "loss": 0.6031, "step": 1340 }, { "epoch": 0.18497827436374922, "grad_norm": 0.86203533411026, "learning_rate": 0.0001982070284966309, "loss": 0.2256, "step": 1341 }, { "epoch": 0.1851162149113732, "grad_norm": 0.5989809036254883, "learning_rate": 0.00019820430039922796, "loss": 0.3341, "step": 1342 }, { "epoch": 0.18525415545899718, "grad_norm": 0.7577527761459351, "learning_rate": 0.00019820157024673618, "loss": 0.3494, "step": 1343 }, { "epoch": 0.18539209600662115, "grad_norm": 0.5864545702934265, "learning_rate": 0.00019819883803921275, "loss": 0.3967, "step": 1344 }, { "epoch": 0.18553003655424513, "grad_norm": 2.4594223499298096, "learning_rate": 0.00019819610377671488, "loss": 0.7102, "step": 1345 }, { "epoch": 0.18566797710186908, "grad_norm": 0.6784563660621643, "learning_rate": 0.00019819336745929974, "loss": 0.5271, "step": 1346 }, { "epoch": 0.18580591764949306, "grad_norm": 0.8047828078269958, "learning_rate": 0.0001981906290870246, "loss": 0.796, "step": 1347 }, { "epoch": 0.18594385819711703, "grad_norm": 0.628571093082428, "learning_rate": 0.00019818788865994682, "loss": 0.4588, "step": 1348 }, { "epoch": 0.186081798744741, "grad_norm": 0.9610311388969421, "learning_rate": 0.00019818514617812367, "loss": 0.8873, "step": 1349 }, { "epoch": 0.186219739292365, "grad_norm": 0.7361674904823303, "learning_rate": 0.00019818240164161258, "loss": 0.4124, "step": 1350 }, { "epoch": 0.18635767983998897, "grad_norm": 0.5245830416679382, "learning_rate": 0.00019817965505047097, "loss": 0.5351, "step": 1351 }, { "epoch": 0.18649562038761294, "grad_norm": 0.6559715270996094, "learning_rate": 0.0001981769064047563, "loss": 0.6047, "step": 1352 }, { "epoch": 0.18663356093523692, "grad_norm": 0.7970331907272339, "learning_rate": 0.00019817415570452617, "loss": 0.7487, "step": 1353 }, { "epoch": 0.1867715014828609, "grad_norm": 0.7353190779685974, "learning_rate": 0.0001981714029498381, "loss": 0.6831, "step": 1354 }, { "epoch": 0.18690944203048487, "grad_norm": 0.9027265906333923, "learning_rate": 0.00019816864814074965, "loss": 0.5628, "step": 1355 }, { "epoch": 0.18704738257810882, "grad_norm": 0.7597387433052063, "learning_rate": 0.0001981658912773185, "loss": 0.4643, "step": 1356 }, { "epoch": 0.1871853231257328, "grad_norm": 0.6016260385513306, "learning_rate": 0.00019816313235960234, "loss": 0.5395, "step": 1357 }, { "epoch": 0.18732326367335678, "grad_norm": 0.673842191696167, "learning_rate": 0.00019816037138765892, "loss": 0.5217, "step": 1358 }, { "epoch": 0.18746120422098075, "grad_norm": 0.8495957851409912, "learning_rate": 0.000198157608361546, "loss": 0.9105, "step": 1359 }, { "epoch": 0.18759914476860473, "grad_norm": 0.7947543263435364, "learning_rate": 0.00019815484328132143, "loss": 0.5888, "step": 1360 }, { "epoch": 0.1877370853162287, "grad_norm": 0.9071835875511169, "learning_rate": 0.00019815207614704304, "loss": 0.5472, "step": 1361 }, { "epoch": 0.18787502586385268, "grad_norm": 0.6747380495071411, "learning_rate": 0.00019814930695876876, "loss": 0.488, "step": 1362 }, { "epoch": 0.18801296641147666, "grad_norm": 1.034006118774414, "learning_rate": 0.00019814653571655652, "loss": 0.7151, "step": 1363 }, { "epoch": 0.18815090695910064, "grad_norm": 0.5463393926620483, "learning_rate": 0.00019814376242046432, "loss": 0.488, "step": 1364 }, { "epoch": 0.1882888475067246, "grad_norm": 0.724962592124939, "learning_rate": 0.0001981409870705502, "loss": 0.3134, "step": 1365 }, { "epoch": 0.18842678805434857, "grad_norm": 0.8950023651123047, "learning_rate": 0.00019813820966687226, "loss": 0.6485, "step": 1366 }, { "epoch": 0.18856472860197254, "grad_norm": 0.7584651112556458, "learning_rate": 0.00019813543020948858, "loss": 0.767, "step": 1367 }, { "epoch": 0.18870266914959652, "grad_norm": 0.7800304889678955, "learning_rate": 0.00019813264869845737, "loss": 1.0948, "step": 1368 }, { "epoch": 0.1888406096972205, "grad_norm": 0.657414972782135, "learning_rate": 0.0001981298651338368, "loss": 0.3891, "step": 1369 }, { "epoch": 0.18897855024484447, "grad_norm": 0.8079352974891663, "learning_rate": 0.0001981270795156851, "loss": 0.5913, "step": 1370 }, { "epoch": 0.18911649079246845, "grad_norm": 0.9968591928482056, "learning_rate": 0.00019812429184406063, "loss": 0.3255, "step": 1371 }, { "epoch": 0.18925443134009243, "grad_norm": 0.599799394607544, "learning_rate": 0.00019812150211902165, "loss": 0.4604, "step": 1372 }, { "epoch": 0.1893923718877164, "grad_norm": 1.048561453819275, "learning_rate": 0.00019811871034062663, "loss": 1.1433, "step": 1373 }, { "epoch": 0.18953031243534038, "grad_norm": 0.6983380913734436, "learning_rate": 0.00019811591650893394, "loss": 0.8263, "step": 1374 }, { "epoch": 0.18966825298296433, "grad_norm": 0.6309385895729065, "learning_rate": 0.00019811312062400203, "loss": 0.6981, "step": 1375 }, { "epoch": 0.1898061935305883, "grad_norm": 0.8139154314994812, "learning_rate": 0.00019811032268588944, "loss": 0.5451, "step": 1376 }, { "epoch": 0.18994413407821228, "grad_norm": 6.198182582855225, "learning_rate": 0.00019810752269465472, "loss": 1.4244, "step": 1377 }, { "epoch": 0.19008207462583626, "grad_norm": 0.8092629313468933, "learning_rate": 0.00019810472065035645, "loss": 0.8775, "step": 1378 }, { "epoch": 0.19022001517346024, "grad_norm": 0.49326083064079285, "learning_rate": 0.00019810191655305327, "loss": 0.3639, "step": 1379 }, { "epoch": 0.19035795572108422, "grad_norm": 0.56129390001297, "learning_rate": 0.00019809911040280384, "loss": 0.7192, "step": 1380 }, { "epoch": 0.1904958962687082, "grad_norm": 1.1733331680297852, "learning_rate": 0.00019809630219966694, "loss": 0.9176, "step": 1381 }, { "epoch": 0.19063383681633217, "grad_norm": 0.7035924196243286, "learning_rate": 0.00019809349194370129, "loss": 0.4331, "step": 1382 }, { "epoch": 0.19077177736395615, "grad_norm": 0.5710757374763489, "learning_rate": 0.0001980906796349657, "loss": 0.2989, "step": 1383 }, { "epoch": 0.1909097179115801, "grad_norm": 0.5652586817741394, "learning_rate": 0.00019808786527351907, "loss": 0.6384, "step": 1384 }, { "epoch": 0.19104765845920407, "grad_norm": 1.3671326637268066, "learning_rate": 0.00019808504885942022, "loss": 0.7014, "step": 1385 }, { "epoch": 0.19118559900682805, "grad_norm": 0.5562953948974609, "learning_rate": 0.00019808223039272815, "loss": 0.4959, "step": 1386 }, { "epoch": 0.19132353955445203, "grad_norm": 0.6569441556930542, "learning_rate": 0.00019807940987350185, "loss": 0.5137, "step": 1387 }, { "epoch": 0.191461480102076, "grad_norm": 1.103259563446045, "learning_rate": 0.00019807658730180028, "loss": 0.6768, "step": 1388 }, { "epoch": 0.19159942064969998, "grad_norm": 0.5823691487312317, "learning_rate": 0.0001980737626776825, "loss": 0.6531, "step": 1389 }, { "epoch": 0.19173736119732396, "grad_norm": 0.6827458739280701, "learning_rate": 0.00019807093600120773, "loss": 0.7069, "step": 1390 }, { "epoch": 0.19187530174494793, "grad_norm": 0.6564488410949707, "learning_rate": 0.00019806810727243502, "loss": 0.6042, "step": 1391 }, { "epoch": 0.1920132422925719, "grad_norm": 0.7399798631668091, "learning_rate": 0.00019806527649142362, "loss": 0.7239, "step": 1392 }, { "epoch": 0.1921511828401959, "grad_norm": 0.9162158370018005, "learning_rate": 0.0001980624436582327, "loss": 0.6947, "step": 1393 }, { "epoch": 0.19228912338781984, "grad_norm": 0.7776380181312561, "learning_rate": 0.00019805960877292164, "loss": 0.74, "step": 1394 }, { "epoch": 0.19242706393544382, "grad_norm": 1.0710697174072266, "learning_rate": 0.00019805677183554966, "loss": 1.1499, "step": 1395 }, { "epoch": 0.1925650044830678, "grad_norm": 0.7296781539916992, "learning_rate": 0.00019805393284617624, "loss": 0.4905, "step": 1396 }, { "epoch": 0.19270294503069177, "grad_norm": 0.5105857253074646, "learning_rate": 0.00019805109180486073, "loss": 0.5242, "step": 1397 }, { "epoch": 0.19284088557831575, "grad_norm": 0.7485494613647461, "learning_rate": 0.00019804824871166255, "loss": 0.9468, "step": 1398 }, { "epoch": 0.19297882612593972, "grad_norm": 0.6187512278556824, "learning_rate": 0.00019804540356664125, "loss": 0.4783, "step": 1399 }, { "epoch": 0.1931167666735637, "grad_norm": 0.775763750076294, "learning_rate": 0.00019804255636985633, "loss": 0.3514, "step": 1400 }, { "epoch": 0.1931167666735637, "eval_loss": 0.6971297860145569, "eval_runtime": 23.4976, "eval_samples_per_second": 2.511, "eval_steps_per_second": 2.511, "step": 1400 }, { "epoch": 0.19325470722118768, "grad_norm": 0.8662362098693848, "learning_rate": 0.0001980397071213674, "loss": 0.9185, "step": 1401 }, { "epoch": 0.19339264776881165, "grad_norm": 0.4343247711658478, "learning_rate": 0.00019803685582123412, "loss": 0.3892, "step": 1402 }, { "epoch": 0.1935305883164356, "grad_norm": 0.6295523047447205, "learning_rate": 0.00019803400246951606, "loss": 0.478, "step": 1403 }, { "epoch": 0.19366852886405958, "grad_norm": 0.6570201516151428, "learning_rate": 0.00019803114706627305, "loss": 0.478, "step": 1404 }, { "epoch": 0.19380646941168356, "grad_norm": 0.7862534523010254, "learning_rate": 0.00019802828961156473, "loss": 0.6387, "step": 1405 }, { "epoch": 0.19394440995930753, "grad_norm": 0.6784535050392151, "learning_rate": 0.00019802543010545099, "loss": 0.6172, "step": 1406 }, { "epoch": 0.1940823505069315, "grad_norm": 0.9805498719215393, "learning_rate": 0.0001980225685479916, "loss": 0.7294, "step": 1407 }, { "epoch": 0.1942202910545555, "grad_norm": 0.5521594882011414, "learning_rate": 0.00019801970493924648, "loss": 0.5163, "step": 1408 }, { "epoch": 0.19435823160217947, "grad_norm": 0.7201843857765198, "learning_rate": 0.00019801683927927558, "loss": 0.4305, "step": 1409 }, { "epoch": 0.19449617214980344, "grad_norm": 0.9199146628379822, "learning_rate": 0.0001980139715681388, "loss": 0.7673, "step": 1410 }, { "epoch": 0.19463411269742742, "grad_norm": 0.603888213634491, "learning_rate": 0.0001980111018058962, "loss": 0.6015, "step": 1411 }, { "epoch": 0.1947720532450514, "grad_norm": 0.7373432517051697, "learning_rate": 0.0001980082299926078, "loss": 0.6025, "step": 1412 }, { "epoch": 0.19490999379267535, "grad_norm": 0.598432719707489, "learning_rate": 0.00019800535612833376, "loss": 0.4569, "step": 1413 }, { "epoch": 0.19504793434029932, "grad_norm": 0.6446057558059692, "learning_rate": 0.00019800248021313417, "loss": 0.6147, "step": 1414 }, { "epoch": 0.1951858748879233, "grad_norm": 1.0996989011764526, "learning_rate": 0.00019799960224706922, "loss": 0.8579, "step": 1415 }, { "epoch": 0.19532381543554728, "grad_norm": 0.634935736656189, "learning_rate": 0.00019799672223019913, "loss": 0.857, "step": 1416 }, { "epoch": 0.19546175598317125, "grad_norm": 0.8313598036766052, "learning_rate": 0.0001979938401625842, "loss": 0.5807, "step": 1417 }, { "epoch": 0.19559969653079523, "grad_norm": 0.5541883707046509, "learning_rate": 0.0001979909560442847, "loss": 0.4145, "step": 1418 }, { "epoch": 0.1957376370784192, "grad_norm": 0.7889317274093628, "learning_rate": 0.00019798806987536102, "loss": 0.3321, "step": 1419 }, { "epoch": 0.19587557762604318, "grad_norm": 0.5884930491447449, "learning_rate": 0.00019798518165587355, "loss": 0.6436, "step": 1420 }, { "epoch": 0.19601351817366716, "grad_norm": 0.4990857243537903, "learning_rate": 0.00019798229138588274, "loss": 0.3125, "step": 1421 }, { "epoch": 0.1961514587212911, "grad_norm": 0.6287117600440979, "learning_rate": 0.00019797939906544902, "loss": 0.7401, "step": 1422 }, { "epoch": 0.1962893992689151, "grad_norm": 0.6580937504768372, "learning_rate": 0.00019797650469463297, "loss": 0.3514, "step": 1423 }, { "epoch": 0.19642733981653906, "grad_norm": 0.8169111609458923, "learning_rate": 0.00019797360827349515, "loss": 0.5915, "step": 1424 }, { "epoch": 0.19656528036416304, "grad_norm": 0.5426296591758728, "learning_rate": 0.0001979707098020962, "loss": 0.552, "step": 1425 }, { "epoch": 0.19670322091178702, "grad_norm": 0.7340196371078491, "learning_rate": 0.00019796780928049669, "loss": 0.7031, "step": 1426 }, { "epoch": 0.196841161459411, "grad_norm": 1.1579564809799194, "learning_rate": 0.0001979649067087574, "loss": 0.6673, "step": 1427 }, { "epoch": 0.19697910200703497, "grad_norm": 0.7036253213882446, "learning_rate": 0.00019796200208693906, "loss": 0.9771, "step": 1428 }, { "epoch": 0.19711704255465895, "grad_norm": 0.7708396315574646, "learning_rate": 0.0001979590954151024, "loss": 0.8293, "step": 1429 }, { "epoch": 0.19725498310228293, "grad_norm": 0.970969557762146, "learning_rate": 0.0001979561866933083, "loss": 0.6898, "step": 1430 }, { "epoch": 0.1973929236499069, "grad_norm": 0.4992259442806244, "learning_rate": 0.00019795327592161762, "loss": 0.3729, "step": 1431 }, { "epoch": 0.19753086419753085, "grad_norm": 0.5976670980453491, "learning_rate": 0.00019795036310009124, "loss": 0.6637, "step": 1432 }, { "epoch": 0.19766880474515483, "grad_norm": 1.2521162033081055, "learning_rate": 0.00019794744822879018, "loss": 0.4687, "step": 1433 }, { "epoch": 0.1978067452927788, "grad_norm": 0.578628659248352, "learning_rate": 0.00019794453130777536, "loss": 0.3506, "step": 1434 }, { "epoch": 0.19794468584040278, "grad_norm": 0.5497108101844788, "learning_rate": 0.0001979416123371079, "loss": 0.4068, "step": 1435 }, { "epoch": 0.19808262638802676, "grad_norm": 0.6828850507736206, "learning_rate": 0.00019793869131684883, "loss": 0.646, "step": 1436 }, { "epoch": 0.19822056693565074, "grad_norm": 0.7199423313140869, "learning_rate": 0.00019793576824705928, "loss": 0.8331, "step": 1437 }, { "epoch": 0.19835850748327472, "grad_norm": 0.6055320501327515, "learning_rate": 0.00019793284312780046, "loss": 0.5686, "step": 1438 }, { "epoch": 0.1984964480308987, "grad_norm": 0.7404488325119019, "learning_rate": 0.00019792991595913355, "loss": 0.7525, "step": 1439 }, { "epoch": 0.19863438857852267, "grad_norm": 0.5744838714599609, "learning_rate": 0.0001979269867411198, "loss": 0.4627, "step": 1440 }, { "epoch": 0.19877232912614662, "grad_norm": 0.5375235676765442, "learning_rate": 0.0001979240554738205, "loss": 0.4692, "step": 1441 }, { "epoch": 0.1989102696737706, "grad_norm": 0.790148913860321, "learning_rate": 0.00019792112215729705, "loss": 0.6927, "step": 1442 }, { "epoch": 0.19904821022139457, "grad_norm": 0.625100314617157, "learning_rate": 0.0001979181867916108, "loss": 0.422, "step": 1443 }, { "epoch": 0.19918615076901855, "grad_norm": 0.6314361691474915, "learning_rate": 0.00019791524937682318, "loss": 0.621, "step": 1444 }, { "epoch": 0.19932409131664253, "grad_norm": 0.5800315141677856, "learning_rate": 0.00019791230991299564, "loss": 0.5747, "step": 1445 }, { "epoch": 0.1994620318642665, "grad_norm": 0.7787585258483887, "learning_rate": 0.00019790936840018969, "loss": 0.5767, "step": 1446 }, { "epoch": 0.19959997241189048, "grad_norm": 0.6373497843742371, "learning_rate": 0.00019790642483846695, "loss": 0.5257, "step": 1447 }, { "epoch": 0.19973791295951446, "grad_norm": 1.4769777059555054, "learning_rate": 0.00019790347922788893, "loss": 0.9494, "step": 1448 }, { "epoch": 0.19987585350713843, "grad_norm": 0.6758845448493958, "learning_rate": 0.00019790053156851732, "loss": 0.5581, "step": 1449 }, { "epoch": 0.20001379405476238, "grad_norm": 0.7243471145629883, "learning_rate": 0.00019789758186041382, "loss": 0.6731, "step": 1450 }, { "epoch": 0.20015173460238636, "grad_norm": 1.2024186849594116, "learning_rate": 0.00019789463010364014, "loss": 0.8144, "step": 1451 }, { "epoch": 0.20028967515001034, "grad_norm": 0.8731943368911743, "learning_rate": 0.00019789167629825804, "loss": 0.4002, "step": 1452 }, { "epoch": 0.20042761569763431, "grad_norm": 0.7492139339447021, "learning_rate": 0.00019788872044432934, "loss": 0.3735, "step": 1453 }, { "epoch": 0.2005655562452583, "grad_norm": 0.6955728530883789, "learning_rate": 0.0001978857625419159, "loss": 0.5902, "step": 1454 }, { "epoch": 0.20070349679288227, "grad_norm": 0.7084043622016907, "learning_rate": 0.00019788280259107962, "loss": 0.6807, "step": 1455 }, { "epoch": 0.20084143734050625, "grad_norm": 0.8103449940681458, "learning_rate": 0.00019787984059188243, "loss": 0.603, "step": 1456 }, { "epoch": 0.20097937788813022, "grad_norm": 0.6231652498245239, "learning_rate": 0.00019787687654438632, "loss": 0.5323, "step": 1457 }, { "epoch": 0.2011173184357542, "grad_norm": 0.8590201735496521, "learning_rate": 0.00019787391044865334, "loss": 0.926, "step": 1458 }, { "epoch": 0.20125525898337818, "grad_norm": 0.9764631986618042, "learning_rate": 0.00019787094230474552, "loss": 0.6645, "step": 1459 }, { "epoch": 0.20139319953100213, "grad_norm": 0.6858428716659546, "learning_rate": 0.00019786797211272502, "loss": 0.6053, "step": 1460 }, { "epoch": 0.2015311400786261, "grad_norm": 0.7849305868148804, "learning_rate": 0.00019786499987265393, "loss": 0.5137, "step": 1461 }, { "epoch": 0.20166908062625008, "grad_norm": 0.5999508500099182, "learning_rate": 0.0001978620255845945, "loss": 0.4905, "step": 1462 }, { "epoch": 0.20180702117387406, "grad_norm": 0.8393694758415222, "learning_rate": 0.00019785904924860898, "loss": 0.3715, "step": 1463 }, { "epoch": 0.20194496172149803, "grad_norm": 0.5062437653541565, "learning_rate": 0.00019785607086475964, "loss": 0.3396, "step": 1464 }, { "epoch": 0.202082902269122, "grad_norm": 0.8367435932159424, "learning_rate": 0.00019785309043310881, "loss": 1.0617, "step": 1465 }, { "epoch": 0.202220842816746, "grad_norm": 0.8095563054084778, "learning_rate": 0.00019785010795371884, "loss": 0.4996, "step": 1466 }, { "epoch": 0.20235878336436997, "grad_norm": 0.7065717577934265, "learning_rate": 0.00019784712342665214, "loss": 0.8014, "step": 1467 }, { "epoch": 0.20249672391199394, "grad_norm": 0.6713412404060364, "learning_rate": 0.00019784413685197124, "loss": 0.595, "step": 1468 }, { "epoch": 0.2026346644596179, "grad_norm": 0.6227356791496277, "learning_rate": 0.00019784114822973852, "loss": 0.5142, "step": 1469 }, { "epoch": 0.20277260500724187, "grad_norm": 0.6739059090614319, "learning_rate": 0.00019783815756001661, "loss": 0.8218, "step": 1470 }, { "epoch": 0.20291054555486585, "grad_norm": 1.0218162536621094, "learning_rate": 0.00019783516484286808, "loss": 0.6881, "step": 1471 }, { "epoch": 0.20304848610248982, "grad_norm": 0.749244749546051, "learning_rate": 0.00019783217007835556, "loss": 0.5561, "step": 1472 }, { "epoch": 0.2031864266501138, "grad_norm": 1.5935417413711548, "learning_rate": 0.00019782917326654166, "loss": 0.8062, "step": 1473 }, { "epoch": 0.20332436719773778, "grad_norm": 0.5497024059295654, "learning_rate": 0.0001978261744074892, "loss": 0.2932, "step": 1474 }, { "epoch": 0.20346230774536175, "grad_norm": 0.8250129222869873, "learning_rate": 0.00019782317350126082, "loss": 0.7616, "step": 1475 }, { "epoch": 0.20360024829298573, "grad_norm": 0.5339081287384033, "learning_rate": 0.0001978201705479194, "loss": 0.3876, "step": 1476 }, { "epoch": 0.2037381888406097, "grad_norm": 0.4277068078517914, "learning_rate": 0.0001978171655475278, "loss": 0.3837, "step": 1477 }, { "epoch": 0.20387612938823368, "grad_norm": 0.5383226871490479, "learning_rate": 0.0001978141585001488, "loss": 0.508, "step": 1478 }, { "epoch": 0.20401406993585763, "grad_norm": 0.9465408325195312, "learning_rate": 0.00019781114940584544, "loss": 0.4469, "step": 1479 }, { "epoch": 0.2041520104834816, "grad_norm": 0.5293963551521301, "learning_rate": 0.0001978081382646806, "loss": 0.3079, "step": 1480 }, { "epoch": 0.2042899510311056, "grad_norm": 0.5232430696487427, "learning_rate": 0.00019780512507671736, "loss": 0.3975, "step": 1481 }, { "epoch": 0.20442789157872956, "grad_norm": 0.7306784391403198, "learning_rate": 0.00019780210984201874, "loss": 1.1105, "step": 1482 }, { "epoch": 0.20456583212635354, "grad_norm": 0.6948933005332947, "learning_rate": 0.00019779909256064786, "loss": 0.6559, "step": 1483 }, { "epoch": 0.20470377267397752, "grad_norm": 0.859988808631897, "learning_rate": 0.00019779607323266784, "loss": 0.3701, "step": 1484 }, { "epoch": 0.2048417132216015, "grad_norm": 0.42747312784194946, "learning_rate": 0.00019779305185814187, "loss": 0.4033, "step": 1485 }, { "epoch": 0.20497965376922547, "grad_norm": 0.6109987497329712, "learning_rate": 0.0001977900284371332, "loss": 0.6567, "step": 1486 }, { "epoch": 0.20511759431684945, "grad_norm": 0.9378513693809509, "learning_rate": 0.00019778700296970507, "loss": 0.505, "step": 1487 }, { "epoch": 0.2052555348644734, "grad_norm": 0.6264609694480896, "learning_rate": 0.0001977839754559208, "loss": 0.6342, "step": 1488 }, { "epoch": 0.20539347541209738, "grad_norm": 1.1089897155761719, "learning_rate": 0.00019778094589584377, "loss": 0.6905, "step": 1489 }, { "epoch": 0.20553141595972135, "grad_norm": 0.963965117931366, "learning_rate": 0.00019777791428953734, "loss": 1.0408, "step": 1490 }, { "epoch": 0.20566935650734533, "grad_norm": 0.650866687297821, "learning_rate": 0.000197774880637065, "loss": 0.4537, "step": 1491 }, { "epoch": 0.2058072970549693, "grad_norm": 0.702157199382782, "learning_rate": 0.00019777184493849017, "loss": 0.4833, "step": 1492 }, { "epoch": 0.20594523760259328, "grad_norm": 0.5543553233146667, "learning_rate": 0.00019776880719387643, "loss": 0.5947, "step": 1493 }, { "epoch": 0.20608317815021726, "grad_norm": 0.6553021669387817, "learning_rate": 0.00019776576740328735, "loss": 0.366, "step": 1494 }, { "epoch": 0.20622111869784124, "grad_norm": 0.6664499044418335, "learning_rate": 0.00019776272556678648, "loss": 0.5255, "step": 1495 }, { "epoch": 0.20635905924546522, "grad_norm": 1.020633339881897, "learning_rate": 0.00019775968168443754, "loss": 0.6729, "step": 1496 }, { "epoch": 0.2064969997930892, "grad_norm": 0.6680289506912231, "learning_rate": 0.00019775663575630423, "loss": 0.5426, "step": 1497 }, { "epoch": 0.20663494034071314, "grad_norm": 1.2508795261383057, "learning_rate": 0.00019775358778245026, "loss": 0.493, "step": 1498 }, { "epoch": 0.20677288088833712, "grad_norm": 0.7222293019294739, "learning_rate": 0.00019775053776293939, "loss": 0.4108, "step": 1499 }, { "epoch": 0.2069108214359611, "grad_norm": 0.5310907959938049, "learning_rate": 0.0001977474856978355, "loss": 0.3916, "step": 1500 }, { "epoch": 0.20704876198358507, "grad_norm": 0.5898303985595703, "learning_rate": 0.0001977444315872025, "loss": 0.5454, "step": 1501 }, { "epoch": 0.20718670253120905, "grad_norm": 0.5404325723648071, "learning_rate": 0.00019774137543110415, "loss": 0.5323, "step": 1502 }, { "epoch": 0.20732464307883303, "grad_norm": 0.5166662335395813, "learning_rate": 0.00019773831722960457, "loss": 0.474, "step": 1503 }, { "epoch": 0.207462583626457, "grad_norm": 0.47797584533691406, "learning_rate": 0.00019773525698276766, "loss": 0.5979, "step": 1504 }, { "epoch": 0.20760052417408098, "grad_norm": 0.7770309448242188, "learning_rate": 0.00019773219469065745, "loss": 0.7688, "step": 1505 }, { "epoch": 0.20773846472170496, "grad_norm": 1.0345587730407715, "learning_rate": 0.0001977291303533381, "loss": 0.2656, "step": 1506 }, { "epoch": 0.2078764052693289, "grad_norm": 1.0005046129226685, "learning_rate": 0.0001977260639708737, "loss": 0.4979, "step": 1507 }, { "epoch": 0.20801434581695288, "grad_norm": 0.9885176420211792, "learning_rate": 0.0001977229955433284, "loss": 0.4243, "step": 1508 }, { "epoch": 0.20815228636457686, "grad_norm": 1.0221974849700928, "learning_rate": 0.00019771992507076642, "loss": 0.5926, "step": 1509 }, { "epoch": 0.20829022691220084, "grad_norm": 0.6053209900856018, "learning_rate": 0.00019771685255325202, "loss": 0.4508, "step": 1510 }, { "epoch": 0.20842816745982481, "grad_norm": 0.55621737241745, "learning_rate": 0.0001977137779908495, "loss": 0.3533, "step": 1511 }, { "epoch": 0.2085661080074488, "grad_norm": 0.6023468971252441, "learning_rate": 0.00019771070138362325, "loss": 0.2508, "step": 1512 }, { "epoch": 0.20870404855507277, "grad_norm": 0.9730260968208313, "learning_rate": 0.00019770762273163753, "loss": 0.8603, "step": 1513 }, { "epoch": 0.20884198910269675, "grad_norm": 0.748810887336731, "learning_rate": 0.0001977045420349569, "loss": 0.3751, "step": 1514 }, { "epoch": 0.20897992965032072, "grad_norm": 0.7026892304420471, "learning_rate": 0.00019770145929364573, "loss": 0.5308, "step": 1515 }, { "epoch": 0.2091178701979447, "grad_norm": 0.681950569152832, "learning_rate": 0.0001976983745077686, "loss": 0.5067, "step": 1516 }, { "epoch": 0.20925581074556865, "grad_norm": 0.5148996710777283, "learning_rate": 0.00019769528767739, "loss": 0.2771, "step": 1517 }, { "epoch": 0.20939375129319263, "grad_norm": 0.8231521844863892, "learning_rate": 0.0001976921988025746, "loss": 0.6476, "step": 1518 }, { "epoch": 0.2095316918408166, "grad_norm": 0.7600322365760803, "learning_rate": 0.00019768910788338695, "loss": 0.5764, "step": 1519 }, { "epoch": 0.20966963238844058, "grad_norm": 0.587195873260498, "learning_rate": 0.00019768601491989182, "loss": 0.4479, "step": 1520 }, { "epoch": 0.20980757293606456, "grad_norm": 1.1260074377059937, "learning_rate": 0.00019768291991215388, "loss": 0.5687, "step": 1521 }, { "epoch": 0.20994551348368853, "grad_norm": 0.4878457188606262, "learning_rate": 0.00019767982286023793, "loss": 0.3647, "step": 1522 }, { "epoch": 0.2100834540313125, "grad_norm": 0.5540122389793396, "learning_rate": 0.00019767672376420879, "loss": 0.4167, "step": 1523 }, { "epoch": 0.2102213945789365, "grad_norm": 0.7711905241012573, "learning_rate": 0.00019767362262413125, "loss": 0.6219, "step": 1524 }, { "epoch": 0.21035933512656046, "grad_norm": 0.7483119964599609, "learning_rate": 0.00019767051944007027, "loss": 0.6464, "step": 1525 }, { "epoch": 0.21049727567418441, "grad_norm": 0.9003910422325134, "learning_rate": 0.00019766741421209076, "loss": 0.7722, "step": 1526 }, { "epoch": 0.2106352162218084, "grad_norm": 0.567668080329895, "learning_rate": 0.00019766430694025773, "loss": 0.4479, "step": 1527 }, { "epoch": 0.21077315676943237, "grad_norm": 1.163555383682251, "learning_rate": 0.00019766119762463616, "loss": 0.9576, "step": 1528 }, { "epoch": 0.21091109731705635, "grad_norm": 0.9036771059036255, "learning_rate": 0.00019765808626529113, "loss": 0.5609, "step": 1529 }, { "epoch": 0.21104903786468032, "grad_norm": 1.0214260816574097, "learning_rate": 0.00019765497286228781, "loss": 0.7341, "step": 1530 }, { "epoch": 0.2111869784123043, "grad_norm": 0.55262690782547, "learning_rate": 0.00019765185741569126, "loss": 0.4457, "step": 1531 }, { "epoch": 0.21132491895992828, "grad_norm": 0.7800946831703186, "learning_rate": 0.00019764873992556673, "loss": 0.5666, "step": 1532 }, { "epoch": 0.21146285950755225, "grad_norm": 0.6976976990699768, "learning_rate": 0.00019764562039197947, "loss": 0.4865, "step": 1533 }, { "epoch": 0.21160080005517623, "grad_norm": 0.5879265069961548, "learning_rate": 0.0001976424988149947, "loss": 0.3241, "step": 1534 }, { "epoch": 0.21173874060280018, "grad_norm": 0.7656161785125732, "learning_rate": 0.00019763937519467784, "loss": 1.0343, "step": 1535 }, { "epoch": 0.21187668115042416, "grad_norm": 1.35551917552948, "learning_rate": 0.00019763624953109417, "loss": 0.3279, "step": 1536 }, { "epoch": 0.21201462169804813, "grad_norm": 0.7385879158973694, "learning_rate": 0.00019763312182430914, "loss": 0.5431, "step": 1537 }, { "epoch": 0.2121525622456721, "grad_norm": 0.9399107694625854, "learning_rate": 0.0001976299920743882, "loss": 0.909, "step": 1538 }, { "epoch": 0.2122905027932961, "grad_norm": 1.1988216638565063, "learning_rate": 0.00019762686028139683, "loss": 0.3251, "step": 1539 }, { "epoch": 0.21242844334092006, "grad_norm": 0.537767231464386, "learning_rate": 0.00019762372644540057, "loss": 0.465, "step": 1540 }, { "epoch": 0.21256638388854404, "grad_norm": 0.7680913209915161, "learning_rate": 0.00019762059056646503, "loss": 0.5476, "step": 1541 }, { "epoch": 0.21270432443616802, "grad_norm": 0.6851320862770081, "learning_rate": 0.0001976174526446558, "loss": 0.4175, "step": 1542 }, { "epoch": 0.212842264983792, "grad_norm": 0.8495534658432007, "learning_rate": 0.00019761431268003858, "loss": 0.6096, "step": 1543 }, { "epoch": 0.21298020553141597, "grad_norm": 0.84898442029953, "learning_rate": 0.00019761117067267905, "loss": 1.0083, "step": 1544 }, { "epoch": 0.21311814607903992, "grad_norm": 1.312572717666626, "learning_rate": 0.00019760802662264294, "loss": 0.5228, "step": 1545 }, { "epoch": 0.2132560866266639, "grad_norm": 0.5963274240493774, "learning_rate": 0.0001976048805299961, "loss": 0.7231, "step": 1546 }, { "epoch": 0.21339402717428788, "grad_norm": 0.9802027940750122, "learning_rate": 0.00019760173239480433, "loss": 0.3235, "step": 1547 }, { "epoch": 0.21353196772191185, "grad_norm": 0.514875054359436, "learning_rate": 0.00019759858221713356, "loss": 0.532, "step": 1548 }, { "epoch": 0.21366990826953583, "grad_norm": 1.1341639757156372, "learning_rate": 0.00019759542999704964, "loss": 0.8835, "step": 1549 }, { "epoch": 0.2138078488171598, "grad_norm": 0.7439461350440979, "learning_rate": 0.00019759227573461856, "loss": 0.4452, "step": 1550 }, { "epoch": 0.21394578936478378, "grad_norm": 0.8558647632598877, "learning_rate": 0.00019758911942990633, "loss": 0.7188, "step": 1551 }, { "epoch": 0.21408372991240776, "grad_norm": 0.6747344136238098, "learning_rate": 0.00019758596108297903, "loss": 0.609, "step": 1552 }, { "epoch": 0.21422167046003174, "grad_norm": 1.0071959495544434, "learning_rate": 0.00019758280069390275, "loss": 0.7166, "step": 1553 }, { "epoch": 0.2143596110076557, "grad_norm": 0.6133373379707336, "learning_rate": 0.00019757963826274357, "loss": 0.3146, "step": 1554 }, { "epoch": 0.21449755155527966, "grad_norm": 0.625426173210144, "learning_rate": 0.00019757647378956775, "loss": 0.4699, "step": 1555 }, { "epoch": 0.21463549210290364, "grad_norm": 1.1678266525268555, "learning_rate": 0.00019757330727444142, "loss": 0.5549, "step": 1556 }, { "epoch": 0.21477343265052762, "grad_norm": 0.5959186553955078, "learning_rate": 0.0001975701387174309, "loss": 0.3031, "step": 1557 }, { "epoch": 0.2149113731981516, "grad_norm": 0.6856188774108887, "learning_rate": 0.00019756696811860252, "loss": 0.4353, "step": 1558 }, { "epoch": 0.21504931374577557, "grad_norm": 0.6395816206932068, "learning_rate": 0.00019756379547802258, "loss": 0.5386, "step": 1559 }, { "epoch": 0.21518725429339955, "grad_norm": 0.6361191272735596, "learning_rate": 0.00019756062079575752, "loss": 0.4931, "step": 1560 }, { "epoch": 0.21532519484102353, "grad_norm": 0.6858872175216675, "learning_rate": 0.00019755744407187376, "loss": 0.4928, "step": 1561 }, { "epoch": 0.2154631353886475, "grad_norm": 0.827880859375, "learning_rate": 0.00019755426530643772, "loss": 0.5443, "step": 1562 }, { "epoch": 0.21560107593627148, "grad_norm": 0.7604259252548218, "learning_rate": 0.000197551084499516, "loss": 0.6041, "step": 1563 }, { "epoch": 0.21573901648389543, "grad_norm": 0.8052539825439453, "learning_rate": 0.00019754790165117512, "loss": 0.5421, "step": 1564 }, { "epoch": 0.2158769570315194, "grad_norm": 0.9283300042152405, "learning_rate": 0.00019754471676148173, "loss": 0.5405, "step": 1565 }, { "epoch": 0.21601489757914338, "grad_norm": 0.9971221089363098, "learning_rate": 0.0001975415298305024, "loss": 0.7415, "step": 1566 }, { "epoch": 0.21615283812676736, "grad_norm": 0.7091429829597473, "learning_rate": 0.00019753834085830388, "loss": 0.7384, "step": 1567 }, { "epoch": 0.21629077867439134, "grad_norm": 0.8151398301124573, "learning_rate": 0.00019753514984495292, "loss": 0.8926, "step": 1568 }, { "epoch": 0.21642871922201531, "grad_norm": 0.7444909811019897, "learning_rate": 0.00019753195679051628, "loss": 0.8458, "step": 1569 }, { "epoch": 0.2165666597696393, "grad_norm": 0.5962792038917542, "learning_rate": 0.00019752876169506074, "loss": 0.501, "step": 1570 }, { "epoch": 0.21670460031726327, "grad_norm": 0.7183015942573547, "learning_rate": 0.0001975255645586532, "loss": 0.4636, "step": 1571 }, { "epoch": 0.21684254086488725, "grad_norm": 0.5104637742042542, "learning_rate": 0.00019752236538136058, "loss": 0.5038, "step": 1572 }, { "epoch": 0.2169804814125112, "grad_norm": 0.8308044672012329, "learning_rate": 0.0001975191641632498, "loss": 0.8071, "step": 1573 }, { "epoch": 0.21711842196013517, "grad_norm": 0.7244855761528015, "learning_rate": 0.00019751596090438787, "loss": 0.5333, "step": 1574 }, { "epoch": 0.21725636250775915, "grad_norm": 0.5203992128372192, "learning_rate": 0.00019751275560484177, "loss": 0.248, "step": 1575 }, { "epoch": 0.21739430305538313, "grad_norm": 0.8960958123207092, "learning_rate": 0.00019750954826467867, "loss": 0.4987, "step": 1576 }, { "epoch": 0.2175322436030071, "grad_norm": 0.6225624084472656, "learning_rate": 0.0001975063388839656, "loss": 0.3772, "step": 1577 }, { "epoch": 0.21767018415063108, "grad_norm": 0.9374969005584717, "learning_rate": 0.00019750312746276977, "loss": 0.7636, "step": 1578 }, { "epoch": 0.21780812469825506, "grad_norm": 0.59400874376297, "learning_rate": 0.00019749991400115838, "loss": 0.4365, "step": 1579 }, { "epoch": 0.21794606524587903, "grad_norm": 1.242187261581421, "learning_rate": 0.00019749669849919866, "loss": 0.458, "step": 1580 }, { "epoch": 0.218084005793503, "grad_norm": 0.7381463050842285, "learning_rate": 0.0001974934809569579, "loss": 0.5409, "step": 1581 }, { "epoch": 0.218221946341127, "grad_norm": 0.7372673153877258, "learning_rate": 0.0001974902613745035, "loss": 0.5081, "step": 1582 }, { "epoch": 0.21835988688875094, "grad_norm": 0.7406336665153503, "learning_rate": 0.0001974870397519027, "loss": 0.7379, "step": 1583 }, { "epoch": 0.21849782743637491, "grad_norm": 1.171842098236084, "learning_rate": 0.0001974838160892231, "loss": 0.918, "step": 1584 }, { "epoch": 0.2186357679839989, "grad_norm": 0.6384474635124207, "learning_rate": 0.00019748059038653194, "loss": 0.6122, "step": 1585 }, { "epoch": 0.21877370853162287, "grad_norm": 0.4516240656375885, "learning_rate": 0.00019747736264389692, "loss": 0.4121, "step": 1586 }, { "epoch": 0.21891164907924685, "grad_norm": 0.7692304253578186, "learning_rate": 0.00019747413286138547, "loss": 0.8502, "step": 1587 }, { "epoch": 0.21904958962687082, "grad_norm": 0.5636818408966064, "learning_rate": 0.00019747090103906524, "loss": 0.3508, "step": 1588 }, { "epoch": 0.2191875301744948, "grad_norm": 0.7130869030952454, "learning_rate": 0.0001974676671770038, "loss": 0.5638, "step": 1589 }, { "epoch": 0.21932547072211878, "grad_norm": 1.677419662475586, "learning_rate": 0.00019746443127526887, "loss": 1.0134, "step": 1590 }, { "epoch": 0.21946341126974275, "grad_norm": 1.5180758237838745, "learning_rate": 0.0001974611933339282, "loss": 1.0784, "step": 1591 }, { "epoch": 0.2196013518173667, "grad_norm": 0.6577237248420715, "learning_rate": 0.00019745795335304945, "loss": 0.7213, "step": 1592 }, { "epoch": 0.21973929236499068, "grad_norm": 0.7375402450561523, "learning_rate": 0.00019745471133270052, "loss": 0.7146, "step": 1593 }, { "epoch": 0.21987723291261466, "grad_norm": 0.6971603035926819, "learning_rate": 0.00019745146727294917, "loss": 0.5163, "step": 1594 }, { "epoch": 0.22001517346023863, "grad_norm": 0.8723885416984558, "learning_rate": 0.00019744822117386335, "loss": 0.8007, "step": 1595 }, { "epoch": 0.2201531140078626, "grad_norm": 0.9547663927078247, "learning_rate": 0.00019744497303551096, "loss": 0.7624, "step": 1596 }, { "epoch": 0.2202910545554866, "grad_norm": 0.5836478471755981, "learning_rate": 0.00019744172285795998, "loss": 0.5073, "step": 1597 }, { "epoch": 0.22042899510311056, "grad_norm": 0.7372373938560486, "learning_rate": 0.00019743847064127845, "loss": 0.5698, "step": 1598 }, { "epoch": 0.22056693565073454, "grad_norm": 0.7943384647369385, "learning_rate": 0.00019743521638553436, "loss": 0.4357, "step": 1599 }, { "epoch": 0.22070487619835852, "grad_norm": 0.7061851024627686, "learning_rate": 0.00019743196009079593, "loss": 0.7765, "step": 1600 }, { "epoch": 0.22070487619835852, "eval_loss": 0.6701160073280334, "eval_runtime": 23.4934, "eval_samples_per_second": 2.511, "eval_steps_per_second": 2.511, "step": 1600 }, { "epoch": 0.2208428167459825, "grad_norm": 0.8099935054779053, "learning_rate": 0.00019742870175713117, "loss": 0.7272, "step": 1601 }, { "epoch": 0.22098075729360644, "grad_norm": 0.6026046872138977, "learning_rate": 0.00019742544138460835, "loss": 0.405, "step": 1602 }, { "epoch": 0.22111869784123042, "grad_norm": 0.6762717366218567, "learning_rate": 0.00019742217897329568, "loss": 0.7754, "step": 1603 }, { "epoch": 0.2212566383888544, "grad_norm": 0.6282803416252136, "learning_rate": 0.0001974189145232614, "loss": 0.3972, "step": 1604 }, { "epoch": 0.22139457893647838, "grad_norm": 0.6936721205711365, "learning_rate": 0.00019741564803457386, "loss": 0.4259, "step": 1605 }, { "epoch": 0.22153251948410235, "grad_norm": 2.1612634658813477, "learning_rate": 0.00019741237950730143, "loss": 0.6188, "step": 1606 }, { "epoch": 0.22167046003172633, "grad_norm": 0.8184899091720581, "learning_rate": 0.00019740910894151245, "loss": 0.6147, "step": 1607 }, { "epoch": 0.2218084005793503, "grad_norm": 0.5751150846481323, "learning_rate": 0.00019740583633727542, "loss": 0.3876, "step": 1608 }, { "epoch": 0.22194634112697428, "grad_norm": 0.582902193069458, "learning_rate": 0.0001974025616946588, "loss": 0.3761, "step": 1609 }, { "epoch": 0.22208428167459826, "grad_norm": 0.9781992435455322, "learning_rate": 0.00019739928501373113, "loss": 0.9236, "step": 1610 }, { "epoch": 0.2222222222222222, "grad_norm": 0.6478968262672424, "learning_rate": 0.00019739600629456097, "loss": 0.4393, "step": 1611 }, { "epoch": 0.2223601627698462, "grad_norm": 0.6555303335189819, "learning_rate": 0.0001973927255372169, "loss": 0.7777, "step": 1612 }, { "epoch": 0.22249810331747016, "grad_norm": 1.2468045949935913, "learning_rate": 0.00019738944274176766, "loss": 0.6122, "step": 1613 }, { "epoch": 0.22263604386509414, "grad_norm": 1.0547524690628052, "learning_rate": 0.00019738615790828185, "loss": 0.6539, "step": 1614 }, { "epoch": 0.22277398441271812, "grad_norm": 0.7945654988288879, "learning_rate": 0.00019738287103682827, "loss": 0.7588, "step": 1615 }, { "epoch": 0.2229119249603421, "grad_norm": 0.7305927276611328, "learning_rate": 0.0001973795821274757, "loss": 0.2584, "step": 1616 }, { "epoch": 0.22304986550796607, "grad_norm": 1.2466413974761963, "learning_rate": 0.00019737629118029296, "loss": 0.8028, "step": 1617 }, { "epoch": 0.22318780605559005, "grad_norm": 0.8502911329269409, "learning_rate": 0.00019737299819534888, "loss": 0.4633, "step": 1618 }, { "epoch": 0.22332574660321403, "grad_norm": 1.0902429819107056, "learning_rate": 0.00019736970317271244, "loss": 0.3811, "step": 1619 }, { "epoch": 0.22346368715083798, "grad_norm": 0.9698812961578369, "learning_rate": 0.00019736640611245254, "loss": 0.7376, "step": 1620 }, { "epoch": 0.22360162769846195, "grad_norm": 0.6695729494094849, "learning_rate": 0.0001973631070146382, "loss": 0.647, "step": 1621 }, { "epoch": 0.22373956824608593, "grad_norm": 0.9130712747573853, "learning_rate": 0.0001973598058793385, "loss": 0.5384, "step": 1622 }, { "epoch": 0.2238775087937099, "grad_norm": 1.0346002578735352, "learning_rate": 0.00019735650270662244, "loss": 0.6423, "step": 1623 }, { "epoch": 0.22401544934133388, "grad_norm": 0.770313024520874, "learning_rate": 0.00019735319749655914, "loss": 0.9142, "step": 1624 }, { "epoch": 0.22415338988895786, "grad_norm": 0.7130119204521179, "learning_rate": 0.00019734989024921786, "loss": 0.6208, "step": 1625 }, { "epoch": 0.22429133043658184, "grad_norm": 0.6501592993736267, "learning_rate": 0.00019734658096466775, "loss": 0.9638, "step": 1626 }, { "epoch": 0.22442927098420581, "grad_norm": 0.5239920020103455, "learning_rate": 0.00019734326964297806, "loss": 0.217, "step": 1627 }, { "epoch": 0.2245672115318298, "grad_norm": 0.6280699372291565, "learning_rate": 0.00019733995628421811, "loss": 0.3711, "step": 1628 }, { "epoch": 0.22470515207945377, "grad_norm": 1.1753147840499878, "learning_rate": 0.0001973366408884572, "loss": 0.4814, "step": 1629 }, { "epoch": 0.22484309262707772, "grad_norm": 0.6587424278259277, "learning_rate": 0.00019733332345576475, "loss": 0.502, "step": 1630 }, { "epoch": 0.2249810331747017, "grad_norm": 0.5983806252479553, "learning_rate": 0.0001973300039862102, "loss": 0.4848, "step": 1631 }, { "epoch": 0.22511897372232567, "grad_norm": 0.5221776366233826, "learning_rate": 0.00019732668247986293, "loss": 0.4227, "step": 1632 }, { "epoch": 0.22525691426994965, "grad_norm": 0.7659717202186584, "learning_rate": 0.00019732335893679251, "loss": 0.6224, "step": 1633 }, { "epoch": 0.22539485481757363, "grad_norm": 1.1571626663208008, "learning_rate": 0.00019732003335706848, "loss": 0.7001, "step": 1634 }, { "epoch": 0.2255327953651976, "grad_norm": 1.23952317237854, "learning_rate": 0.00019731670574076043, "loss": 0.5385, "step": 1635 }, { "epoch": 0.22567073591282158, "grad_norm": 0.8199877142906189, "learning_rate": 0.00019731337608793804, "loss": 0.4246, "step": 1636 }, { "epoch": 0.22580867646044556, "grad_norm": 0.6410521864891052, "learning_rate": 0.0001973100443986709, "loss": 0.7775, "step": 1637 }, { "epoch": 0.22594661700806953, "grad_norm": 0.9207361340522766, "learning_rate": 0.00019730671067302876, "loss": 0.5826, "step": 1638 }, { "epoch": 0.22608455755569348, "grad_norm": 1.314924955368042, "learning_rate": 0.00019730337491108147, "loss": 1.0358, "step": 1639 }, { "epoch": 0.22622249810331746, "grad_norm": 0.9965962767601013, "learning_rate": 0.00019730003711289872, "loss": 0.6844, "step": 1640 }, { "epoch": 0.22636043865094144, "grad_norm": 0.7914469242095947, "learning_rate": 0.00019729669727855043, "loss": 0.6494, "step": 1641 }, { "epoch": 0.2264983791985654, "grad_norm": 0.6033255457878113, "learning_rate": 0.00019729335540810645, "loss": 0.6953, "step": 1642 }, { "epoch": 0.2266363197461894, "grad_norm": 1.0636558532714844, "learning_rate": 0.00019729001150163678, "loss": 0.4315, "step": 1643 }, { "epoch": 0.22677426029381337, "grad_norm": 0.6750774383544922, "learning_rate": 0.00019728666555921128, "loss": 0.5561, "step": 1644 }, { "epoch": 0.22691220084143734, "grad_norm": 0.6471349000930786, "learning_rate": 0.00019728331758090008, "loss": 0.5916, "step": 1645 }, { "epoch": 0.22705014138906132, "grad_norm": 0.795585572719574, "learning_rate": 0.00019727996756677322, "loss": 0.4582, "step": 1646 }, { "epoch": 0.2271880819366853, "grad_norm": 1.4341682195663452, "learning_rate": 0.00019727661551690074, "loss": 0.6804, "step": 1647 }, { "epoch": 0.22732602248430928, "grad_norm": 0.7439274787902832, "learning_rate": 0.00019727326143135286, "loss": 0.6335, "step": 1648 }, { "epoch": 0.22746396303193323, "grad_norm": 0.6036856770515442, "learning_rate": 0.00019726990531019974, "loss": 0.775, "step": 1649 }, { "epoch": 0.2276019035795572, "grad_norm": 0.8078903555870056, "learning_rate": 0.00019726654715351166, "loss": 0.5423, "step": 1650 }, { "epoch": 0.22773984412718118, "grad_norm": 0.6876071095466614, "learning_rate": 0.00019726318696135881, "loss": 0.61, "step": 1651 }, { "epoch": 0.22787778467480516, "grad_norm": 0.5452876687049866, "learning_rate": 0.00019725982473381155, "loss": 0.5557, "step": 1652 }, { "epoch": 0.22801572522242913, "grad_norm": 1.0623972415924072, "learning_rate": 0.00019725646047094025, "loss": 1.5374, "step": 1653 }, { "epoch": 0.2281536657700531, "grad_norm": 0.5268773436546326, "learning_rate": 0.0001972530941728153, "loss": 0.6776, "step": 1654 }, { "epoch": 0.2282916063176771, "grad_norm": 0.8703244924545288, "learning_rate": 0.00019724972583950713, "loss": 0.4816, "step": 1655 }, { "epoch": 0.22842954686530106, "grad_norm": 1.0654100179672241, "learning_rate": 0.00019724635547108626, "loss": 0.779, "step": 1656 }, { "epoch": 0.22856748741292504, "grad_norm": 4.7631072998046875, "learning_rate": 0.0001972429830676232, "loss": 0.3965, "step": 1657 }, { "epoch": 0.228705427960549, "grad_norm": 0.6532920002937317, "learning_rate": 0.0001972396086291885, "loss": 0.7204, "step": 1658 }, { "epoch": 0.22884336850817297, "grad_norm": 0.761963963508606, "learning_rate": 0.00019723623215585288, "loss": 0.4874, "step": 1659 }, { "epoch": 0.22898130905579694, "grad_norm": 0.786673367023468, "learning_rate": 0.00019723285364768684, "loss": 0.5179, "step": 1660 }, { "epoch": 0.22911924960342092, "grad_norm": 0.9895481467247009, "learning_rate": 0.0001972294731047612, "loss": 0.7026, "step": 1661 }, { "epoch": 0.2292571901510449, "grad_norm": 0.965067982673645, "learning_rate": 0.00019722609052714668, "loss": 0.9159, "step": 1662 }, { "epoch": 0.22939513069866888, "grad_norm": 0.7370869517326355, "learning_rate": 0.00019722270591491405, "loss": 0.5713, "step": 1663 }, { "epoch": 0.22953307124629285, "grad_norm": 0.6077229380607605, "learning_rate": 0.00019721931926813414, "loss": 0.8126, "step": 1664 }, { "epoch": 0.22967101179391683, "grad_norm": 0.9202563166618347, "learning_rate": 0.00019721593058687784, "loss": 0.6216, "step": 1665 }, { "epoch": 0.2298089523415408, "grad_norm": 0.7052692174911499, "learning_rate": 0.000197212539871216, "loss": 0.5423, "step": 1666 }, { "epoch": 0.22994689288916478, "grad_norm": 1.378886103630066, "learning_rate": 0.00019720914712121967, "loss": 1.1994, "step": 1667 }, { "epoch": 0.23008483343678873, "grad_norm": 0.834113359451294, "learning_rate": 0.0001972057523369598, "loss": 0.3369, "step": 1668 }, { "epoch": 0.2302227739844127, "grad_norm": 0.7306934595108032, "learning_rate": 0.0001972023555185074, "loss": 0.669, "step": 1669 }, { "epoch": 0.2303607145320367, "grad_norm": 0.924947202205658, "learning_rate": 0.0001971989566659336, "loss": 0.5974, "step": 1670 }, { "epoch": 0.23049865507966066, "grad_norm": 0.6231887936592102, "learning_rate": 0.00019719555577930954, "loss": 0.7273, "step": 1671 }, { "epoch": 0.23063659562728464, "grad_norm": 0.7231675982475281, "learning_rate": 0.00019719215285870636, "loss": 0.5177, "step": 1672 }, { "epoch": 0.23077453617490862, "grad_norm": 0.6720378994941711, "learning_rate": 0.00019718874790419525, "loss": 0.4229, "step": 1673 }, { "epoch": 0.2309124767225326, "grad_norm": 0.8965453505516052, "learning_rate": 0.0001971853409158475, "loss": 0.7835, "step": 1674 }, { "epoch": 0.23105041727015657, "grad_norm": 0.8259643316268921, "learning_rate": 0.00019718193189373443, "loss": 0.4734, "step": 1675 }, { "epoch": 0.23118835781778055, "grad_norm": 1.0671875476837158, "learning_rate": 0.00019717852083792729, "loss": 0.8342, "step": 1676 }, { "epoch": 0.2313262983654045, "grad_norm": 1.2469580173492432, "learning_rate": 0.00019717510774849755, "loss": 0.8646, "step": 1677 }, { "epoch": 0.23146423891302847, "grad_norm": 0.7264925241470337, "learning_rate": 0.0001971716926255166, "loss": 0.5659, "step": 1678 }, { "epoch": 0.23160217946065245, "grad_norm": 0.6982684135437012, "learning_rate": 0.00019716827546905588, "loss": 0.4057, "step": 1679 }, { "epoch": 0.23174012000827643, "grad_norm": 0.7751342058181763, "learning_rate": 0.00019716485627918696, "loss": 0.6077, "step": 1680 }, { "epoch": 0.2318780605559004, "grad_norm": 0.5583308339118958, "learning_rate": 0.0001971614350559814, "loss": 0.6444, "step": 1681 }, { "epoch": 0.23201600110352438, "grad_norm": 0.6357859969139099, "learning_rate": 0.0001971580117995107, "loss": 0.4583, "step": 1682 }, { "epoch": 0.23215394165114836, "grad_norm": 0.5525439381599426, "learning_rate": 0.00019715458650984658, "loss": 0.5348, "step": 1683 }, { "epoch": 0.23229188219877234, "grad_norm": 0.644550621509552, "learning_rate": 0.00019715115918706068, "loss": 0.4757, "step": 1684 }, { "epoch": 0.23242982274639631, "grad_norm": 0.6693544983863831, "learning_rate": 0.00019714772983122475, "loss": 0.7162, "step": 1685 }, { "epoch": 0.2325677632940203, "grad_norm": 0.9517789483070374, "learning_rate": 0.00019714429844241055, "loss": 0.4147, "step": 1686 }, { "epoch": 0.23270570384164424, "grad_norm": 0.8975241184234619, "learning_rate": 0.00019714086502068987, "loss": 0.4934, "step": 1687 }, { "epoch": 0.23284364438926822, "grad_norm": 0.9465944766998291, "learning_rate": 0.00019713742956613457, "loss": 0.4213, "step": 1688 }, { "epoch": 0.2329815849368922, "grad_norm": 0.7063184976577759, "learning_rate": 0.00019713399207881653, "loss": 0.3294, "step": 1689 }, { "epoch": 0.23311952548451617, "grad_norm": 0.9378213882446289, "learning_rate": 0.00019713055255880771, "loss": 0.6856, "step": 1690 }, { "epoch": 0.23325746603214015, "grad_norm": 0.8049567341804504, "learning_rate": 0.0001971271110061801, "loss": 0.7159, "step": 1691 }, { "epoch": 0.23339540657976413, "grad_norm": 0.7498889565467834, "learning_rate": 0.00019712366742100565, "loss": 1.4433, "step": 1692 }, { "epoch": 0.2335333471273881, "grad_norm": 0.9926367402076721, "learning_rate": 0.0001971202218033565, "loss": 0.6912, "step": 1693 }, { "epoch": 0.23367128767501208, "grad_norm": 0.7228891253471375, "learning_rate": 0.0001971167741533047, "loss": 0.3714, "step": 1694 }, { "epoch": 0.23380922822263606, "grad_norm": 0.6421617269515991, "learning_rate": 0.00019711332447092244, "loss": 0.4247, "step": 1695 }, { "epoch": 0.23394716877026, "grad_norm": 0.6030147671699524, "learning_rate": 0.00019710987275628188, "loss": 0.5216, "step": 1696 }, { "epoch": 0.23408510931788398, "grad_norm": 0.7749969363212585, "learning_rate": 0.00019710641900945526, "loss": 0.5579, "step": 1697 }, { "epoch": 0.23422304986550796, "grad_norm": 0.6434635519981384, "learning_rate": 0.00019710296323051488, "loss": 0.4522, "step": 1698 }, { "epoch": 0.23436099041313194, "grad_norm": 0.902519166469574, "learning_rate": 0.00019709950541953303, "loss": 0.7444, "step": 1699 }, { "epoch": 0.2344989309607559, "grad_norm": 0.7153852581977844, "learning_rate": 0.00019709604557658204, "loss": 0.4498, "step": 1700 }, { "epoch": 0.2346368715083799, "grad_norm": 0.817779541015625, "learning_rate": 0.0001970925837017344, "loss": 0.6874, "step": 1701 }, { "epoch": 0.23477481205600387, "grad_norm": 0.7079076766967773, "learning_rate": 0.00019708911979506247, "loss": 0.6127, "step": 1702 }, { "epoch": 0.23491275260362784, "grad_norm": 0.9277624487876892, "learning_rate": 0.0001970856538566388, "loss": 0.5379, "step": 1703 }, { "epoch": 0.23505069315125182, "grad_norm": 0.5891031622886658, "learning_rate": 0.0001970821858865359, "loss": 0.3578, "step": 1704 }, { "epoch": 0.23518863369887577, "grad_norm": 0.6753404140472412, "learning_rate": 0.00019707871588482633, "loss": 0.4398, "step": 1705 }, { "epoch": 0.23532657424649975, "grad_norm": 0.9718192219734192, "learning_rate": 0.0001970752438515827, "loss": 0.3373, "step": 1706 }, { "epoch": 0.23546451479412372, "grad_norm": 0.8236162662506104, "learning_rate": 0.0001970717697868777, "loss": 0.8217, "step": 1707 }, { "epoch": 0.2356024553417477, "grad_norm": 0.5888224244117737, "learning_rate": 0.00019706829369078404, "loss": 0.4586, "step": 1708 }, { "epoch": 0.23574039588937168, "grad_norm": 0.7651521563529968, "learning_rate": 0.0001970648155633744, "loss": 0.4987, "step": 1709 }, { "epoch": 0.23587833643699566, "grad_norm": 0.6184590458869934, "learning_rate": 0.00019706133540472158, "loss": 0.5298, "step": 1710 }, { "epoch": 0.23601627698461963, "grad_norm": 0.9709598422050476, "learning_rate": 0.00019705785321489847, "loss": 0.8537, "step": 1711 }, { "epoch": 0.2361542175322436, "grad_norm": 0.6483273506164551, "learning_rate": 0.0001970543689939779, "loss": 0.4203, "step": 1712 }, { "epoch": 0.2362921580798676, "grad_norm": 0.9784144163131714, "learning_rate": 0.00019705088274203276, "loss": 0.6402, "step": 1713 }, { "epoch": 0.23643009862749156, "grad_norm": 0.5241904258728027, "learning_rate": 0.00019704739445913604, "loss": 0.4466, "step": 1714 }, { "epoch": 0.2365680391751155, "grad_norm": 0.6275275945663452, "learning_rate": 0.0001970439041453607, "loss": 0.8982, "step": 1715 }, { "epoch": 0.2367059797227395, "grad_norm": 0.7198060750961304, "learning_rate": 0.00019704041180077983, "loss": 0.5906, "step": 1716 }, { "epoch": 0.23684392027036347, "grad_norm": 0.702900230884552, "learning_rate": 0.0001970369174254665, "loss": 0.5757, "step": 1717 }, { "epoch": 0.23698186081798744, "grad_norm": 1.613067865371704, "learning_rate": 0.0001970334210194938, "loss": 0.7592, "step": 1718 }, { "epoch": 0.23711980136561142, "grad_norm": 0.7699272632598877, "learning_rate": 0.00019702992258293497, "loss": 0.7353, "step": 1719 }, { "epoch": 0.2372577419132354, "grad_norm": 0.933709979057312, "learning_rate": 0.00019702642211586312, "loss": 0.818, "step": 1720 }, { "epoch": 0.23739568246085938, "grad_norm": 0.6716951727867126, "learning_rate": 0.0001970229196183516, "loss": 0.5607, "step": 1721 }, { "epoch": 0.23753362300848335, "grad_norm": 0.8655287027359009, "learning_rate": 0.00019701941509047365, "loss": 0.6888, "step": 1722 }, { "epoch": 0.23767156355610733, "grad_norm": 0.6017300486564636, "learning_rate": 0.00019701590853230262, "loss": 0.4247, "step": 1723 }, { "epoch": 0.23780950410373128, "grad_norm": 0.7415484189987183, "learning_rate": 0.0001970123999439119, "loss": 0.4916, "step": 1724 }, { "epoch": 0.23794744465135526, "grad_norm": 0.6755270957946777, "learning_rate": 0.0001970088893253749, "loss": 0.3593, "step": 1725 }, { "epoch": 0.23808538519897923, "grad_norm": 0.7471739649772644, "learning_rate": 0.0001970053766767651, "loss": 0.524, "step": 1726 }, { "epoch": 0.2382233257466032, "grad_norm": 0.6816518306732178, "learning_rate": 0.000197001861998156, "loss": 0.6338, "step": 1727 }, { "epoch": 0.2383612662942272, "grad_norm": 0.6671456098556519, "learning_rate": 0.00019699834528962113, "loss": 0.9366, "step": 1728 }, { "epoch": 0.23849920684185116, "grad_norm": 0.6730070114135742, "learning_rate": 0.00019699482655123412, "loss": 0.8387, "step": 1729 }, { "epoch": 0.23863714738947514, "grad_norm": 0.5691518783569336, "learning_rate": 0.00019699130578306858, "loss": 0.4091, "step": 1730 }, { "epoch": 0.23877508793709912, "grad_norm": 1.0355315208435059, "learning_rate": 0.0001969877829851982, "loss": 0.6185, "step": 1731 }, { "epoch": 0.2389130284847231, "grad_norm": 0.8908019661903381, "learning_rate": 0.0001969842581576967, "loss": 0.6629, "step": 1732 }, { "epoch": 0.23905096903234707, "grad_norm": 1.7036361694335938, "learning_rate": 0.0001969807313006378, "loss": 0.8992, "step": 1733 }, { "epoch": 0.23918890957997102, "grad_norm": 0.778057336807251, "learning_rate": 0.0001969772024140954, "loss": 0.606, "step": 1734 }, { "epoch": 0.239326850127595, "grad_norm": 0.786087691783905, "learning_rate": 0.00019697367149814328, "loss": 0.5447, "step": 1735 }, { "epoch": 0.23946479067521897, "grad_norm": 0.8138632774353027, "learning_rate": 0.00019697013855285534, "loss": 0.6551, "step": 1736 }, { "epoch": 0.23960273122284295, "grad_norm": 0.5912622809410095, "learning_rate": 0.0001969666035783055, "loss": 0.5, "step": 1737 }, { "epoch": 0.23974067177046693, "grad_norm": 0.9670955538749695, "learning_rate": 0.00019696306657456775, "loss": 0.7541, "step": 1738 }, { "epoch": 0.2398786123180909, "grad_norm": 0.8279018402099609, "learning_rate": 0.00019695952754171612, "loss": 0.5332, "step": 1739 }, { "epoch": 0.24001655286571488, "grad_norm": 0.8141160607337952, "learning_rate": 0.00019695598647982468, "loss": 0.6147, "step": 1740 }, { "epoch": 0.24015449341333886, "grad_norm": 0.8058760166168213, "learning_rate": 0.0001969524433889675, "loss": 0.478, "step": 1741 }, { "epoch": 0.24029243396096284, "grad_norm": 0.6743835210800171, "learning_rate": 0.0001969488982692187, "loss": 0.6254, "step": 1742 }, { "epoch": 0.24043037450858679, "grad_norm": 0.5762358903884888, "learning_rate": 0.00019694535112065254, "loss": 0.2735, "step": 1743 }, { "epoch": 0.24056831505621076, "grad_norm": 0.6553184986114502, "learning_rate": 0.0001969418019433432, "loss": 0.5841, "step": 1744 }, { "epoch": 0.24070625560383474, "grad_norm": 0.981372058391571, "learning_rate": 0.00019693825073736497, "loss": 0.3405, "step": 1745 }, { "epoch": 0.24084419615145872, "grad_norm": 0.645092248916626, "learning_rate": 0.00019693469750279216, "loss": 0.5043, "step": 1746 }, { "epoch": 0.2409821366990827, "grad_norm": 0.5902348756790161, "learning_rate": 0.00019693114223969912, "loss": 0.289, "step": 1747 }, { "epoch": 0.24112007724670667, "grad_norm": 0.7485817074775696, "learning_rate": 0.00019692758494816028, "loss": 0.5017, "step": 1748 }, { "epoch": 0.24125801779433065, "grad_norm": 0.820565402507782, "learning_rate": 0.00019692402562825005, "loss": 0.5045, "step": 1749 }, { "epoch": 0.24139595834195463, "grad_norm": 0.6907612085342407, "learning_rate": 0.00019692046428004294, "loss": 0.4931, "step": 1750 }, { "epoch": 0.2415338988895786, "grad_norm": 0.6738660335540771, "learning_rate": 0.00019691690090361345, "loss": 0.5456, "step": 1751 }, { "epoch": 0.24167183943720258, "grad_norm": 0.8403408527374268, "learning_rate": 0.00019691333549903614, "loss": 0.6784, "step": 1752 }, { "epoch": 0.24180977998482653, "grad_norm": 0.9201971292495728, "learning_rate": 0.0001969097680663857, "loss": 0.5875, "step": 1753 }, { "epoch": 0.2419477205324505, "grad_norm": 0.5478414297103882, "learning_rate": 0.00019690619860573668, "loss": 0.5182, "step": 1754 }, { "epoch": 0.24208566108007448, "grad_norm": 0.675180971622467, "learning_rate": 0.00019690262711716383, "loss": 0.4528, "step": 1755 }, { "epoch": 0.24222360162769846, "grad_norm": 0.6279581189155579, "learning_rate": 0.0001968990536007419, "loss": 0.6329, "step": 1756 }, { "epoch": 0.24236154217532244, "grad_norm": 1.191409945487976, "learning_rate": 0.00019689547805654565, "loss": 0.7446, "step": 1757 }, { "epoch": 0.2424994827229464, "grad_norm": 1.2488009929656982, "learning_rate": 0.0001968919004846499, "loss": 0.2938, "step": 1758 }, { "epoch": 0.2426374232705704, "grad_norm": 0.6544635891914368, "learning_rate": 0.00019688832088512955, "loss": 0.8059, "step": 1759 }, { "epoch": 0.24277536381819437, "grad_norm": 0.6988946199417114, "learning_rate": 0.00019688473925805948, "loss": 0.4907, "step": 1760 }, { "epoch": 0.24291330436581834, "grad_norm": 0.7457058429718018, "learning_rate": 0.00019688115560351465, "loss": 0.4547, "step": 1761 }, { "epoch": 0.2430512449134423, "grad_norm": 1.157456874847412, "learning_rate": 0.00019687756992157004, "loss": 1.2253, "step": 1762 }, { "epoch": 0.24318918546106627, "grad_norm": 0.7071637511253357, "learning_rate": 0.00019687398221230072, "loss": 0.754, "step": 1763 }, { "epoch": 0.24332712600869025, "grad_norm": 0.5502597093582153, "learning_rate": 0.00019687039247578172, "loss": 0.3702, "step": 1764 }, { "epoch": 0.24346506655631422, "grad_norm": 0.861042857170105, "learning_rate": 0.00019686680071208824, "loss": 0.8272, "step": 1765 }, { "epoch": 0.2436030071039382, "grad_norm": 0.6246613264083862, "learning_rate": 0.0001968632069212953, "loss": 0.5083, "step": 1766 }, { "epoch": 0.24374094765156218, "grad_norm": 0.6523869037628174, "learning_rate": 0.00019685961110347828, "loss": 0.542, "step": 1767 }, { "epoch": 0.24387888819918616, "grad_norm": 1.2454462051391602, "learning_rate": 0.00019685601325871233, "loss": 0.8554, "step": 1768 }, { "epoch": 0.24401682874681013, "grad_norm": 0.5910493731498718, "learning_rate": 0.00019685241338707275, "loss": 0.5882, "step": 1769 }, { "epoch": 0.2441547692944341, "grad_norm": 1.25151789188385, "learning_rate": 0.00019684881148863487, "loss": 1.2085, "step": 1770 }, { "epoch": 0.2442927098420581, "grad_norm": 0.9368802905082703, "learning_rate": 0.00019684520756347408, "loss": 1.0366, "step": 1771 }, { "epoch": 0.24443065038968204, "grad_norm": 0.531817615032196, "learning_rate": 0.00019684160161166582, "loss": 0.3506, "step": 1772 }, { "epoch": 0.244568590937306, "grad_norm": 1.4014822244644165, "learning_rate": 0.00019683799363328551, "loss": 0.6225, "step": 1773 }, { "epoch": 0.24470653148493, "grad_norm": 1.0450271368026733, "learning_rate": 0.00019683438362840864, "loss": 0.5852, "step": 1774 }, { "epoch": 0.24484447203255397, "grad_norm": 0.6562264561653137, "learning_rate": 0.00019683077159711083, "loss": 0.5401, "step": 1775 }, { "epoch": 0.24498241258017794, "grad_norm": 0.7233772873878479, "learning_rate": 0.00019682715753946761, "loss": 0.5065, "step": 1776 }, { "epoch": 0.24512035312780192, "grad_norm": 0.6274629831314087, "learning_rate": 0.0001968235414555546, "loss": 0.5792, "step": 1777 }, { "epoch": 0.2452582936754259, "grad_norm": 0.677960991859436, "learning_rate": 0.00019681992334544753, "loss": 0.6799, "step": 1778 }, { "epoch": 0.24539623422304987, "grad_norm": 1.155444860458374, "learning_rate": 0.00019681630320922205, "loss": 0.6348, "step": 1779 }, { "epoch": 0.24553417477067385, "grad_norm": 1.2415990829467773, "learning_rate": 0.00019681268104695395, "loss": 1.1697, "step": 1780 }, { "epoch": 0.2456721153182978, "grad_norm": 0.5191280841827393, "learning_rate": 0.00019680905685871903, "loss": 0.5037, "step": 1781 }, { "epoch": 0.24581005586592178, "grad_norm": 0.510772705078125, "learning_rate": 0.00019680543064459313, "loss": 0.5, "step": 1782 }, { "epoch": 0.24594799641354576, "grad_norm": 0.7765876650810242, "learning_rate": 0.00019680180240465214, "loss": 0.5203, "step": 1783 }, { "epoch": 0.24608593696116973, "grad_norm": 0.6907384991645813, "learning_rate": 0.00019679817213897195, "loss": 0.4657, "step": 1784 }, { "epoch": 0.2462238775087937, "grad_norm": 0.7738047242164612, "learning_rate": 0.00019679453984762862, "loss": 0.6757, "step": 1785 }, { "epoch": 0.2463618180564177, "grad_norm": 0.7272236347198486, "learning_rate": 0.00019679090553069805, "loss": 0.5232, "step": 1786 }, { "epoch": 0.24649975860404166, "grad_norm": 1.0538573265075684, "learning_rate": 0.00019678726918825637, "loss": 0.9284, "step": 1787 }, { "epoch": 0.24663769915166564, "grad_norm": 0.5996009707450867, "learning_rate": 0.00019678363082037962, "loss": 0.5656, "step": 1788 }, { "epoch": 0.24677563969928962, "grad_norm": 0.8180288672447205, "learning_rate": 0.000196779990427144, "loss": 0.5238, "step": 1789 }, { "epoch": 0.24691358024691357, "grad_norm": 0.506227970123291, "learning_rate": 0.00019677634800862568, "loss": 0.3884, "step": 1790 }, { "epoch": 0.24705152079453754, "grad_norm": 0.9007536768913269, "learning_rate": 0.00019677270356490082, "loss": 0.6447, "step": 1791 }, { "epoch": 0.24718946134216152, "grad_norm": 0.5686006546020508, "learning_rate": 0.00019676905709604576, "loss": 0.4411, "step": 1792 }, { "epoch": 0.2473274018897855, "grad_norm": 0.5894454717636108, "learning_rate": 0.0001967654086021368, "loss": 0.2373, "step": 1793 }, { "epoch": 0.24746534243740947, "grad_norm": 0.709105908870697, "learning_rate": 0.00019676175808325025, "loss": 0.5798, "step": 1794 }, { "epoch": 0.24760328298503345, "grad_norm": 0.8104382753372192, "learning_rate": 0.0001967581055394625, "loss": 1.1853, "step": 1795 }, { "epoch": 0.24774122353265743, "grad_norm": 0.8987683653831482, "learning_rate": 0.00019675445097085, "loss": 0.5887, "step": 1796 }, { "epoch": 0.2478791640802814, "grad_norm": 0.7938177585601807, "learning_rate": 0.00019675079437748928, "loss": 0.6521, "step": 1797 }, { "epoch": 0.24801710462790538, "grad_norm": 0.901508092880249, "learning_rate": 0.00019674713575945684, "loss": 0.7232, "step": 1798 }, { "epoch": 0.24815504517552936, "grad_norm": 0.8801524639129639, "learning_rate": 0.0001967434751168292, "loss": 0.7131, "step": 1799 }, { "epoch": 0.2482929857231533, "grad_norm": 1.1454228162765503, "learning_rate": 0.00019673981244968297, "loss": 1.1211, "step": 1800 }, { "epoch": 0.2482929857231533, "eval_loss": 0.635880708694458, "eval_runtime": 23.4922, "eval_samples_per_second": 2.511, "eval_steps_per_second": 2.511, "step": 1800 }, { "epoch": 0.24843092627077729, "grad_norm": 0.99720698595047, "learning_rate": 0.0001967361477580948, "loss": 0.2777, "step": 1801 }, { "epoch": 0.24856886681840126, "grad_norm": 0.6186218857765198, "learning_rate": 0.00019673248104214142, "loss": 0.4516, "step": 1802 }, { "epoch": 0.24870680736602524, "grad_norm": 1.4804497957229614, "learning_rate": 0.00019672881230189954, "loss": 0.9793, "step": 1803 }, { "epoch": 0.24884474791364922, "grad_norm": 0.7268720865249634, "learning_rate": 0.0001967251415374459, "loss": 0.3936, "step": 1804 }, { "epoch": 0.2489826884612732, "grad_norm": 0.6801441311836243, "learning_rate": 0.00019672146874885735, "loss": 0.5946, "step": 1805 }, { "epoch": 0.24912062900889717, "grad_norm": 0.680694580078125, "learning_rate": 0.00019671779393621078, "loss": 0.4763, "step": 1806 }, { "epoch": 0.24925856955652115, "grad_norm": 0.6481361389160156, "learning_rate": 0.00019671411709958303, "loss": 0.4414, "step": 1807 }, { "epoch": 0.24939651010414512, "grad_norm": 0.6604426503181458, "learning_rate": 0.0001967104382390511, "loss": 0.7995, "step": 1808 }, { "epoch": 0.24953445065176907, "grad_norm": 0.6832650899887085, "learning_rate": 0.0001967067573546919, "loss": 0.6032, "step": 1809 }, { "epoch": 0.24967239119939305, "grad_norm": 0.8404244184494019, "learning_rate": 0.00019670307444658253, "loss": 0.8818, "step": 1810 }, { "epoch": 0.24981033174701703, "grad_norm": 0.6409951448440552, "learning_rate": 0.00019669938951480008, "loss": 0.3945, "step": 1811 }, { "epoch": 0.249948272294641, "grad_norm": 0.52744460105896, "learning_rate": 0.00019669570255942157, "loss": 0.2547, "step": 1812 }, { "epoch": 0.250086212842265, "grad_norm": 0.8543083667755127, "learning_rate": 0.00019669201358052423, "loss": 1.0292, "step": 1813 }, { "epoch": 0.25022415338988896, "grad_norm": 0.7495229244232178, "learning_rate": 0.0001966883225781852, "loss": 0.3624, "step": 1814 }, { "epoch": 0.2503620939375129, "grad_norm": 0.8973178863525391, "learning_rate": 0.0001966846295524818, "loss": 0.6058, "step": 1815 }, { "epoch": 0.2505000344851369, "grad_norm": 0.681062638759613, "learning_rate": 0.00019668093450349124, "loss": 0.4538, "step": 1816 }, { "epoch": 0.25063797503276086, "grad_norm": 0.794296145439148, "learning_rate": 0.00019667723743129088, "loss": 0.6439, "step": 1817 }, { "epoch": 0.25077591558038487, "grad_norm": 0.9845653772354126, "learning_rate": 0.00019667353833595808, "loss": 0.8014, "step": 1818 }, { "epoch": 0.2509138561280088, "grad_norm": 0.7362353205680847, "learning_rate": 0.00019666983721757024, "loss": 0.3373, "step": 1819 }, { "epoch": 0.2510517966756328, "grad_norm": 0.6090799570083618, "learning_rate": 0.00019666613407620483, "loss": 0.6159, "step": 1820 }, { "epoch": 0.25118973722325677, "grad_norm": 0.890593945980072, "learning_rate": 0.00019666242891193934, "loss": 0.5172, "step": 1821 }, { "epoch": 0.2513276777708808, "grad_norm": 0.7111486792564392, "learning_rate": 0.00019665872172485132, "loss": 0.4715, "step": 1822 }, { "epoch": 0.2514656183185047, "grad_norm": 1.0255473852157593, "learning_rate": 0.0001966550125150183, "loss": 0.4709, "step": 1823 }, { "epoch": 0.2516035588661287, "grad_norm": 0.6688308119773865, "learning_rate": 0.00019665130128251797, "loss": 0.6013, "step": 1824 }, { "epoch": 0.2517414994137527, "grad_norm": 1.3637018203735352, "learning_rate": 0.0001966475880274279, "loss": 1.4028, "step": 1825 }, { "epoch": 0.2518794399613766, "grad_norm": 0.5552223324775696, "learning_rate": 0.00019664387274982592, "loss": 0.4163, "step": 1826 }, { "epoch": 0.25201738050900063, "grad_norm": 0.8371832966804504, "learning_rate": 0.00019664015544978967, "loss": 0.7152, "step": 1827 }, { "epoch": 0.2521553210566246, "grad_norm": 0.6464745998382568, "learning_rate": 0.000196636436127397, "loss": 0.4217, "step": 1828 }, { "epoch": 0.2522932616042486, "grad_norm": 0.6575655937194824, "learning_rate": 0.00019663271478272568, "loss": 0.5735, "step": 1829 }, { "epoch": 0.25243120215187254, "grad_norm": 1.1887664794921875, "learning_rate": 0.0001966289914158537, "loss": 0.5801, "step": 1830 }, { "epoch": 0.25256914269949654, "grad_norm": 0.8655521869659424, "learning_rate": 0.00019662526602685888, "loss": 0.5999, "step": 1831 }, { "epoch": 0.2527070832471205, "grad_norm": 1.5546205043792725, "learning_rate": 0.0001966215386158192, "loss": 0.9561, "step": 1832 }, { "epoch": 0.25284502379474444, "grad_norm": 0.7921841740608215, "learning_rate": 0.00019661780918281267, "loss": 1.028, "step": 1833 }, { "epoch": 0.25298296434236844, "grad_norm": 0.6538952589035034, "learning_rate": 0.00019661407772791732, "loss": 0.549, "step": 1834 }, { "epoch": 0.2531209048899924, "grad_norm": 0.6631762385368347, "learning_rate": 0.0001966103442512113, "loss": 0.8401, "step": 1835 }, { "epoch": 0.2532588454376164, "grad_norm": 1.1536856889724731, "learning_rate": 0.00019660660875277266, "loss": 1.1299, "step": 1836 }, { "epoch": 0.25339678598524035, "grad_norm": 0.7850022315979004, "learning_rate": 0.00019660287123267963, "loss": 0.574, "step": 1837 }, { "epoch": 0.25353472653286435, "grad_norm": 0.8013026118278503, "learning_rate": 0.00019659913169101036, "loss": 0.5503, "step": 1838 }, { "epoch": 0.2536726670804883, "grad_norm": 1.6716663837432861, "learning_rate": 0.00019659539012784318, "loss": 0.926, "step": 1839 }, { "epoch": 0.2538106076281123, "grad_norm": 5.523809432983398, "learning_rate": 0.00019659164654325633, "loss": 1.5072, "step": 1840 }, { "epoch": 0.25394854817573626, "grad_norm": 0.8511666655540466, "learning_rate": 0.00019658790093732815, "loss": 1.0499, "step": 1841 }, { "epoch": 0.2540864887233602, "grad_norm": 1.0895229578018188, "learning_rate": 0.00019658415331013707, "loss": 0.7317, "step": 1842 }, { "epoch": 0.2542244292709842, "grad_norm": 0.8840420246124268, "learning_rate": 0.0001965804036617615, "loss": 0.4546, "step": 1843 }, { "epoch": 0.25436236981860816, "grad_norm": 0.7697427272796631, "learning_rate": 0.0001965766519922799, "loss": 0.5082, "step": 1844 }, { "epoch": 0.25450031036623216, "grad_norm": 0.9901034832000732, "learning_rate": 0.00019657289830177078, "loss": 0.7591, "step": 1845 }, { "epoch": 0.2546382509138561, "grad_norm": 0.7947630286216736, "learning_rate": 0.00019656914259031264, "loss": 0.5419, "step": 1846 }, { "epoch": 0.2547761914614801, "grad_norm": 0.9673348665237427, "learning_rate": 0.00019656538485798416, "loss": 0.4785, "step": 1847 }, { "epoch": 0.25491413200910407, "grad_norm": 0.8805794715881348, "learning_rate": 0.00019656162510486397, "loss": 0.6473, "step": 1848 }, { "epoch": 0.25505207255672807, "grad_norm": 0.7855883836746216, "learning_rate": 0.00019655786333103067, "loss": 0.5182, "step": 1849 }, { "epoch": 0.255190013104352, "grad_norm": 0.7122774720191956, "learning_rate": 0.00019655409953656306, "loss": 0.4469, "step": 1850 }, { "epoch": 0.255327953651976, "grad_norm": 0.8488814234733582, "learning_rate": 0.00019655033372153988, "loss": 0.5004, "step": 1851 }, { "epoch": 0.2554658941996, "grad_norm": 0.6714233756065369, "learning_rate": 0.00019654656588603993, "loss": 0.721, "step": 1852 }, { "epoch": 0.2556038347472239, "grad_norm": 0.6011818051338196, "learning_rate": 0.00019654279603014205, "loss": 0.5414, "step": 1853 }, { "epoch": 0.25574177529484793, "grad_norm": 0.8252864480018616, "learning_rate": 0.00019653902415392517, "loss": 0.7765, "step": 1854 }, { "epoch": 0.2558797158424719, "grad_norm": 0.7937601804733276, "learning_rate": 0.00019653525025746814, "loss": 0.6495, "step": 1855 }, { "epoch": 0.2560176563900959, "grad_norm": 0.7072235941886902, "learning_rate": 0.00019653147434085005, "loss": 0.5307, "step": 1856 }, { "epoch": 0.25615559693771983, "grad_norm": 0.7056595683097839, "learning_rate": 0.0001965276964041498, "loss": 0.7625, "step": 1857 }, { "epoch": 0.25629353748534384, "grad_norm": 0.7287213802337646, "learning_rate": 0.0001965239164474465, "loss": 0.7, "step": 1858 }, { "epoch": 0.2564314780329678, "grad_norm": 0.8468892574310303, "learning_rate": 0.00019652013447081932, "loss": 0.5814, "step": 1859 }, { "epoch": 0.2565694185805918, "grad_norm": 0.6189303994178772, "learning_rate": 0.0001965163504743473, "loss": 0.5017, "step": 1860 }, { "epoch": 0.25670735912821574, "grad_norm": 0.5941527485847473, "learning_rate": 0.00019651256445810968, "loss": 0.393, "step": 1861 }, { "epoch": 0.2568452996758397, "grad_norm": 0.8749145865440369, "learning_rate": 0.00019650877642218568, "loss": 0.6318, "step": 1862 }, { "epoch": 0.2569832402234637, "grad_norm": 0.7625764012336731, "learning_rate": 0.00019650498636665455, "loss": 0.9335, "step": 1863 }, { "epoch": 0.25712118077108764, "grad_norm": 0.628250241279602, "learning_rate": 0.00019650119429159566, "loss": 0.9172, "step": 1864 }, { "epoch": 0.25725912131871165, "grad_norm": 0.8198027610778809, "learning_rate": 0.0001964974001970883, "loss": 0.6012, "step": 1865 }, { "epoch": 0.2573970618663356, "grad_norm": 0.7354773283004761, "learning_rate": 0.0001964936040832119, "loss": 0.4291, "step": 1866 }, { "epoch": 0.2575350024139596, "grad_norm": 0.5571432709693909, "learning_rate": 0.00019648980595004587, "loss": 0.3519, "step": 1867 }, { "epoch": 0.25767294296158355, "grad_norm": 0.652762234210968, "learning_rate": 0.00019648600579766972, "loss": 0.5196, "step": 1868 }, { "epoch": 0.25781088350920756, "grad_norm": 0.5905240774154663, "learning_rate": 0.00019648220362616302, "loss": 0.5048, "step": 1869 }, { "epoch": 0.2579488240568315, "grad_norm": 0.5894907116889954, "learning_rate": 0.00019647839943560524, "loss": 0.6131, "step": 1870 }, { "epoch": 0.25808676460445545, "grad_norm": 0.6585453152656555, "learning_rate": 0.00019647459322607604, "loss": 0.7121, "step": 1871 }, { "epoch": 0.25822470515207946, "grad_norm": 0.6665537357330322, "learning_rate": 0.0001964707849976551, "loss": 0.5655, "step": 1872 }, { "epoch": 0.2583626456997034, "grad_norm": 1.1846274137496948, "learning_rate": 0.00019646697475042205, "loss": 0.8935, "step": 1873 }, { "epoch": 0.2585005862473274, "grad_norm": 0.9661015868186951, "learning_rate": 0.00019646316248445665, "loss": 0.3233, "step": 1874 }, { "epoch": 0.25863852679495136, "grad_norm": 0.7290651202201843, "learning_rate": 0.0001964593481998387, "loss": 0.6422, "step": 1875 }, { "epoch": 0.25877646734257537, "grad_norm": 0.7646875381469727, "learning_rate": 0.00019645553189664804, "loss": 0.6223, "step": 1876 }, { "epoch": 0.2589144078901993, "grad_norm": 1.009062647819519, "learning_rate": 0.00019645171357496444, "loss": 0.4979, "step": 1877 }, { "epoch": 0.2590523484378233, "grad_norm": 0.665295422077179, "learning_rate": 0.0001964478932348679, "loss": 0.5433, "step": 1878 }, { "epoch": 0.25919028898544727, "grad_norm": 0.861430287361145, "learning_rate": 0.0001964440708764383, "loss": 0.5723, "step": 1879 }, { "epoch": 0.2593282295330712, "grad_norm": 0.7060577869415283, "learning_rate": 0.0001964402464997557, "loss": 0.8107, "step": 1880 }, { "epoch": 0.2594661700806952, "grad_norm": 0.7186295390129089, "learning_rate": 0.00019643642010490006, "loss": 0.6082, "step": 1881 }, { "epoch": 0.2596041106283192, "grad_norm": 0.6485528349876404, "learning_rate": 0.00019643259169195153, "loss": 0.7899, "step": 1882 }, { "epoch": 0.2597420511759432, "grad_norm": 0.7202152013778687, "learning_rate": 0.00019642876126099013, "loss": 0.6429, "step": 1883 }, { "epoch": 0.2598799917235671, "grad_norm": 0.7233872413635254, "learning_rate": 0.00019642492881209606, "loss": 0.3552, "step": 1884 }, { "epoch": 0.26001793227119113, "grad_norm": 0.4954456686973572, "learning_rate": 0.00019642109434534958, "loss": 0.6679, "step": 1885 }, { "epoch": 0.2601558728188151, "grad_norm": 0.9985360503196716, "learning_rate": 0.00019641725786083086, "loss": 0.8218, "step": 1886 }, { "epoch": 0.2602938133664391, "grad_norm": 0.7533819675445557, "learning_rate": 0.0001964134193586202, "loss": 0.4082, "step": 1887 }, { "epoch": 0.26043175391406304, "grad_norm": 1.040789008140564, "learning_rate": 0.00019640957883879792, "loss": 0.6183, "step": 1888 }, { "epoch": 0.26056969446168704, "grad_norm": 0.6111286878585815, "learning_rate": 0.0001964057363014444, "loss": 0.4457, "step": 1889 }, { "epoch": 0.260707635009311, "grad_norm": 0.7093996405601501, "learning_rate": 0.00019640189174664006, "loss": 0.5728, "step": 1890 }, { "epoch": 0.26084557555693494, "grad_norm": 0.8528907299041748, "learning_rate": 0.00019639804517446535, "loss": 0.6318, "step": 1891 }, { "epoch": 0.26098351610455894, "grad_norm": 0.8848522305488586, "learning_rate": 0.0001963941965850008, "loss": 0.7564, "step": 1892 }, { "epoch": 0.2611214566521829, "grad_norm": 0.6769065260887146, "learning_rate": 0.0001963903459783269, "loss": 0.4853, "step": 1893 }, { "epoch": 0.2612593971998069, "grad_norm": 0.6242778897285461, "learning_rate": 0.00019638649335452423, "loss": 0.6436, "step": 1894 }, { "epoch": 0.26139733774743085, "grad_norm": 0.6486254930496216, "learning_rate": 0.00019638263871367339, "loss": 0.4826, "step": 1895 }, { "epoch": 0.26153527829505485, "grad_norm": 0.5979126691818237, "learning_rate": 0.00019637878205585514, "loss": 0.3943, "step": 1896 }, { "epoch": 0.2616732188426788, "grad_norm": 0.8692513704299927, "learning_rate": 0.00019637492338115008, "loss": 0.8218, "step": 1897 }, { "epoch": 0.2618111593903028, "grad_norm": 0.8511347770690918, "learning_rate": 0.00019637106268963902, "loss": 0.7774, "step": 1898 }, { "epoch": 0.26194909993792675, "grad_norm": 1.0287235975265503, "learning_rate": 0.00019636719998140275, "loss": 0.2615, "step": 1899 }, { "epoch": 0.2620870404855507, "grad_norm": 0.6668317914009094, "learning_rate": 0.00019636333525652208, "loss": 0.7495, "step": 1900 }, { "epoch": 0.2622249810331747, "grad_norm": 0.7268804311752319, "learning_rate": 0.0001963594685150779, "loss": 0.5185, "step": 1901 }, { "epoch": 0.26236292158079866, "grad_norm": 0.7757692933082581, "learning_rate": 0.00019635559975715111, "loss": 0.5444, "step": 1902 }, { "epoch": 0.26250086212842266, "grad_norm": 0.6161015033721924, "learning_rate": 0.00019635172898282272, "loss": 0.4839, "step": 1903 }, { "epoch": 0.2626388026760466, "grad_norm": 1.149086356163025, "learning_rate": 0.00019634785619217367, "loss": 0.8034, "step": 1904 }, { "epoch": 0.2627767432236706, "grad_norm": 0.6020526885986328, "learning_rate": 0.00019634398138528502, "loss": 0.3668, "step": 1905 }, { "epoch": 0.26291468377129457, "grad_norm": 1.1235734224319458, "learning_rate": 0.0001963401045622379, "loss": 0.7069, "step": 1906 }, { "epoch": 0.26305262431891857, "grad_norm": 0.7203898429870605, "learning_rate": 0.00019633622572311338, "loss": 0.5809, "step": 1907 }, { "epoch": 0.2631905648665425, "grad_norm": 0.8023480176925659, "learning_rate": 0.00019633234486799271, "loss": 0.5076, "step": 1908 }, { "epoch": 0.26332850541416647, "grad_norm": 0.42404186725616455, "learning_rate": 0.000196328461996957, "loss": 0.3387, "step": 1909 }, { "epoch": 0.2634664459617905, "grad_norm": 0.8660046458244324, "learning_rate": 0.00019632457711008758, "loss": 0.4406, "step": 1910 }, { "epoch": 0.2636043865094144, "grad_norm": 0.7847766876220703, "learning_rate": 0.00019632069020746572, "loss": 1.0109, "step": 1911 }, { "epoch": 0.26374232705703843, "grad_norm": 0.7731550335884094, "learning_rate": 0.00019631680128917275, "loss": 0.5357, "step": 1912 }, { "epoch": 0.2638802676046624, "grad_norm": 0.6353673338890076, "learning_rate": 0.0001963129103552901, "loss": 0.3999, "step": 1913 }, { "epoch": 0.2640182081522864, "grad_norm": 1.6938080787658691, "learning_rate": 0.00019630901740589916, "loss": 0.7445, "step": 1914 }, { "epoch": 0.26415614869991033, "grad_norm": 0.6076092720031738, "learning_rate": 0.00019630512244108136, "loss": 0.26, "step": 1915 }, { "epoch": 0.26429408924753434, "grad_norm": 0.5662344694137573, "learning_rate": 0.00019630122546091826, "loss": 0.4033, "step": 1916 }, { "epoch": 0.2644320297951583, "grad_norm": 0.6494229435920715, "learning_rate": 0.00019629732646549142, "loss": 0.7196, "step": 1917 }, { "epoch": 0.26456997034278223, "grad_norm": 0.9408316016197205, "learning_rate": 0.0001962934254548824, "loss": 0.3355, "step": 1918 }, { "epoch": 0.26470791089040624, "grad_norm": 1.035130262374878, "learning_rate": 0.00019628952242917286, "loss": 0.7799, "step": 1919 }, { "epoch": 0.2648458514380302, "grad_norm": 0.7484390139579773, "learning_rate": 0.0001962856173884444, "loss": 0.6456, "step": 1920 }, { "epoch": 0.2649837919856542, "grad_norm": 0.8826567530632019, "learning_rate": 0.0001962817103327789, "loss": 1.0731, "step": 1921 }, { "epoch": 0.26512173253327814, "grad_norm": 0.9564138054847717, "learning_rate": 0.00019627780126225798, "loss": 0.7073, "step": 1922 }, { "epoch": 0.26525967308090215, "grad_norm": 0.8987119793891907, "learning_rate": 0.00019627389017696346, "loss": 0.7057, "step": 1923 }, { "epoch": 0.2653976136285261, "grad_norm": 0.6832298636436462, "learning_rate": 0.00019626997707697724, "loss": 0.5131, "step": 1924 }, { "epoch": 0.2655355541761501, "grad_norm": 0.7652315497398376, "learning_rate": 0.0001962660619623812, "loss": 0.8216, "step": 1925 }, { "epoch": 0.26567349472377405, "grad_norm": 0.7434420585632324, "learning_rate": 0.0001962621448332572, "loss": 0.4573, "step": 1926 }, { "epoch": 0.265811435271398, "grad_norm": 0.8376505970954895, "learning_rate": 0.0001962582256896873, "loss": 1.1563, "step": 1927 }, { "epoch": 0.265949375819022, "grad_norm": 0.8742194771766663, "learning_rate": 0.00019625430453175347, "loss": 0.8819, "step": 1928 }, { "epoch": 0.26608731636664595, "grad_norm": 0.624729335308075, "learning_rate": 0.0001962503813595378, "loss": 0.732, "step": 1929 }, { "epoch": 0.26622525691426996, "grad_norm": 0.86820387840271, "learning_rate": 0.00019624645617312235, "loss": 0.876, "step": 1930 }, { "epoch": 0.2663631974618939, "grad_norm": 0.6308462619781494, "learning_rate": 0.0001962425289725893, "loss": 0.9112, "step": 1931 }, { "epoch": 0.2665011380095179, "grad_norm": 0.6708442568778992, "learning_rate": 0.00019623859975802078, "loss": 0.7087, "step": 1932 }, { "epoch": 0.26663907855714186, "grad_norm": 0.4869930148124695, "learning_rate": 0.00019623466852949903, "loss": 0.2134, "step": 1933 }, { "epoch": 0.26677701910476587, "grad_norm": 1.0010393857955933, "learning_rate": 0.00019623073528710636, "loss": 0.6263, "step": 1934 }, { "epoch": 0.2669149596523898, "grad_norm": 0.6358545422554016, "learning_rate": 0.00019622680003092503, "loss": 0.6447, "step": 1935 }, { "epoch": 0.2670529002000138, "grad_norm": 1.382300615310669, "learning_rate": 0.00019622286276103746, "loss": 0.8739, "step": 1936 }, { "epoch": 0.26719084074763777, "grad_norm": 0.7059391736984253, "learning_rate": 0.000196218923477526, "loss": 0.4168, "step": 1937 }, { "epoch": 0.2673287812952617, "grad_norm": 0.634362518787384, "learning_rate": 0.000196214982180473, "loss": 0.64, "step": 1938 }, { "epoch": 0.2674667218428857, "grad_norm": 0.5203801393508911, "learning_rate": 0.00019621103886996108, "loss": 0.3878, "step": 1939 }, { "epoch": 0.2676046623905097, "grad_norm": 0.7763768434524536, "learning_rate": 0.0001962070935460727, "loss": 0.6848, "step": 1940 }, { "epoch": 0.2677426029381337, "grad_norm": 1.2101529836654663, "learning_rate": 0.00019620314620889042, "loss": 0.6218, "step": 1941 }, { "epoch": 0.2678805434857576, "grad_norm": 0.7524515390396118, "learning_rate": 0.00019619919685849685, "loss": 0.4572, "step": 1942 }, { "epoch": 0.26801848403338163, "grad_norm": 0.8168664574623108, "learning_rate": 0.00019619524549497466, "loss": 0.558, "step": 1943 }, { "epoch": 0.2681564245810056, "grad_norm": 0.7882441282272339, "learning_rate": 0.00019619129211840647, "loss": 0.6891, "step": 1944 }, { "epoch": 0.2682943651286296, "grad_norm": 0.5386197566986084, "learning_rate": 0.00019618733672887507, "loss": 0.3957, "step": 1945 }, { "epoch": 0.26843230567625354, "grad_norm": 0.7028146982192993, "learning_rate": 0.00019618337932646328, "loss": 0.6151, "step": 1946 }, { "epoch": 0.2685702462238775, "grad_norm": 0.5782563090324402, "learning_rate": 0.0001961794199112538, "loss": 0.4059, "step": 1947 }, { "epoch": 0.2687081867715015, "grad_norm": 0.5078962445259094, "learning_rate": 0.00019617545848332952, "loss": 0.2537, "step": 1948 }, { "epoch": 0.26884612731912544, "grad_norm": 0.7615155577659607, "learning_rate": 0.00019617149504277338, "loss": 0.6574, "step": 1949 }, { "epoch": 0.26898406786674944, "grad_norm": 1.6328436136245728, "learning_rate": 0.00019616752958966834, "loss": 0.6543, "step": 1950 }, { "epoch": 0.2691220084143734, "grad_norm": 0.9626491665840149, "learning_rate": 0.0001961635621240973, "loss": 0.8887, "step": 1951 }, { "epoch": 0.2692599489619974, "grad_norm": 0.6434158086776733, "learning_rate": 0.00019615959264614335, "loss": 0.542, "step": 1952 }, { "epoch": 0.26939788950962135, "grad_norm": 0.4906614124774933, "learning_rate": 0.00019615562115588955, "loss": 0.3502, "step": 1953 }, { "epoch": 0.26953583005724535, "grad_norm": 0.8258650302886963, "learning_rate": 0.000196151647653419, "loss": 0.5331, "step": 1954 }, { "epoch": 0.2696737706048693, "grad_norm": 1.175118327140808, "learning_rate": 0.00019614767213881485, "loss": 0.6168, "step": 1955 }, { "epoch": 0.26981171115249325, "grad_norm": 0.84894859790802, "learning_rate": 0.0001961436946121603, "loss": 0.5634, "step": 1956 }, { "epoch": 0.26994965170011725, "grad_norm": 0.8358104825019836, "learning_rate": 0.00019613971507353854, "loss": 0.5099, "step": 1957 }, { "epoch": 0.2700875922477412, "grad_norm": 0.48822519183158875, "learning_rate": 0.00019613573352303293, "loss": 0.2676, "step": 1958 }, { "epoch": 0.2702255327953652, "grad_norm": 0.6187692284584045, "learning_rate": 0.00019613174996072676, "loss": 0.2216, "step": 1959 }, { "epoch": 0.27036347334298916, "grad_norm": 0.6481124758720398, "learning_rate": 0.0001961277643867034, "loss": 0.7876, "step": 1960 }, { "epoch": 0.27050141389061316, "grad_norm": 0.7270785570144653, "learning_rate": 0.0001961237768010462, "loss": 0.6293, "step": 1961 }, { "epoch": 0.2706393544382371, "grad_norm": 1.133460521697998, "learning_rate": 0.00019611978720383868, "loss": 0.5158, "step": 1962 }, { "epoch": 0.2707772949858611, "grad_norm": 1.0767223834991455, "learning_rate": 0.00019611579559516425, "loss": 0.7052, "step": 1963 }, { "epoch": 0.27091523553348507, "grad_norm": 0.9242454171180725, "learning_rate": 0.00019611180197510653, "loss": 0.9248, "step": 1964 }, { "epoch": 0.271053176081109, "grad_norm": 0.676384687423706, "learning_rate": 0.00019610780634374907, "loss": 0.2746, "step": 1965 }, { "epoch": 0.271191116628733, "grad_norm": 0.8368886113166809, "learning_rate": 0.0001961038087011754, "loss": 0.4871, "step": 1966 }, { "epoch": 0.27132905717635697, "grad_norm": 0.81053227186203, "learning_rate": 0.00019609980904746927, "loss": 0.4405, "step": 1967 }, { "epoch": 0.271466997723981, "grad_norm": 0.7386883497238159, "learning_rate": 0.00019609580738271437, "loss": 0.6491, "step": 1968 }, { "epoch": 0.2716049382716049, "grad_norm": 0.8443131446838379, "learning_rate": 0.0001960918037069944, "loss": 0.3156, "step": 1969 }, { "epoch": 0.27174287881922893, "grad_norm": 0.7694213390350342, "learning_rate": 0.00019608779802039318, "loss": 0.3285, "step": 1970 }, { "epoch": 0.2718808193668529, "grad_norm": 0.9998572468757629, "learning_rate": 0.00019608379032299452, "loss": 0.5763, "step": 1971 }, { "epoch": 0.2720187599144769, "grad_norm": 0.8617352247238159, "learning_rate": 0.0001960797806148823, "loss": 0.4276, "step": 1972 }, { "epoch": 0.27215670046210083, "grad_norm": 0.8109136819839478, "learning_rate": 0.0001960757688961404, "loss": 0.7774, "step": 1973 }, { "epoch": 0.27229464100972484, "grad_norm": 0.832369327545166, "learning_rate": 0.0001960717551668528, "loss": 0.5779, "step": 1974 }, { "epoch": 0.2724325815573488, "grad_norm": 0.8160262703895569, "learning_rate": 0.0001960677394271035, "loss": 0.7573, "step": 1975 }, { "epoch": 0.27257052210497273, "grad_norm": 0.5495042204856873, "learning_rate": 0.00019606372167697654, "loss": 0.4421, "step": 1976 }, { "epoch": 0.27270846265259674, "grad_norm": 0.9263167381286621, "learning_rate": 0.00019605970191655594, "loss": 0.5289, "step": 1977 }, { "epoch": 0.2728464032002207, "grad_norm": 0.5080690383911133, "learning_rate": 0.00019605568014592587, "loss": 0.3154, "step": 1978 }, { "epoch": 0.2729843437478447, "grad_norm": 0.5967199802398682, "learning_rate": 0.0001960516563651705, "loss": 0.56, "step": 1979 }, { "epoch": 0.27312228429546864, "grad_norm": 0.8977450132369995, "learning_rate": 0.000196047630574374, "loss": 0.6299, "step": 1980 }, { "epoch": 0.27326022484309265, "grad_norm": 1.3137375116348267, "learning_rate": 0.00019604360277362064, "loss": 0.6793, "step": 1981 }, { "epoch": 0.2733981653907166, "grad_norm": 0.5420688986778259, "learning_rate": 0.00019603957296299471, "loss": 0.4237, "step": 1982 }, { "epoch": 0.2735361059383406, "grad_norm": 1.1246532201766968, "learning_rate": 0.00019603554114258054, "loss": 0.4511, "step": 1983 }, { "epoch": 0.27367404648596455, "grad_norm": 1.3849523067474365, "learning_rate": 0.00019603150731246246, "loss": 1.2152, "step": 1984 }, { "epoch": 0.2738119870335885, "grad_norm": 0.7936597466468811, "learning_rate": 0.00019602747147272497, "loss": 0.414, "step": 1985 }, { "epoch": 0.2739499275812125, "grad_norm": 0.8089765906333923, "learning_rate": 0.00019602343362345242, "loss": 0.5532, "step": 1986 }, { "epoch": 0.27408786812883645, "grad_norm": 0.7434920072555542, "learning_rate": 0.0001960193937647294, "loss": 0.395, "step": 1987 }, { "epoch": 0.27422580867646046, "grad_norm": 0.6069332957267761, "learning_rate": 0.0001960153518966404, "loss": 0.479, "step": 1988 }, { "epoch": 0.2743637492240844, "grad_norm": 0.6373541355133057, "learning_rate": 0.00019601130801927003, "loss": 0.6278, "step": 1989 }, { "epoch": 0.2745016897717084, "grad_norm": 0.7494776844978333, "learning_rate": 0.00019600726213270288, "loss": 0.6765, "step": 1990 }, { "epoch": 0.27463963031933236, "grad_norm": 0.9745563864707947, "learning_rate": 0.00019600321423702367, "loss": 0.3576, "step": 1991 }, { "epoch": 0.27477757086695637, "grad_norm": 0.6818426251411438, "learning_rate": 0.00019599916433231707, "loss": 0.2374, "step": 1992 }, { "epoch": 0.2749155114145803, "grad_norm": 0.5867907404899597, "learning_rate": 0.00019599511241866785, "loss": 0.5321, "step": 1993 }, { "epoch": 0.27505345196220427, "grad_norm": 0.6568586826324463, "learning_rate": 0.00019599105849616078, "loss": 0.5506, "step": 1994 }, { "epoch": 0.27519139250982827, "grad_norm": 0.7166451811790466, "learning_rate": 0.00019598700256488071, "loss": 0.6947, "step": 1995 }, { "epoch": 0.2753293330574522, "grad_norm": 0.6616407036781311, "learning_rate": 0.00019598294462491253, "loss": 0.5189, "step": 1996 }, { "epoch": 0.2754672736050762, "grad_norm": 0.7204359173774719, "learning_rate": 0.00019597888467634116, "loss": 0.5192, "step": 1997 }, { "epoch": 0.2756052141527002, "grad_norm": 0.6476003527641296, "learning_rate": 0.0001959748227192515, "loss": 0.7107, "step": 1998 }, { "epoch": 0.2757431547003242, "grad_norm": 0.6306154131889343, "learning_rate": 0.00019597075875372864, "loss": 0.4349, "step": 1999 }, { "epoch": 0.2758810952479481, "grad_norm": 0.8269013166427612, "learning_rate": 0.00019596669277985757, "loss": 0.2929, "step": 2000 }, { "epoch": 0.2758810952479481, "eval_loss": 0.6565178632736206, "eval_runtime": 23.5215, "eval_samples_per_second": 2.508, "eval_steps_per_second": 2.508, "step": 2000 }, { "epoch": 0.27601903579557213, "grad_norm": 0.7976175546646118, "learning_rate": 0.00019596262479772337, "loss": 0.8986, "step": 2001 }, { "epoch": 0.2761569763431961, "grad_norm": 0.7986498475074768, "learning_rate": 0.0001959585548074112, "loss": 0.6112, "step": 2002 }, { "epoch": 0.27629491689082003, "grad_norm": 0.6366571187973022, "learning_rate": 0.00019595448280900626, "loss": 0.4923, "step": 2003 }, { "epoch": 0.27643285743844404, "grad_norm": 0.6254854798316956, "learning_rate": 0.0001959504088025937, "loss": 0.3532, "step": 2004 }, { "epoch": 0.276570797986068, "grad_norm": 0.9452973008155823, "learning_rate": 0.0001959463327882588, "loss": 1.054, "step": 2005 }, { "epoch": 0.276708738533692, "grad_norm": 0.8644076585769653, "learning_rate": 0.00019594225476608686, "loss": 0.6988, "step": 2006 }, { "epoch": 0.27684667908131594, "grad_norm": 0.563485860824585, "learning_rate": 0.00019593817473616322, "loss": 0.5089, "step": 2007 }, { "epoch": 0.27698461962893994, "grad_norm": 0.9795621037483215, "learning_rate": 0.00019593409269857325, "loss": 0.2905, "step": 2008 }, { "epoch": 0.2771225601765639, "grad_norm": 0.6783512234687805, "learning_rate": 0.00019593000865340238, "loss": 0.3544, "step": 2009 }, { "epoch": 0.2772605007241879, "grad_norm": 0.6736263036727905, "learning_rate": 0.00019592592260073613, "loss": 0.508, "step": 2010 }, { "epoch": 0.27739844127181185, "grad_norm": 0.8978418111801147, "learning_rate": 0.00019592183454065988, "loss": 0.8027, "step": 2011 }, { "epoch": 0.2775363818194358, "grad_norm": 0.877034604549408, "learning_rate": 0.0001959177444732593, "loss": 0.7846, "step": 2012 }, { "epoch": 0.2776743223670598, "grad_norm": 0.6984195113182068, "learning_rate": 0.00019591365239861994, "loss": 0.559, "step": 2013 }, { "epoch": 0.27781226291468375, "grad_norm": 1.1937421560287476, "learning_rate": 0.00019590955831682742, "loss": 0.7328, "step": 2014 }, { "epoch": 0.27795020346230775, "grad_norm": 0.7599695920944214, "learning_rate": 0.00019590546222796742, "loss": 0.392, "step": 2015 }, { "epoch": 0.2780881440099317, "grad_norm": 0.8356521129608154, "learning_rate": 0.00019590136413212566, "loss": 0.3812, "step": 2016 }, { "epoch": 0.2782260845575557, "grad_norm": 0.6217128038406372, "learning_rate": 0.00019589726402938792, "loss": 0.5064, "step": 2017 }, { "epoch": 0.27836402510517966, "grad_norm": 0.9336037039756775, "learning_rate": 0.00019589316191984, "loss": 0.7596, "step": 2018 }, { "epoch": 0.27850196565280366, "grad_norm": 0.6983953714370728, "learning_rate": 0.0001958890578035677, "loss": 0.7579, "step": 2019 }, { "epoch": 0.2786399062004276, "grad_norm": 0.6743526458740234, "learning_rate": 0.00019588495168065692, "loss": 0.4536, "step": 2020 }, { "epoch": 0.2787778467480516, "grad_norm": 0.8309145569801331, "learning_rate": 0.00019588084355119363, "loss": 0.5478, "step": 2021 }, { "epoch": 0.27891578729567557, "grad_norm": 0.9033045172691345, "learning_rate": 0.00019587673341526376, "loss": 0.4243, "step": 2022 }, { "epoch": 0.2790537278432995, "grad_norm": 0.8193897604942322, "learning_rate": 0.00019587262127295331, "loss": 0.5688, "step": 2023 }, { "epoch": 0.2791916683909235, "grad_norm": 0.6730914115905762, "learning_rate": 0.0001958685071243484, "loss": 0.5477, "step": 2024 }, { "epoch": 0.27932960893854747, "grad_norm": 0.5275852680206299, "learning_rate": 0.00019586439096953506, "loss": 0.2861, "step": 2025 }, { "epoch": 0.2794675494861715, "grad_norm": 0.7503786087036133, "learning_rate": 0.00019586027280859945, "loss": 0.7682, "step": 2026 }, { "epoch": 0.2796054900337954, "grad_norm": 0.6576685309410095, "learning_rate": 0.00019585615264162772, "loss": 0.6214, "step": 2027 }, { "epoch": 0.2797434305814194, "grad_norm": 0.7833530306816101, "learning_rate": 0.00019585203046870614, "loss": 0.9256, "step": 2028 }, { "epoch": 0.2798813711290434, "grad_norm": 0.779478132724762, "learning_rate": 0.00019584790628992098, "loss": 0.6512, "step": 2029 }, { "epoch": 0.2800193116766674, "grad_norm": 0.5535669922828674, "learning_rate": 0.0001958437801053585, "loss": 0.3869, "step": 2030 }, { "epoch": 0.28015725222429133, "grad_norm": 0.694486141204834, "learning_rate": 0.00019583965191510505, "loss": 0.3586, "step": 2031 }, { "epoch": 0.2802951927719153, "grad_norm": 0.7821094989776611, "learning_rate": 0.00019583552171924704, "loss": 0.5341, "step": 2032 }, { "epoch": 0.2804331333195393, "grad_norm": 0.570767879486084, "learning_rate": 0.0001958313895178709, "loss": 0.4214, "step": 2033 }, { "epoch": 0.28057107386716323, "grad_norm": 0.7766290307044983, "learning_rate": 0.00019582725531106307, "loss": 0.6409, "step": 2034 }, { "epoch": 0.28070901441478724, "grad_norm": 0.7544063925743103, "learning_rate": 0.00019582311909891012, "loss": 0.5586, "step": 2035 }, { "epoch": 0.2808469549624112, "grad_norm": 0.6841877102851868, "learning_rate": 0.0001958189808814986, "loss": 0.4377, "step": 2036 }, { "epoch": 0.2809848955100352, "grad_norm": 0.584334135055542, "learning_rate": 0.00019581484065891506, "loss": 0.5649, "step": 2037 }, { "epoch": 0.28112283605765914, "grad_norm": 0.7064344882965088, "learning_rate": 0.00019581069843124617, "loss": 0.8847, "step": 2038 }, { "epoch": 0.28126077660528315, "grad_norm": 0.5461025834083557, "learning_rate": 0.00019580655419857866, "loss": 0.3344, "step": 2039 }, { "epoch": 0.2813987171529071, "grad_norm": 1.2574125528335571, "learning_rate": 0.00019580240796099915, "loss": 0.7018, "step": 2040 }, { "epoch": 0.28153665770053105, "grad_norm": 1.02732253074646, "learning_rate": 0.00019579825971859452, "loss": 1.1026, "step": 2041 }, { "epoch": 0.28167459824815505, "grad_norm": 0.4866338074207306, "learning_rate": 0.00019579410947145146, "loss": 0.4095, "step": 2042 }, { "epoch": 0.281812538795779, "grad_norm": 0.7297942042350769, "learning_rate": 0.00019578995721965695, "loss": 0.7477, "step": 2043 }, { "epoch": 0.281950479343403, "grad_norm": 0.671257734298706, "learning_rate": 0.0001957858029632978, "loss": 0.6971, "step": 2044 }, { "epoch": 0.28208841989102695, "grad_norm": 0.6661747097969055, "learning_rate": 0.00019578164670246094, "loss": 0.4219, "step": 2045 }, { "epoch": 0.28222636043865096, "grad_norm": 1.152039885520935, "learning_rate": 0.00019577748843723337, "loss": 0.9014, "step": 2046 }, { "epoch": 0.2823643009862749, "grad_norm": 0.8204615712165833, "learning_rate": 0.0001957733281677021, "loss": 0.7038, "step": 2047 }, { "epoch": 0.2825022415338989, "grad_norm": 0.6705266237258911, "learning_rate": 0.00019576916589395424, "loss": 0.7392, "step": 2048 }, { "epoch": 0.28264018208152286, "grad_norm": 1.229459524154663, "learning_rate": 0.00019576500161607685, "loss": 1.0651, "step": 2049 }, { "epoch": 0.2827781226291468, "grad_norm": 0.685117244720459, "learning_rate": 0.00019576083533415703, "loss": 0.7557, "step": 2050 }, { "epoch": 0.2829160631767708, "grad_norm": 0.8755848407745361, "learning_rate": 0.00019575666704828206, "loss": 0.931, "step": 2051 }, { "epoch": 0.28305400372439476, "grad_norm": 0.6004536747932434, "learning_rate": 0.00019575249675853908, "loss": 0.5779, "step": 2052 }, { "epoch": 0.28319194427201877, "grad_norm": 0.671427845954895, "learning_rate": 0.00019574832446501544, "loss": 0.4515, "step": 2053 }, { "epoch": 0.2833298848196427, "grad_norm": 0.9582410454750061, "learning_rate": 0.0001957441501677984, "loss": 0.6176, "step": 2054 }, { "epoch": 0.2834678253672667, "grad_norm": 0.8629324436187744, "learning_rate": 0.00019573997386697532, "loss": 0.7077, "step": 2055 }, { "epoch": 0.2836057659148907, "grad_norm": 0.9006950259208679, "learning_rate": 0.0001957357955626336, "loss": 0.6964, "step": 2056 }, { "epoch": 0.2837437064625147, "grad_norm": 0.6615795493125916, "learning_rate": 0.0001957316152548607, "loss": 0.4452, "step": 2057 }, { "epoch": 0.2838816470101386, "grad_norm": 0.7859619855880737, "learning_rate": 0.00019572743294374404, "loss": 1.0109, "step": 2058 }, { "epoch": 0.28401958755776263, "grad_norm": 0.6359809041023254, "learning_rate": 0.00019572324862937124, "loss": 0.452, "step": 2059 }, { "epoch": 0.2841575281053866, "grad_norm": 0.780289351940155, "learning_rate": 0.00019571906231182978, "loss": 0.8381, "step": 2060 }, { "epoch": 0.28429546865301053, "grad_norm": 0.8848547339439392, "learning_rate": 0.0001957148739912073, "loss": 0.6521, "step": 2061 }, { "epoch": 0.28443340920063453, "grad_norm": 0.6815661787986755, "learning_rate": 0.00019571068366759143, "loss": 0.5813, "step": 2062 }, { "epoch": 0.2845713497482585, "grad_norm": 0.5312855243682861, "learning_rate": 0.00019570649134106985, "loss": 0.3351, "step": 2063 }, { "epoch": 0.2847092902958825, "grad_norm": 0.5981124043464661, "learning_rate": 0.00019570229701173036, "loss": 0.4126, "step": 2064 }, { "epoch": 0.28484723084350644, "grad_norm": 0.9804319739341736, "learning_rate": 0.00019569810067966066, "loss": 0.8333, "step": 2065 }, { "epoch": 0.28498517139113044, "grad_norm": 1.0361062288284302, "learning_rate": 0.00019569390234494858, "loss": 0.6087, "step": 2066 }, { "epoch": 0.2851231119387544, "grad_norm": 0.7839725017547607, "learning_rate": 0.000195689702007682, "loss": 0.7404, "step": 2067 }, { "epoch": 0.2852610524863784, "grad_norm": 1.3355668783187866, "learning_rate": 0.0001956854996679488, "loss": 0.4984, "step": 2068 }, { "epoch": 0.28539899303400235, "grad_norm": 0.6724937558174133, "learning_rate": 0.00019568129532583693, "loss": 0.4341, "step": 2069 }, { "epoch": 0.2855369335816263, "grad_norm": 0.7715407013893127, "learning_rate": 0.00019567708898143437, "loss": 0.6913, "step": 2070 }, { "epoch": 0.2856748741292503, "grad_norm": 0.8403461575508118, "learning_rate": 0.00019567288063482914, "loss": 0.5184, "step": 2071 }, { "epoch": 0.28581281467687425, "grad_norm": 0.6787713766098022, "learning_rate": 0.0001956686702861093, "loss": 0.5928, "step": 2072 }, { "epoch": 0.28595075522449825, "grad_norm": 0.5545241832733154, "learning_rate": 0.00019566445793536299, "loss": 0.4176, "step": 2073 }, { "epoch": 0.2860886957721222, "grad_norm": 0.5456835031509399, "learning_rate": 0.00019566024358267834, "loss": 0.409, "step": 2074 }, { "epoch": 0.2862266363197462, "grad_norm": 1.8867385387420654, "learning_rate": 0.00019565602722814354, "loss": 0.6322, "step": 2075 }, { "epoch": 0.28636457686737016, "grad_norm": 0.7244119644165039, "learning_rate": 0.0001956518088718468, "loss": 0.4894, "step": 2076 }, { "epoch": 0.28650251741499416, "grad_norm": 0.7089682817459106, "learning_rate": 0.00019564758851387649, "loss": 0.5693, "step": 2077 }, { "epoch": 0.2866404579626181, "grad_norm": 0.6970006823539734, "learning_rate": 0.0001956433661543208, "loss": 0.4493, "step": 2078 }, { "epoch": 0.28677839851024206, "grad_norm": 0.7393503785133362, "learning_rate": 0.00019563914179326818, "loss": 0.5863, "step": 2079 }, { "epoch": 0.28691633905786607, "grad_norm": 0.6624215841293335, "learning_rate": 0.00019563491543080698, "loss": 0.3739, "step": 2080 }, { "epoch": 0.28705427960549, "grad_norm": 0.7205662727355957, "learning_rate": 0.0001956306870670257, "loss": 0.5124, "step": 2081 }, { "epoch": 0.287192220153114, "grad_norm": 1.1564881801605225, "learning_rate": 0.00019562645670201276, "loss": 1.0517, "step": 2082 }, { "epoch": 0.28733016070073797, "grad_norm": 0.7639877796173096, "learning_rate": 0.00019562222433585673, "loss": 0.9036, "step": 2083 }, { "epoch": 0.287468101248362, "grad_norm": 0.6498881578445435, "learning_rate": 0.00019561798996864618, "loss": 0.618, "step": 2084 }, { "epoch": 0.2876060417959859, "grad_norm": 0.7746434807777405, "learning_rate": 0.0001956137536004697, "loss": 0.5444, "step": 2085 }, { "epoch": 0.2877439823436099, "grad_norm": 1.1528464555740356, "learning_rate": 0.00019560951523141595, "loss": 0.4188, "step": 2086 }, { "epoch": 0.2878819228912339, "grad_norm": 0.6776193976402283, "learning_rate": 0.00019560527486157364, "loss": 0.4812, "step": 2087 }, { "epoch": 0.2880198634388578, "grad_norm": 1.0938503742218018, "learning_rate": 0.00019560103249103148, "loss": 0.8737, "step": 2088 }, { "epoch": 0.28815780398648183, "grad_norm": 0.6782721281051636, "learning_rate": 0.00019559678811987828, "loss": 0.5982, "step": 2089 }, { "epoch": 0.2882957445341058, "grad_norm": 0.6858242154121399, "learning_rate": 0.00019559254174820282, "loss": 0.6636, "step": 2090 }, { "epoch": 0.2884336850817298, "grad_norm": 0.8259555697441101, "learning_rate": 0.00019558829337609402, "loss": 0.4079, "step": 2091 }, { "epoch": 0.28857162562935373, "grad_norm": 0.8771445155143738, "learning_rate": 0.00019558404300364072, "loss": 0.6069, "step": 2092 }, { "epoch": 0.28870956617697774, "grad_norm": 0.5591592192649841, "learning_rate": 0.00019557979063093188, "loss": 0.373, "step": 2093 }, { "epoch": 0.2888475067246017, "grad_norm": 0.7256616353988647, "learning_rate": 0.00019557553625805657, "loss": 0.6074, "step": 2094 }, { "epoch": 0.2889854472722257, "grad_norm": 0.646175742149353, "learning_rate": 0.00019557127988510372, "loss": 0.6554, "step": 2095 }, { "epoch": 0.28912338781984964, "grad_norm": 0.5466925501823425, "learning_rate": 0.00019556702151216242, "loss": 0.4869, "step": 2096 }, { "epoch": 0.2892613283674736, "grad_norm": 0.8264899253845215, "learning_rate": 0.00019556276113932183, "loss": 0.5827, "step": 2097 }, { "epoch": 0.2893992689150976, "grad_norm": 0.7389553189277649, "learning_rate": 0.00019555849876667103, "loss": 0.3154, "step": 2098 }, { "epoch": 0.28953720946272155, "grad_norm": 0.6903903484344482, "learning_rate": 0.0001955542343942993, "loss": 0.7065, "step": 2099 }, { "epoch": 0.28967515001034555, "grad_norm": 1.1037869453430176, "learning_rate": 0.00019554996802229583, "loss": 0.7192, "step": 2100 }, { "epoch": 0.2898130905579695, "grad_norm": 1.640199065208435, "learning_rate": 0.00019554569965074992, "loss": 0.553, "step": 2101 }, { "epoch": 0.2899510311055935, "grad_norm": 0.6174784302711487, "learning_rate": 0.00019554142927975088, "loss": 0.5931, "step": 2102 }, { "epoch": 0.29008897165321745, "grad_norm": 0.9308575987815857, "learning_rate": 0.0001955371569093881, "loss": 0.5784, "step": 2103 }, { "epoch": 0.29022691220084146, "grad_norm": 1.0464619398117065, "learning_rate": 0.00019553288253975094, "loss": 0.6852, "step": 2104 }, { "epoch": 0.2903648527484654, "grad_norm": 0.8496968746185303, "learning_rate": 0.00019552860617092887, "loss": 0.7255, "step": 2105 }, { "epoch": 0.2905027932960894, "grad_norm": 0.7223439812660217, "learning_rate": 0.00019552432780301139, "loss": 0.6047, "step": 2106 }, { "epoch": 0.29064073384371336, "grad_norm": 0.7429842948913574, "learning_rate": 0.00019552004743608804, "loss": 0.4424, "step": 2107 }, { "epoch": 0.2907786743913373, "grad_norm": 0.6475143432617188, "learning_rate": 0.0001955157650702484, "loss": 0.4936, "step": 2108 }, { "epoch": 0.2909166149389613, "grad_norm": 0.744999885559082, "learning_rate": 0.00019551148070558205, "loss": 0.7856, "step": 2109 }, { "epoch": 0.29105455548658526, "grad_norm": 0.6844127178192139, "learning_rate": 0.00019550719434217865, "loss": 0.6373, "step": 2110 }, { "epoch": 0.29119249603420927, "grad_norm": 0.7517552375793457, "learning_rate": 0.00019550290598012793, "loss": 0.4843, "step": 2111 }, { "epoch": 0.2913304365818332, "grad_norm": 0.7101173996925354, "learning_rate": 0.00019549861561951959, "loss": 0.3913, "step": 2112 }, { "epoch": 0.2914683771294572, "grad_norm": 0.721660852432251, "learning_rate": 0.00019549432326044345, "loss": 0.302, "step": 2113 }, { "epoch": 0.2916063176770812, "grad_norm": 0.6831248998641968, "learning_rate": 0.00019549002890298934, "loss": 0.5832, "step": 2114 }, { "epoch": 0.2917442582247052, "grad_norm": 0.6875988245010376, "learning_rate": 0.00019548573254724708, "loss": 0.3342, "step": 2115 }, { "epoch": 0.2918821987723291, "grad_norm": 0.5807436108589172, "learning_rate": 0.00019548143419330661, "loss": 0.7861, "step": 2116 }, { "epoch": 0.2920201393199531, "grad_norm": 1.089648962020874, "learning_rate": 0.0001954771338412579, "loss": 0.6961, "step": 2117 }, { "epoch": 0.2921580798675771, "grad_norm": 0.7699464559555054, "learning_rate": 0.00019547283149119092, "loss": 0.4305, "step": 2118 }, { "epoch": 0.29229602041520103, "grad_norm": 0.7928016781806946, "learning_rate": 0.0001954685271431957, "loss": 0.6308, "step": 2119 }, { "epoch": 0.29243396096282503, "grad_norm": 0.9115915298461914, "learning_rate": 0.0001954642207973623, "loss": 0.6821, "step": 2120 }, { "epoch": 0.292571901510449, "grad_norm": 0.8448048233985901, "learning_rate": 0.00019545991245378087, "loss": 0.7006, "step": 2121 }, { "epoch": 0.292709842058073, "grad_norm": 0.7800453305244446, "learning_rate": 0.00019545560211254155, "loss": 0.5068, "step": 2122 }, { "epoch": 0.29284778260569694, "grad_norm": 0.8466667532920837, "learning_rate": 0.00019545128977373454, "loss": 0.8495, "step": 2123 }, { "epoch": 0.29298572315332094, "grad_norm": 0.5569940805435181, "learning_rate": 0.00019544697543745013, "loss": 0.4394, "step": 2124 }, { "epoch": 0.2931236637009449, "grad_norm": 1.0552653074264526, "learning_rate": 0.0001954426591037785, "loss": 0.932, "step": 2125 }, { "epoch": 0.29326160424856884, "grad_norm": 0.8036336898803711, "learning_rate": 0.00019543834077281007, "loss": 0.5089, "step": 2126 }, { "epoch": 0.29339954479619285, "grad_norm": 0.9038389325141907, "learning_rate": 0.00019543402044463521, "loss": 0.4724, "step": 2127 }, { "epoch": 0.2935374853438168, "grad_norm": 0.5815970301628113, "learning_rate": 0.00019542969811934426, "loss": 0.3765, "step": 2128 }, { "epoch": 0.2936754258914408, "grad_norm": 0.8801495432853699, "learning_rate": 0.00019542537379702772, "loss": 0.6568, "step": 2129 }, { "epoch": 0.29381336643906475, "grad_norm": 0.9125809073448181, "learning_rate": 0.0001954210474777761, "loss": 1.0139, "step": 2130 }, { "epoch": 0.29395130698668875, "grad_norm": 0.6941331624984741, "learning_rate": 0.00019541671916167987, "loss": 0.5135, "step": 2131 }, { "epoch": 0.2940892475343127, "grad_norm": 0.721238911151886, "learning_rate": 0.00019541238884882966, "loss": 0.3743, "step": 2132 }, { "epoch": 0.2942271880819367, "grad_norm": 1.1131097078323364, "learning_rate": 0.00019540805653931609, "loss": 0.8024, "step": 2133 }, { "epoch": 0.29436512862956066, "grad_norm": 0.6035623550415039, "learning_rate": 0.0001954037222332298, "loss": 0.4709, "step": 2134 }, { "epoch": 0.2945030691771846, "grad_norm": 0.6855323314666748, "learning_rate": 0.0001953993859306615, "loss": 0.4904, "step": 2135 }, { "epoch": 0.2946410097248086, "grad_norm": 0.7864904403686523, "learning_rate": 0.00019539504763170192, "loss": 0.73, "step": 2136 }, { "epoch": 0.29477895027243256, "grad_norm": 1.1502920389175415, "learning_rate": 0.0001953907073364419, "loss": 1.1485, "step": 2137 }, { "epoch": 0.29491689082005657, "grad_norm": 0.8464686870574951, "learning_rate": 0.0001953863650449722, "loss": 0.6272, "step": 2138 }, { "epoch": 0.2950548313676805, "grad_norm": 0.645497739315033, "learning_rate": 0.00019538202075738373, "loss": 0.5731, "step": 2139 }, { "epoch": 0.2951927719153045, "grad_norm": 0.7950919270515442, "learning_rate": 0.00019537767447376736, "loss": 0.6039, "step": 2140 }, { "epoch": 0.29533071246292847, "grad_norm": 0.7011622190475464, "learning_rate": 0.0001953733261942141, "loss": 0.4037, "step": 2141 }, { "epoch": 0.2954686530105525, "grad_norm": 0.7136140465736389, "learning_rate": 0.0001953689759188149, "loss": 0.6339, "step": 2142 }, { "epoch": 0.2956065935581764, "grad_norm": 1.0056228637695312, "learning_rate": 0.0001953646236476608, "loss": 0.5475, "step": 2143 }, { "epoch": 0.2957445341058004, "grad_norm": 0.8086057901382446, "learning_rate": 0.00019536026938084296, "loss": 0.6254, "step": 2144 }, { "epoch": 0.2958824746534244, "grad_norm": 0.8735644817352295, "learning_rate": 0.00019535591311845235, "loss": 0.5957, "step": 2145 }, { "epoch": 0.2960204152010483, "grad_norm": 0.6294654607772827, "learning_rate": 0.00019535155486058027, "loss": 0.4358, "step": 2146 }, { "epoch": 0.29615835574867233, "grad_norm": 0.6147223114967346, "learning_rate": 0.00019534719460731785, "loss": 0.5106, "step": 2147 }, { "epoch": 0.2962962962962963, "grad_norm": 0.6865537166595459, "learning_rate": 0.00019534283235875637, "loss": 0.6796, "step": 2148 }, { "epoch": 0.2964342368439203, "grad_norm": 0.8193590641021729, "learning_rate": 0.0001953384681149871, "loss": 0.6479, "step": 2149 }, { "epoch": 0.29657217739154423, "grad_norm": 0.8016851544380188, "learning_rate": 0.00019533410187610138, "loss": 0.8757, "step": 2150 }, { "epoch": 0.29671011793916824, "grad_norm": 0.8192347288131714, "learning_rate": 0.00019532973364219054, "loss": 0.8549, "step": 2151 }, { "epoch": 0.2968480584867922, "grad_norm": 1.2745975255966187, "learning_rate": 0.000195325363413346, "loss": 0.8386, "step": 2152 }, { "epoch": 0.2969859990344162, "grad_norm": 0.7096378207206726, "learning_rate": 0.00019532099118965931, "loss": 0.5653, "step": 2153 }, { "epoch": 0.29712393958204014, "grad_norm": 0.5258468985557556, "learning_rate": 0.00019531661697122184, "loss": 0.3627, "step": 2154 }, { "epoch": 0.2972618801296641, "grad_norm": 0.6194223165512085, "learning_rate": 0.00019531224075812524, "loss": 0.2661, "step": 2155 }, { "epoch": 0.2973998206772881, "grad_norm": 0.760379433631897, "learning_rate": 0.000195307862550461, "loss": 0.5146, "step": 2156 }, { "epoch": 0.29753776122491205, "grad_norm": 0.6956475973129272, "learning_rate": 0.00019530348234832076, "loss": 0.8747, "step": 2157 }, { "epoch": 0.29767570177253605, "grad_norm": 0.6443366408348083, "learning_rate": 0.0001952991001517962, "loss": 0.3854, "step": 2158 }, { "epoch": 0.29781364232016, "grad_norm": 0.7563418745994568, "learning_rate": 0.00019529471596097902, "loss": 0.4861, "step": 2159 }, { "epoch": 0.297951582867784, "grad_norm": 1.3392752408981323, "learning_rate": 0.000195290329775961, "loss": 0.6267, "step": 2160 }, { "epoch": 0.29808952341540795, "grad_norm": 0.5877766013145447, "learning_rate": 0.00019528594159683385, "loss": 0.4547, "step": 2161 }, { "epoch": 0.29822746396303196, "grad_norm": 0.9212716817855835, "learning_rate": 0.00019528155142368948, "loss": 0.8183, "step": 2162 }, { "epoch": 0.2983654045106559, "grad_norm": 2.37778902053833, "learning_rate": 0.00019527715925661974, "loss": 0.9423, "step": 2163 }, { "epoch": 0.29850334505827986, "grad_norm": 0.6656827330589294, "learning_rate": 0.0001952727650957165, "loss": 0.4532, "step": 2164 }, { "epoch": 0.29864128560590386, "grad_norm": 0.9072442650794983, "learning_rate": 0.00019526836894107175, "loss": 0.8813, "step": 2165 }, { "epoch": 0.2987792261535278, "grad_norm": 1.0257484912872314, "learning_rate": 0.00019526397079277748, "loss": 0.8555, "step": 2166 }, { "epoch": 0.2989171667011518, "grad_norm": 0.8982290029525757, "learning_rate": 0.00019525957065092575, "loss": 0.5986, "step": 2167 }, { "epoch": 0.29905510724877576, "grad_norm": 0.6626170873641968, "learning_rate": 0.00019525516851560859, "loss": 0.6548, "step": 2168 }, { "epoch": 0.29919304779639977, "grad_norm": 1.0469564199447632, "learning_rate": 0.00019525076438691818, "loss": 1.2987, "step": 2169 }, { "epoch": 0.2993309883440237, "grad_norm": 0.9261853098869324, "learning_rate": 0.00019524635826494665, "loss": 0.7951, "step": 2170 }, { "epoch": 0.2994689288916477, "grad_norm": 0.6855120062828064, "learning_rate": 0.00019524195014978624, "loss": 0.5499, "step": 2171 }, { "epoch": 0.2996068694392717, "grad_norm": 0.8790969252586365, "learning_rate": 0.00019523754004152912, "loss": 0.6992, "step": 2172 }, { "epoch": 0.2997448099868956, "grad_norm": 0.8488597869873047, "learning_rate": 0.00019523312794026768, "loss": 0.6438, "step": 2173 }, { "epoch": 0.2998827505345196, "grad_norm": 1.2970049381256104, "learning_rate": 0.00019522871384609417, "loss": 0.7046, "step": 2174 }, { "epoch": 0.3000206910821436, "grad_norm": 1.1517971754074097, "learning_rate": 0.000195224297759101, "loss": 0.8484, "step": 2175 }, { "epoch": 0.3001586316297676, "grad_norm": 1.1928848028182983, "learning_rate": 0.00019521987967938058, "loss": 0.7795, "step": 2176 }, { "epoch": 0.30029657217739153, "grad_norm": 0.7839152216911316, "learning_rate": 0.00019521545960702534, "loss": 0.337, "step": 2177 }, { "epoch": 0.30043451272501553, "grad_norm": 0.6438336372375488, "learning_rate": 0.0001952110375421278, "loss": 0.5161, "step": 2178 }, { "epoch": 0.3005724532726395, "grad_norm": 0.6859175562858582, "learning_rate": 0.00019520661348478054, "loss": 0.4835, "step": 2179 }, { "epoch": 0.3007103938202635, "grad_norm": 0.6152809858322144, "learning_rate": 0.00019520218743507606, "loss": 0.394, "step": 2180 }, { "epoch": 0.30084833436788744, "grad_norm": 0.6782438158988953, "learning_rate": 0.00019519775939310705, "loss": 0.6891, "step": 2181 }, { "epoch": 0.3009862749155114, "grad_norm": 0.9672862887382507, "learning_rate": 0.00019519332935896613, "loss": 0.6395, "step": 2182 }, { "epoch": 0.3011242154631354, "grad_norm": 0.6512202620506287, "learning_rate": 0.000195188897332746, "loss": 0.4909, "step": 2183 }, { "epoch": 0.30126215601075934, "grad_norm": 1.2240195274353027, "learning_rate": 0.00019518446331453948, "loss": 0.9607, "step": 2184 }, { "epoch": 0.30140009655838335, "grad_norm": 0.6230162978172302, "learning_rate": 0.00019518002730443927, "loss": 0.4991, "step": 2185 }, { "epoch": 0.3015380371060073, "grad_norm": 1.0958621501922607, "learning_rate": 0.00019517558930253826, "loss": 0.6996, "step": 2186 }, { "epoch": 0.3016759776536313, "grad_norm": 0.8363164067268372, "learning_rate": 0.00019517114930892927, "loss": 0.8149, "step": 2187 }, { "epoch": 0.30181391820125525, "grad_norm": 0.7095656394958496, "learning_rate": 0.00019516670732370528, "loss": 0.4541, "step": 2188 }, { "epoch": 0.30195185874887925, "grad_norm": 0.9975584149360657, "learning_rate": 0.0001951622633469592, "loss": 0.7558, "step": 2189 }, { "epoch": 0.3020897992965032, "grad_norm": 0.5672247409820557, "learning_rate": 0.00019515781737878402, "loss": 0.2655, "step": 2190 }, { "epoch": 0.3022277398441272, "grad_norm": 0.6647024154663086, "learning_rate": 0.00019515336941927283, "loss": 0.5032, "step": 2191 }, { "epoch": 0.30236568039175116, "grad_norm": 0.7022714614868164, "learning_rate": 0.00019514891946851868, "loss": 0.5562, "step": 2192 }, { "epoch": 0.3025036209393751, "grad_norm": 0.6081823706626892, "learning_rate": 0.00019514446752661466, "loss": 0.4645, "step": 2193 }, { "epoch": 0.3026415614869991, "grad_norm": 0.6334623098373413, "learning_rate": 0.00019514001359365399, "loss": 0.6747, "step": 2194 }, { "epoch": 0.30277950203462306, "grad_norm": 0.6264985799789429, "learning_rate": 0.00019513555766972987, "loss": 0.5149, "step": 2195 }, { "epoch": 0.30291744258224707, "grad_norm": 0.784883975982666, "learning_rate": 0.0001951310997549355, "loss": 0.7089, "step": 2196 }, { "epoch": 0.303055383129871, "grad_norm": 0.5658442974090576, "learning_rate": 0.00019512663984936422, "loss": 0.4461, "step": 2197 }, { "epoch": 0.303193323677495, "grad_norm": 0.6339519023895264, "learning_rate": 0.00019512217795310933, "loss": 0.5541, "step": 2198 }, { "epoch": 0.30333126422511897, "grad_norm": 0.8487290740013123, "learning_rate": 0.0001951177140662642, "loss": 0.7045, "step": 2199 }, { "epoch": 0.303469204772743, "grad_norm": 0.5900312662124634, "learning_rate": 0.00019511324818892228, "loss": 0.42, "step": 2200 }, { "epoch": 0.303469204772743, "eval_loss": 0.661845326423645, "eval_runtime": 23.5093, "eval_samples_per_second": 2.51, "eval_steps_per_second": 2.51, "step": 2200 }, { "epoch": 0.3036071453203669, "grad_norm": 0.7370956540107727, "learning_rate": 0.000195108780321177, "loss": 0.4832, "step": 2201 }, { "epoch": 0.30374508586799087, "grad_norm": 0.6172298192977905, "learning_rate": 0.00019510431046312185, "loss": 0.5685, "step": 2202 }, { "epoch": 0.3038830264156149, "grad_norm": 0.4689820408821106, "learning_rate": 0.0001950998386148504, "loss": 0.3635, "step": 2203 }, { "epoch": 0.3040209669632388, "grad_norm": 0.8951042294502258, "learning_rate": 0.00019509536477645617, "loss": 0.8364, "step": 2204 }, { "epoch": 0.30415890751086283, "grad_norm": 0.6719712018966675, "learning_rate": 0.00019509088894803286, "loss": 0.2531, "step": 2205 }, { "epoch": 0.3042968480584868, "grad_norm": 0.730803370475769, "learning_rate": 0.00019508641112967408, "loss": 0.5159, "step": 2206 }, { "epoch": 0.3044347886061108, "grad_norm": 0.7379736304283142, "learning_rate": 0.0001950819313214736, "loss": 0.4162, "step": 2207 }, { "epoch": 0.30457272915373473, "grad_norm": 0.8285558223724365, "learning_rate": 0.00019507744952352508, "loss": 0.7966, "step": 2208 }, { "epoch": 0.30471066970135874, "grad_norm": 0.8864738941192627, "learning_rate": 0.00019507296573592235, "loss": 0.7326, "step": 2209 }, { "epoch": 0.3048486102489827, "grad_norm": 0.7778903841972351, "learning_rate": 0.00019506847995875924, "loss": 0.4939, "step": 2210 }, { "epoch": 0.30498655079660664, "grad_norm": 0.7180725932121277, "learning_rate": 0.00019506399219212966, "loss": 0.5479, "step": 2211 }, { "epoch": 0.30512449134423064, "grad_norm": 0.9998635053634644, "learning_rate": 0.00019505950243612746, "loss": 1.02, "step": 2212 }, { "epoch": 0.3052624318918546, "grad_norm": 1.4325881004333496, "learning_rate": 0.00019505501069084659, "loss": 0.6919, "step": 2213 }, { "epoch": 0.3054003724394786, "grad_norm": 0.7900728583335876, "learning_rate": 0.00019505051695638113, "loss": 0.3652, "step": 2214 }, { "epoch": 0.30553831298710254, "grad_norm": 0.8904551863670349, "learning_rate": 0.00019504602123282508, "loss": 0.8051, "step": 2215 }, { "epoch": 0.30567625353472655, "grad_norm": 0.5742565989494324, "learning_rate": 0.00019504152352027245, "loss": 0.3562, "step": 2216 }, { "epoch": 0.3058141940823505, "grad_norm": 0.8754223585128784, "learning_rate": 0.00019503702381881745, "loss": 0.7154, "step": 2217 }, { "epoch": 0.3059521346299745, "grad_norm": 0.834255576133728, "learning_rate": 0.00019503252212855422, "loss": 0.8241, "step": 2218 }, { "epoch": 0.30609007517759845, "grad_norm": 0.8959856033325195, "learning_rate": 0.00019502801844957697, "loss": 1.1416, "step": 2219 }, { "epoch": 0.3062280157252224, "grad_norm": 0.76212078332901, "learning_rate": 0.00019502351278197994, "loss": 0.5501, "step": 2220 }, { "epoch": 0.3063659562728464, "grad_norm": 1.0702933073043823, "learning_rate": 0.0001950190051258574, "loss": 0.5158, "step": 2221 }, { "epoch": 0.30650389682047036, "grad_norm": 0.9771005511283875, "learning_rate": 0.00019501449548130372, "loss": 0.6492, "step": 2222 }, { "epoch": 0.30664183736809436, "grad_norm": 0.6449692845344543, "learning_rate": 0.00019500998384841322, "loss": 0.581, "step": 2223 }, { "epoch": 0.3067797779157183, "grad_norm": 0.6486768126487732, "learning_rate": 0.00019500547022728034, "loss": 0.6896, "step": 2224 }, { "epoch": 0.3069177184633423, "grad_norm": 0.570933997631073, "learning_rate": 0.00019500095461799955, "loss": 0.4472, "step": 2225 }, { "epoch": 0.30705565901096626, "grad_norm": 0.6124463081359863, "learning_rate": 0.00019499643702066536, "loss": 0.49, "step": 2226 }, { "epoch": 0.30719359955859027, "grad_norm": 1.030892014503479, "learning_rate": 0.00019499191743537224, "loss": 0.6116, "step": 2227 }, { "epoch": 0.3073315401062142, "grad_norm": 0.7422316670417786, "learning_rate": 0.00019498739586221482, "loss": 0.4349, "step": 2228 }, { "epoch": 0.3074694806538382, "grad_norm": 1.2078644037246704, "learning_rate": 0.00019498287230128775, "loss": 0.8739, "step": 2229 }, { "epoch": 0.3076074212014622, "grad_norm": 0.6796876788139343, "learning_rate": 0.0001949783467526856, "loss": 0.4402, "step": 2230 }, { "epoch": 0.3077453617490861, "grad_norm": 0.9108544588088989, "learning_rate": 0.00019497381921650318, "loss": 0.8838, "step": 2231 }, { "epoch": 0.3078833022967101, "grad_norm": 0.9964629411697388, "learning_rate": 0.00019496928969283517, "loss": 0.7255, "step": 2232 }, { "epoch": 0.3080212428443341, "grad_norm": 1.5495188236236572, "learning_rate": 0.00019496475818177634, "loss": 1.264, "step": 2233 }, { "epoch": 0.3081591833919581, "grad_norm": 0.8140445351600647, "learning_rate": 0.0001949602246834216, "loss": 0.9636, "step": 2234 }, { "epoch": 0.30829712393958203, "grad_norm": 0.6906377077102661, "learning_rate": 0.0001949556891978658, "loss": 0.553, "step": 2235 }, { "epoch": 0.30843506448720603, "grad_norm": 0.8340548872947693, "learning_rate": 0.00019495115172520378, "loss": 0.5792, "step": 2236 }, { "epoch": 0.30857300503483, "grad_norm": 1.0296357870101929, "learning_rate": 0.00019494661226553055, "loss": 0.971, "step": 2237 }, { "epoch": 0.308710945582454, "grad_norm": 0.7610672116279602, "learning_rate": 0.0001949420708189411, "loss": 0.5375, "step": 2238 }, { "epoch": 0.30884888613007794, "grad_norm": 0.722172200679779, "learning_rate": 0.00019493752738553046, "loss": 0.479, "step": 2239 }, { "epoch": 0.3089868266777019, "grad_norm": 0.8141410946846008, "learning_rate": 0.00019493298196539375, "loss": 0.8384, "step": 2240 }, { "epoch": 0.3091247672253259, "grad_norm": 0.7743800282478333, "learning_rate": 0.000194928434558626, "loss": 0.9943, "step": 2241 }, { "epoch": 0.30926270777294984, "grad_norm": 0.6680206656455994, "learning_rate": 0.00019492388516532247, "loss": 0.4103, "step": 2242 }, { "epoch": 0.30940064832057385, "grad_norm": 0.9488325715065002, "learning_rate": 0.0001949193337855783, "loss": 0.8465, "step": 2243 }, { "epoch": 0.3095385888681978, "grad_norm": 0.5857890248298645, "learning_rate": 0.00019491478041948877, "loss": 0.395, "step": 2244 }, { "epoch": 0.3096765294158218, "grad_norm": 0.5725042223930359, "learning_rate": 0.00019491022506714912, "loss": 0.3626, "step": 2245 }, { "epoch": 0.30981446996344575, "grad_norm": 0.7076693773269653, "learning_rate": 0.00019490566772865475, "loss": 0.5949, "step": 2246 }, { "epoch": 0.30995241051106975, "grad_norm": 0.8544387817382812, "learning_rate": 0.00019490110840410097, "loss": 1.0608, "step": 2247 }, { "epoch": 0.3100903510586937, "grad_norm": 0.832599937915802, "learning_rate": 0.00019489654709358323, "loss": 0.807, "step": 2248 }, { "epoch": 0.31022829160631765, "grad_norm": 1.0049424171447754, "learning_rate": 0.00019489198379719696, "loss": 0.794, "step": 2249 }, { "epoch": 0.31036623215394166, "grad_norm": 0.6564392447471619, "learning_rate": 0.00019488741851503765, "loss": 0.5557, "step": 2250 }, { "epoch": 0.3105041727015656, "grad_norm": 0.5619440078735352, "learning_rate": 0.00019488285124720086, "loss": 0.4077, "step": 2251 }, { "epoch": 0.3106421132491896, "grad_norm": 0.5860351920127869, "learning_rate": 0.00019487828199378214, "loss": 0.4018, "step": 2252 }, { "epoch": 0.31078005379681356, "grad_norm": 0.7864125370979309, "learning_rate": 0.00019487371075487713, "loss": 0.6525, "step": 2253 }, { "epoch": 0.31091799434443756, "grad_norm": 0.6421269178390503, "learning_rate": 0.00019486913753058148, "loss": 0.4446, "step": 2254 }, { "epoch": 0.3110559348920615, "grad_norm": 1.2416633367538452, "learning_rate": 0.0001948645623209909, "loss": 0.5695, "step": 2255 }, { "epoch": 0.3111938754396855, "grad_norm": 1.3990689516067505, "learning_rate": 0.00019485998512620113, "loss": 0.8486, "step": 2256 }, { "epoch": 0.31133181598730947, "grad_norm": 0.8644762635231018, "learning_rate": 0.00019485540594630794, "loss": 0.5197, "step": 2257 }, { "epoch": 0.3114697565349334, "grad_norm": 0.7197523713111877, "learning_rate": 0.0001948508247814072, "loss": 0.4854, "step": 2258 }, { "epoch": 0.3116076970825574, "grad_norm": 0.7777307033538818, "learning_rate": 0.00019484624163159474, "loss": 0.8011, "step": 2259 }, { "epoch": 0.31174563763018137, "grad_norm": 3.498762369155884, "learning_rate": 0.00019484165649696648, "loss": 1.2415, "step": 2260 }, { "epoch": 0.3118835781778054, "grad_norm": 0.8177916407585144, "learning_rate": 0.00019483706937761837, "loss": 0.6254, "step": 2261 }, { "epoch": 0.3120215187254293, "grad_norm": 0.8077528476715088, "learning_rate": 0.0001948324802736464, "loss": 1.1841, "step": 2262 }, { "epoch": 0.31215945927305333, "grad_norm": 0.7529622316360474, "learning_rate": 0.00019482788918514664, "loss": 0.5046, "step": 2263 }, { "epoch": 0.3122973998206773, "grad_norm": 0.6038236618041992, "learning_rate": 0.0001948232961122151, "loss": 0.5598, "step": 2264 }, { "epoch": 0.3124353403683013, "grad_norm": 0.6496687531471252, "learning_rate": 0.00019481870105494796, "loss": 0.3127, "step": 2265 }, { "epoch": 0.31257328091592523, "grad_norm": 0.8372655510902405, "learning_rate": 0.00019481410401344133, "loss": 0.7623, "step": 2266 }, { "epoch": 0.3127112214635492, "grad_norm": 0.9408671855926514, "learning_rate": 0.00019480950498779144, "loss": 0.913, "step": 2267 }, { "epoch": 0.3128491620111732, "grad_norm": 1.2297847270965576, "learning_rate": 0.00019480490397809456, "loss": 0.7727, "step": 2268 }, { "epoch": 0.31298710255879714, "grad_norm": 0.8657265305519104, "learning_rate": 0.0001948003009844469, "loss": 0.7712, "step": 2269 }, { "epoch": 0.31312504310642114, "grad_norm": 0.6789664030075073, "learning_rate": 0.00019479569600694486, "loss": 0.5377, "step": 2270 }, { "epoch": 0.3132629836540451, "grad_norm": 0.8153241872787476, "learning_rate": 0.00019479108904568474, "loss": 0.438, "step": 2271 }, { "epoch": 0.3134009242016691, "grad_norm": 0.820363461971283, "learning_rate": 0.00019478648010076298, "loss": 0.5774, "step": 2272 }, { "epoch": 0.31353886474929304, "grad_norm": 0.9345502853393555, "learning_rate": 0.00019478186917227605, "loss": 0.7403, "step": 2273 }, { "epoch": 0.31367680529691705, "grad_norm": 0.6386396884918213, "learning_rate": 0.00019477725626032043, "loss": 0.5016, "step": 2274 }, { "epoch": 0.313814745844541, "grad_norm": 1.081990122795105, "learning_rate": 0.00019477264136499262, "loss": 0.7868, "step": 2275 }, { "epoch": 0.313952686392165, "grad_norm": 0.7201882600784302, "learning_rate": 0.00019476802448638924, "loss": 0.488, "step": 2276 }, { "epoch": 0.31409062693978895, "grad_norm": 0.7955479621887207, "learning_rate": 0.00019476340562460688, "loss": 0.7676, "step": 2277 }, { "epoch": 0.3142285674874129, "grad_norm": 0.731919527053833, "learning_rate": 0.0001947587847797422, "loss": 0.579, "step": 2278 }, { "epoch": 0.3143665080350369, "grad_norm": 1.8228474855422974, "learning_rate": 0.00019475416195189192, "loss": 0.8461, "step": 2279 }, { "epoch": 0.31450444858266086, "grad_norm": 0.5661347508430481, "learning_rate": 0.00019474953714115274, "loss": 0.3593, "step": 2280 }, { "epoch": 0.31464238913028486, "grad_norm": 0.747999370098114, "learning_rate": 0.00019474491034762145, "loss": 0.6878, "step": 2281 }, { "epoch": 0.3147803296779088, "grad_norm": 0.9928996562957764, "learning_rate": 0.0001947402815713949, "loss": 0.8761, "step": 2282 }, { "epoch": 0.3149182702255328, "grad_norm": 0.7003133893013, "learning_rate": 0.00019473565081256996, "loss": 0.4855, "step": 2283 }, { "epoch": 0.31505621077315676, "grad_norm": 0.6472734808921814, "learning_rate": 0.00019473101807124352, "loss": 0.511, "step": 2284 }, { "epoch": 0.31519415132078077, "grad_norm": 0.723513662815094, "learning_rate": 0.0001947263833475125, "loss": 0.4892, "step": 2285 }, { "epoch": 0.3153320918684047, "grad_norm": 1.6176047325134277, "learning_rate": 0.00019472174664147393, "loss": 0.5581, "step": 2286 }, { "epoch": 0.31547003241602867, "grad_norm": 0.9376205801963806, "learning_rate": 0.00019471710795322485, "loss": 1.091, "step": 2287 }, { "epoch": 0.31560797296365267, "grad_norm": 1.0848584175109863, "learning_rate": 0.00019471246728286227, "loss": 0.6718, "step": 2288 }, { "epoch": 0.3157459135112766, "grad_norm": 1.0394634008407593, "learning_rate": 0.00019470782463048336, "loss": 0.4477, "step": 2289 }, { "epoch": 0.3158838540589006, "grad_norm": 0.8964745998382568, "learning_rate": 0.00019470317999618523, "loss": 0.4769, "step": 2290 }, { "epoch": 0.3160217946065246, "grad_norm": 0.6246095299720764, "learning_rate": 0.00019469853338006514, "loss": 0.2479, "step": 2291 }, { "epoch": 0.3161597351541486, "grad_norm": 0.878368079662323, "learning_rate": 0.0001946938847822203, "loss": 0.3964, "step": 2292 }, { "epoch": 0.31629767570177253, "grad_norm": 0.6446416974067688, "learning_rate": 0.00019468923420274797, "loss": 0.6782, "step": 2293 }, { "epoch": 0.31643561624939653, "grad_norm": 0.8462199568748474, "learning_rate": 0.0001946845816417455, "loss": 0.6378, "step": 2294 }, { "epoch": 0.3165735567970205, "grad_norm": 0.7193346619606018, "learning_rate": 0.00019467992709931017, "loss": 0.5933, "step": 2295 }, { "epoch": 0.31671149734464443, "grad_norm": 1.4028959274291992, "learning_rate": 0.00019467527057553952, "loss": 1.1746, "step": 2296 }, { "epoch": 0.31684943789226844, "grad_norm": 0.8412365913391113, "learning_rate": 0.00019467061207053087, "loss": 0.5632, "step": 2297 }, { "epoch": 0.3169873784398924, "grad_norm": 0.6352449655532837, "learning_rate": 0.0001946659515843818, "loss": 0.4559, "step": 2298 }, { "epoch": 0.3171253189875164, "grad_norm": 0.48701727390289307, "learning_rate": 0.00019466128911718982, "loss": 0.2398, "step": 2299 }, { "epoch": 0.31726325953514034, "grad_norm": 0.5449528098106384, "learning_rate": 0.00019465662466905243, "loss": 0.6206, "step": 2300 }, { "epoch": 0.31740120008276435, "grad_norm": 1.2383208274841309, "learning_rate": 0.00019465195824006732, "loss": 0.9354, "step": 2301 }, { "epoch": 0.3175391406303883, "grad_norm": 0.9451349377632141, "learning_rate": 0.00019464728983033212, "loss": 0.9349, "step": 2302 }, { "epoch": 0.3176770811780123, "grad_norm": 0.7076907753944397, "learning_rate": 0.0001946426194399445, "loss": 0.667, "step": 2303 }, { "epoch": 0.31781502172563625, "grad_norm": 0.6356270909309387, "learning_rate": 0.00019463794706900224, "loss": 0.2469, "step": 2304 }, { "epoch": 0.3179529622732602, "grad_norm": 0.8059444427490234, "learning_rate": 0.00019463327271760308, "loss": 0.6322, "step": 2305 }, { "epoch": 0.3180909028208842, "grad_norm": 0.7126657366752625, "learning_rate": 0.00019462859638584484, "loss": 0.4607, "step": 2306 }, { "epoch": 0.31822884336850815, "grad_norm": 1.20512855052948, "learning_rate": 0.0001946239180738254, "loss": 0.7065, "step": 2307 }, { "epoch": 0.31836678391613216, "grad_norm": 1.0039737224578857, "learning_rate": 0.00019461923778164267, "loss": 0.7817, "step": 2308 }, { "epoch": 0.3185047244637561, "grad_norm": 0.8472278118133545, "learning_rate": 0.00019461455550939455, "loss": 0.7392, "step": 2309 }, { "epoch": 0.3186426650113801, "grad_norm": 0.8026204109191895, "learning_rate": 0.00019460987125717905, "loss": 0.6547, "step": 2310 }, { "epoch": 0.31878060555900406, "grad_norm": 0.985788881778717, "learning_rate": 0.00019460518502509422, "loss": 0.3619, "step": 2311 }, { "epoch": 0.31891854610662806, "grad_norm": 0.913837194442749, "learning_rate": 0.00019460049681323808, "loss": 0.8376, "step": 2312 }, { "epoch": 0.319056486654252, "grad_norm": 0.6265845894813538, "learning_rate": 0.0001945958066217088, "loss": 0.579, "step": 2313 }, { "epoch": 0.319194427201876, "grad_norm": 0.9424504637718201, "learning_rate": 0.00019459111445060444, "loss": 0.5184, "step": 2314 }, { "epoch": 0.31933236774949997, "grad_norm": 0.5835946202278137, "learning_rate": 0.00019458642030002326, "loss": 0.4495, "step": 2315 }, { "epoch": 0.3194703082971239, "grad_norm": 0.7594127058982849, "learning_rate": 0.00019458172417006347, "loss": 0.7142, "step": 2316 }, { "epoch": 0.3196082488447479, "grad_norm": 0.6176849007606506, "learning_rate": 0.00019457702606082337, "loss": 0.3594, "step": 2317 }, { "epoch": 0.31974618939237187, "grad_norm": 1.6596888303756714, "learning_rate": 0.00019457232597240126, "loss": 0.9118, "step": 2318 }, { "epoch": 0.3198841299399959, "grad_norm": 0.8690287470817566, "learning_rate": 0.00019456762390489548, "loss": 0.566, "step": 2319 }, { "epoch": 0.3200220704876198, "grad_norm": 1.1131110191345215, "learning_rate": 0.0001945629198584044, "loss": 1.1, "step": 2320 }, { "epoch": 0.32016001103524383, "grad_norm": 0.7218566536903381, "learning_rate": 0.00019455821383302657, "loss": 0.5501, "step": 2321 }, { "epoch": 0.3202979515828678, "grad_norm": 0.5688751339912415, "learning_rate": 0.00019455350582886038, "loss": 0.5373, "step": 2322 }, { "epoch": 0.3204358921304918, "grad_norm": 1.2792819738388062, "learning_rate": 0.00019454879584600437, "loss": 0.733, "step": 2323 }, { "epoch": 0.32057383267811573, "grad_norm": 0.9383312463760376, "learning_rate": 0.0001945440838845571, "loss": 0.4367, "step": 2324 }, { "epoch": 0.3207117732257397, "grad_norm": 0.9324066042900085, "learning_rate": 0.00019453936994461718, "loss": 0.9925, "step": 2325 }, { "epoch": 0.3208497137733637, "grad_norm": 1.0629867315292358, "learning_rate": 0.0001945346540262833, "loss": 0.7296, "step": 2326 }, { "epoch": 0.32098765432098764, "grad_norm": 0.7863196730613708, "learning_rate": 0.0001945299361296541, "loss": 0.936, "step": 2327 }, { "epoch": 0.32112559486861164, "grad_norm": 0.6948659420013428, "learning_rate": 0.0001945252162548283, "loss": 0.4759, "step": 2328 }, { "epoch": 0.3212635354162356, "grad_norm": 0.908307671546936, "learning_rate": 0.00019452049440190473, "loss": 0.8042, "step": 2329 }, { "epoch": 0.3214014759638596, "grad_norm": 0.814140796661377, "learning_rate": 0.00019451577057098213, "loss": 0.7884, "step": 2330 }, { "epoch": 0.32153941651148354, "grad_norm": 0.752573549747467, "learning_rate": 0.0001945110447621594, "loss": 0.8405, "step": 2331 }, { "epoch": 0.32167735705910755, "grad_norm": 0.8677518963813782, "learning_rate": 0.00019450631697553542, "loss": 0.8891, "step": 2332 }, { "epoch": 0.3218152976067315, "grad_norm": 0.7212129831314087, "learning_rate": 0.00019450158721120916, "loss": 0.5369, "step": 2333 }, { "epoch": 0.32195323815435545, "grad_norm": 0.6805658936500549, "learning_rate": 0.00019449685546927954, "loss": 0.4181, "step": 2334 }, { "epoch": 0.32209117870197945, "grad_norm": 0.8572118878364563, "learning_rate": 0.0001944921217498456, "loss": 0.4678, "step": 2335 }, { "epoch": 0.3222291192496034, "grad_norm": 0.7739250063896179, "learning_rate": 0.00019448738605300645, "loss": 0.8138, "step": 2336 }, { "epoch": 0.3223670597972274, "grad_norm": 0.9221212863922119, "learning_rate": 0.00019448264837886113, "loss": 0.6867, "step": 2337 }, { "epoch": 0.32250500034485136, "grad_norm": 0.5943915247917175, "learning_rate": 0.0001944779087275088, "loss": 0.4269, "step": 2338 }, { "epoch": 0.32264294089247536, "grad_norm": 0.7601683735847473, "learning_rate": 0.00019447316709904865, "loss": 0.4699, "step": 2339 }, { "epoch": 0.3227808814400993, "grad_norm": 0.8653863072395325, "learning_rate": 0.0001944684234935799, "loss": 0.6408, "step": 2340 }, { "epoch": 0.3229188219877233, "grad_norm": 0.8126456141471863, "learning_rate": 0.00019446367791120186, "loss": 0.7773, "step": 2341 }, { "epoch": 0.32305676253534726, "grad_norm": 0.6638123393058777, "learning_rate": 0.00019445893035201383, "loss": 0.4854, "step": 2342 }, { "epoch": 0.3231947030829712, "grad_norm": 1.3545905351638794, "learning_rate": 0.00019445418081611506, "loss": 0.9794, "step": 2343 }, { "epoch": 0.3233326436305952, "grad_norm": 0.8681669235229492, "learning_rate": 0.00019444942930360503, "loss": 0.8998, "step": 2344 }, { "epoch": 0.32347058417821917, "grad_norm": 1.0023455619812012, "learning_rate": 0.00019444467581458322, "loss": 0.7062, "step": 2345 }, { "epoch": 0.32360852472584317, "grad_norm": 0.8101288676261902, "learning_rate": 0.00019443992034914897, "loss": 0.6581, "step": 2346 }, { "epoch": 0.3237464652734671, "grad_norm": 1.2586729526519775, "learning_rate": 0.00019443516290740194, "loss": 0.7804, "step": 2347 }, { "epoch": 0.3238844058210911, "grad_norm": 0.9507285356521606, "learning_rate": 0.00019443040348944156, "loss": 0.6049, "step": 2348 }, { "epoch": 0.3240223463687151, "grad_norm": 0.6528936624526978, "learning_rate": 0.00019442564209536754, "loss": 0.4616, "step": 2349 }, { "epoch": 0.3241602869163391, "grad_norm": 0.7113572359085083, "learning_rate": 0.00019442087872527944, "loss": 0.6116, "step": 2350 }, { "epoch": 0.32429822746396303, "grad_norm": 0.5419871807098389, "learning_rate": 0.00019441611337927696, "loss": 0.2321, "step": 2351 }, { "epoch": 0.324436168011587, "grad_norm": 0.679607629776001, "learning_rate": 0.00019441134605745986, "loss": 0.459, "step": 2352 }, { "epoch": 0.324574108559211, "grad_norm": 0.9691960215568542, "learning_rate": 0.00019440657675992787, "loss": 0.4727, "step": 2353 }, { "epoch": 0.32471204910683493, "grad_norm": 0.8125988841056824, "learning_rate": 0.0001944018054867808, "loss": 0.8017, "step": 2354 }, { "epoch": 0.32484998965445894, "grad_norm": 1.649573802947998, "learning_rate": 0.00019439703223811847, "loss": 0.829, "step": 2355 }, { "epoch": 0.3249879302020829, "grad_norm": 0.745305061340332, "learning_rate": 0.00019439225701404085, "loss": 0.4651, "step": 2356 }, { "epoch": 0.3251258707497069, "grad_norm": 0.6748473048210144, "learning_rate": 0.00019438747981464775, "loss": 0.5996, "step": 2357 }, { "epoch": 0.32526381129733084, "grad_norm": 1.0531598329544067, "learning_rate": 0.00019438270064003926, "loss": 0.9084, "step": 2358 }, { "epoch": 0.32540175184495485, "grad_norm": 0.9223348498344421, "learning_rate": 0.00019437791949031535, "loss": 0.7866, "step": 2359 }, { "epoch": 0.3255396923925788, "grad_norm": 0.7526196837425232, "learning_rate": 0.00019437313636557602, "loss": 0.4975, "step": 2360 }, { "epoch": 0.3256776329402028, "grad_norm": 1.6201496124267578, "learning_rate": 0.00019436835126592143, "loss": 0.7395, "step": 2361 }, { "epoch": 0.32581557348782675, "grad_norm": 0.7340310215950012, "learning_rate": 0.00019436356419145166, "loss": 0.6325, "step": 2362 }, { "epoch": 0.3259535140354507, "grad_norm": 1.1777743101119995, "learning_rate": 0.00019435877514226697, "loss": 0.4779, "step": 2363 }, { "epoch": 0.3260914545830747, "grad_norm": 0.9242397546768188, "learning_rate": 0.00019435398411846752, "loss": 0.4674, "step": 2364 }, { "epoch": 0.32622939513069865, "grad_norm": 0.6935853958129883, "learning_rate": 0.00019434919112015355, "loss": 0.3949, "step": 2365 }, { "epoch": 0.32636733567832266, "grad_norm": 0.7134401202201843, "learning_rate": 0.00019434439614742543, "loss": 0.5659, "step": 2366 }, { "epoch": 0.3265052762259466, "grad_norm": 0.9489606618881226, "learning_rate": 0.00019433959920038345, "loss": 0.7124, "step": 2367 }, { "epoch": 0.3266432167735706, "grad_norm": 0.6194107532501221, "learning_rate": 0.000194334800279128, "loss": 0.8171, "step": 2368 }, { "epoch": 0.32678115732119456, "grad_norm": 0.8815126419067383, "learning_rate": 0.00019432999938375953, "loss": 0.5195, "step": 2369 }, { "epoch": 0.32691909786881856, "grad_norm": 0.5797806978225708, "learning_rate": 0.0001943251965143785, "loss": 0.4952, "step": 2370 }, { "epoch": 0.3270570384164425, "grad_norm": 0.9306840300559998, "learning_rate": 0.00019432039167108537, "loss": 0.476, "step": 2371 }, { "epoch": 0.32719497896406646, "grad_norm": 0.6784822344779968, "learning_rate": 0.00019431558485398076, "loss": 0.641, "step": 2372 }, { "epoch": 0.32733291951169047, "grad_norm": 0.7142674922943115, "learning_rate": 0.00019431077606316523, "loss": 0.6712, "step": 2373 }, { "epoch": 0.3274708600593144, "grad_norm": 1.0263147354125977, "learning_rate": 0.00019430596529873938, "loss": 0.8278, "step": 2374 }, { "epoch": 0.3276088006069384, "grad_norm": 0.672478199005127, "learning_rate": 0.00019430115256080394, "loss": 0.5935, "step": 2375 }, { "epoch": 0.32774674115456237, "grad_norm": 0.9333507418632507, "learning_rate": 0.0001942963378494596, "loss": 0.664, "step": 2376 }, { "epoch": 0.3278846817021864, "grad_norm": 0.8227028250694275, "learning_rate": 0.0001942915211648071, "loss": 0.3793, "step": 2377 }, { "epoch": 0.3280226222498103, "grad_norm": 0.8363267183303833, "learning_rate": 0.00019428670250694728, "loss": 0.534, "step": 2378 }, { "epoch": 0.32816056279743433, "grad_norm": 0.6801791787147522, "learning_rate": 0.00019428188187598094, "loss": 0.5693, "step": 2379 }, { "epoch": 0.3282985033450583, "grad_norm": 0.9937869310379028, "learning_rate": 0.00019427705927200896, "loss": 0.4011, "step": 2380 }, { "epoch": 0.32843644389268223, "grad_norm": 0.7679700255393982, "learning_rate": 0.00019427223469513228, "loss": 0.4928, "step": 2381 }, { "epoch": 0.32857438444030623, "grad_norm": 1.2696233987808228, "learning_rate": 0.00019426740814545185, "loss": 0.3716, "step": 2382 }, { "epoch": 0.3287123249879302, "grad_norm": 0.816831648349762, "learning_rate": 0.00019426257962306868, "loss": 0.469, "step": 2383 }, { "epoch": 0.3288502655355542, "grad_norm": 1.172206163406372, "learning_rate": 0.0001942577491280838, "loss": 0.7011, "step": 2384 }, { "epoch": 0.32898820608317814, "grad_norm": 0.8468907475471497, "learning_rate": 0.00019425291666059832, "loss": 0.2813, "step": 2385 }, { "epoch": 0.32912614663080214, "grad_norm": 0.9245859980583191, "learning_rate": 0.00019424808222071337, "loss": 0.5006, "step": 2386 }, { "epoch": 0.3292640871784261, "grad_norm": 1.3314694166183472, "learning_rate": 0.00019424324580853006, "loss": 0.3318, "step": 2387 }, { "epoch": 0.3294020277260501, "grad_norm": 0.6868737936019897, "learning_rate": 0.00019423840742414968, "loss": 0.4828, "step": 2388 }, { "epoch": 0.32953996827367404, "grad_norm": 0.5695831775665283, "learning_rate": 0.00019423356706767343, "loss": 0.4117, "step": 2389 }, { "epoch": 0.329677908821298, "grad_norm": 0.8199607729911804, "learning_rate": 0.00019422872473920264, "loss": 0.8271, "step": 2390 }, { "epoch": 0.329815849368922, "grad_norm": 0.9360648989677429, "learning_rate": 0.0001942238804388386, "loss": 0.8707, "step": 2391 }, { "epoch": 0.32995378991654595, "grad_norm": 0.7775169610977173, "learning_rate": 0.00019421903416668273, "loss": 0.5637, "step": 2392 }, { "epoch": 0.33009173046416995, "grad_norm": 0.8939715027809143, "learning_rate": 0.0001942141859228364, "loss": 0.681, "step": 2393 }, { "epoch": 0.3302296710117939, "grad_norm": 0.7903376221656799, "learning_rate": 0.00019420933570740112, "loss": 0.6019, "step": 2394 }, { "epoch": 0.3303676115594179, "grad_norm": 0.5653364062309265, "learning_rate": 0.00019420448352047833, "loss": 0.4377, "step": 2395 }, { "epoch": 0.33050555210704186, "grad_norm": 0.6574212312698364, "learning_rate": 0.0001941996293621696, "loss": 0.3995, "step": 2396 }, { "epoch": 0.33064349265466586, "grad_norm": 0.9487119913101196, "learning_rate": 0.00019419477323257654, "loss": 0.652, "step": 2397 }, { "epoch": 0.3307814332022898, "grad_norm": 0.8530499339103699, "learning_rate": 0.0001941899151318007, "loss": 0.7224, "step": 2398 }, { "epoch": 0.3309193737499138, "grad_norm": 0.8137893676757812, "learning_rate": 0.0001941850550599438, "loss": 0.5038, "step": 2399 }, { "epoch": 0.33105731429753776, "grad_norm": 0.8479599356651306, "learning_rate": 0.00019418019301710757, "loss": 0.4543, "step": 2400 }, { "epoch": 0.33105731429753776, "eval_loss": 0.6588593125343323, "eval_runtime": 23.4746, "eval_samples_per_second": 2.513, "eval_steps_per_second": 2.513, "step": 2400 } ], "logging_steps": 1, "max_steps": 21747, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8002757108760576e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }