{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7794641184185872, "eval_steps": 500, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00029979389169945664, "grad_norm": 4.617544174194336, "learning_rate": 2.9999999999999997e-05, "loss": 1.26, "step": 1 }, { "epoch": 0.0005995877833989133, "grad_norm": 5.574990272521973, "learning_rate": 5.9999999999999995e-05, "loss": 1.2193, "step": 2 }, { "epoch": 0.0008993816750983699, "grad_norm": 2.685803174972534, "learning_rate": 8.999999999999999e-05, "loss": 1.2098, "step": 3 }, { "epoch": 0.0011991755667978266, "grad_norm": 1.2137216329574585, "learning_rate": 0.00011999999999999999, "loss": 1.0505, "step": 4 }, { "epoch": 0.001498969458497283, "grad_norm": 1.1230342388153076, "learning_rate": 0.00015, "loss": 0.8918, "step": 5 }, { "epoch": 0.0017987633501967398, "grad_norm": 0.9145472049713135, "learning_rate": 0.00017999999999999998, "loss": 0.8306, "step": 6 }, { "epoch": 0.0020985572418961962, "grad_norm": 1.5094902515411377, "learning_rate": 0.00020999999999999998, "loss": 0.788, "step": 7 }, { "epoch": 0.002398351133595653, "grad_norm": 0.5805755257606506, "learning_rate": 0.00023999999999999998, "loss": 0.8026, "step": 8 }, { "epoch": 0.0026981450252951096, "grad_norm": 0.338878333568573, "learning_rate": 0.00027, "loss": 0.723, "step": 9 }, { "epoch": 0.002997938916994566, "grad_norm": 0.34122711420059204, "learning_rate": 0.0003, "loss": 0.705, "step": 10 }, { "epoch": 0.003297732808694023, "grad_norm": 0.3165666162967682, "learning_rate": 0.0002999549549549549, "loss": 0.7282, "step": 11 }, { "epoch": 0.0035975267003934795, "grad_norm": 0.3415158689022064, "learning_rate": 0.0002999099099099099, "loss": 0.6833, "step": 12 }, { "epoch": 0.003897320592092936, "grad_norm": 0.3431508243083954, "learning_rate": 0.00029986486486486484, "loss": 0.713, "step": 13 }, { "epoch": 0.0041971144837923925, "grad_norm": 0.3494497239589691, "learning_rate": 0.0002998198198198198, "loss": 0.6822, "step": 14 }, { "epoch": 0.004496908375491849, "grad_norm": 0.29291579127311707, "learning_rate": 0.00029977477477477477, "loss": 0.6348, "step": 15 }, { "epoch": 0.004796702267191306, "grad_norm": 0.3061606287956238, "learning_rate": 0.0002997297297297297, "loss": 0.6584, "step": 16 }, { "epoch": 0.005096496158890762, "grad_norm": 0.32851532101631165, "learning_rate": 0.00029968468468468464, "loss": 0.6792, "step": 17 }, { "epoch": 0.005396290050590219, "grad_norm": 0.3087822198867798, "learning_rate": 0.00029963963963963963, "loss": 0.686, "step": 18 }, { "epoch": 0.005696083942289676, "grad_norm": 0.2965148687362671, "learning_rate": 0.00029959459459459457, "loss": 0.6549, "step": 19 }, { "epoch": 0.005995877833989132, "grad_norm": 0.32235461473464966, "learning_rate": 0.0002995495495495495, "loss": 0.6653, "step": 20 }, { "epoch": 0.006295671725688589, "grad_norm": 0.29656529426574707, "learning_rate": 0.0002995045045045045, "loss": 0.6423, "step": 21 }, { "epoch": 0.006595465617388046, "grad_norm": 0.3357860743999481, "learning_rate": 0.00029945945945945943, "loss": 0.6957, "step": 22 }, { "epoch": 0.006895259509087502, "grad_norm": 0.28682708740234375, "learning_rate": 0.00029941441441441437, "loss": 0.6475, "step": 23 }, { "epoch": 0.007195053400786959, "grad_norm": 0.29643768072128296, "learning_rate": 0.0002993693693693693, "loss": 0.6651, "step": 24 }, { "epoch": 0.007494847292486416, "grad_norm": 0.2794937789440155, "learning_rate": 0.0002993243243243243, "loss": 0.6423, "step": 25 }, { "epoch": 0.007794641184185872, "grad_norm": 0.3023228943347931, "learning_rate": 0.00029927927927927923, "loss": 0.6728, "step": 26 }, { "epoch": 0.008094435075885328, "grad_norm": 0.27937522530555725, "learning_rate": 0.00029923423423423417, "loss": 0.6538, "step": 27 }, { "epoch": 0.008394228967584785, "grad_norm": 0.2845768332481384, "learning_rate": 0.00029918918918918916, "loss": 0.6588, "step": 28 }, { "epoch": 0.008694022859284242, "grad_norm": 0.2637081742286682, "learning_rate": 0.0002991441441441441, "loss": 0.6253, "step": 29 }, { "epoch": 0.008993816750983699, "grad_norm": 0.25597694516181946, "learning_rate": 0.00029909909909909903, "loss": 0.5973, "step": 30 }, { "epoch": 0.009293610642683156, "grad_norm": 0.28932246565818787, "learning_rate": 0.000299054054054054, "loss": 0.6607, "step": 31 }, { "epoch": 0.009593404534382613, "grad_norm": 0.28410351276397705, "learning_rate": 0.00029900900900900896, "loss": 0.6496, "step": 32 }, { "epoch": 0.009893198426082068, "grad_norm": 0.2708141803741455, "learning_rate": 0.00029896396396396395, "loss": 0.6161, "step": 33 }, { "epoch": 0.010192992317781525, "grad_norm": 0.27171120047569275, "learning_rate": 0.0002989189189189189, "loss": 0.6179, "step": 34 }, { "epoch": 0.010492786209480982, "grad_norm": 0.2806681990623474, "learning_rate": 0.0002988738738738738, "loss": 0.6111, "step": 35 }, { "epoch": 0.010792580101180439, "grad_norm": 0.36722084879875183, "learning_rate": 0.0002988288288288288, "loss": 0.5778, "step": 36 }, { "epoch": 0.011092373992879895, "grad_norm": 0.26182547211647034, "learning_rate": 0.00029878378378378375, "loss": 0.5941, "step": 37 }, { "epoch": 0.011392167884579352, "grad_norm": 0.26753902435302734, "learning_rate": 0.0002987387387387387, "loss": 0.6019, "step": 38 }, { "epoch": 0.011691961776278808, "grad_norm": 0.28038090467453003, "learning_rate": 0.0002986936936936937, "loss": 0.6379, "step": 39 }, { "epoch": 0.011991755667978264, "grad_norm": 0.29290881752967834, "learning_rate": 0.0002986486486486486, "loss": 0.6743, "step": 40 }, { "epoch": 0.012291549559677721, "grad_norm": 0.28465205430984497, "learning_rate": 0.00029860360360360356, "loss": 0.6214, "step": 41 }, { "epoch": 0.012591343451377178, "grad_norm": 0.26730677485466003, "learning_rate": 0.00029855855855855855, "loss": 0.6085, "step": 42 }, { "epoch": 0.012891137343076635, "grad_norm": 0.2801668643951416, "learning_rate": 0.0002985135135135135, "loss": 0.6541, "step": 43 }, { "epoch": 0.013190931234776092, "grad_norm": 0.2741893529891968, "learning_rate": 0.0002984684684684684, "loss": 0.6102, "step": 44 }, { "epoch": 0.013490725126475547, "grad_norm": 0.26284873485565186, "learning_rate": 0.0002984234234234234, "loss": 0.5762, "step": 45 }, { "epoch": 0.013790519018175004, "grad_norm": 0.26464149355888367, "learning_rate": 0.00029837837837837835, "loss": 0.6238, "step": 46 }, { "epoch": 0.014090312909874461, "grad_norm": 0.2674684226512909, "learning_rate": 0.00029833333333333334, "loss": 0.6342, "step": 47 }, { "epoch": 0.014390106801573918, "grad_norm": 0.2591281533241272, "learning_rate": 0.0002982882882882883, "loss": 0.5993, "step": 48 }, { "epoch": 0.014689900693273375, "grad_norm": 0.29611843824386597, "learning_rate": 0.0002982432432432432, "loss": 0.6376, "step": 49 }, { "epoch": 0.014989694584972832, "grad_norm": 0.26478683948516846, "learning_rate": 0.0002981981981981982, "loss": 0.5935, "step": 50 }, { "epoch": 0.015289488476672287, "grad_norm": 0.25142884254455566, "learning_rate": 0.00029815315315315314, "loss": 0.5522, "step": 51 }, { "epoch": 0.015589282368371744, "grad_norm": 0.26863306760787964, "learning_rate": 0.0002981081081081081, "loss": 0.5808, "step": 52 }, { "epoch": 0.015889076260071203, "grad_norm": 0.26126888394355774, "learning_rate": 0.00029806306306306307, "loss": 0.6101, "step": 53 }, { "epoch": 0.016188870151770656, "grad_norm": 0.24878369271755219, "learning_rate": 0.000298018018018018, "loss": 0.5787, "step": 54 }, { "epoch": 0.016488664043470113, "grad_norm": 0.2513170838356018, "learning_rate": 0.00029797297297297294, "loss": 0.5743, "step": 55 }, { "epoch": 0.01678845793516957, "grad_norm": 0.26510271430015564, "learning_rate": 0.00029792792792792793, "loss": 0.6023, "step": 56 }, { "epoch": 0.017088251826869027, "grad_norm": 0.2809905409812927, "learning_rate": 0.00029788288288288287, "loss": 0.6336, "step": 57 }, { "epoch": 0.017388045718568484, "grad_norm": 0.250021755695343, "learning_rate": 0.0002978378378378378, "loss": 0.5385, "step": 58 }, { "epoch": 0.01768783961026794, "grad_norm": 0.248112291097641, "learning_rate": 0.0002977927927927928, "loss": 0.5633, "step": 59 }, { "epoch": 0.017987633501967398, "grad_norm": 0.2756728529930115, "learning_rate": 0.00029774774774774773, "loss": 0.5988, "step": 60 }, { "epoch": 0.018287427393666855, "grad_norm": 0.26302388310432434, "learning_rate": 0.00029770270270270267, "loss": 0.5738, "step": 61 }, { "epoch": 0.01858722128536631, "grad_norm": 0.2736607789993286, "learning_rate": 0.00029765765765765766, "loss": 0.5908, "step": 62 }, { "epoch": 0.01888701517706577, "grad_norm": 0.26531675457954407, "learning_rate": 0.0002976126126126126, "loss": 0.6112, "step": 63 }, { "epoch": 0.019186809068765225, "grad_norm": 0.252244234085083, "learning_rate": 0.00029756756756756753, "loss": 0.5921, "step": 64 }, { "epoch": 0.019486602960464682, "grad_norm": 0.26548662781715393, "learning_rate": 0.0002975225225225225, "loss": 0.604, "step": 65 }, { "epoch": 0.019786396852164136, "grad_norm": 0.2575795352458954, "learning_rate": 0.00029747747747747746, "loss": 0.564, "step": 66 }, { "epoch": 0.020086190743863593, "grad_norm": 0.25535911321640015, "learning_rate": 0.0002974324324324324, "loss": 0.5962, "step": 67 }, { "epoch": 0.02038598463556305, "grad_norm": 0.26686182618141174, "learning_rate": 0.0002973873873873874, "loss": 0.6268, "step": 68 }, { "epoch": 0.020685778527262506, "grad_norm": 0.25553348660469055, "learning_rate": 0.00029734234234234233, "loss": 0.5583, "step": 69 }, { "epoch": 0.020985572418961963, "grad_norm": 0.2537127435207367, "learning_rate": 0.00029729729729729726, "loss": 0.5734, "step": 70 }, { "epoch": 0.02128536631066142, "grad_norm": 0.27586349844932556, "learning_rate": 0.0002972522522522522, "loss": 0.5667, "step": 71 }, { "epoch": 0.021585160202360877, "grad_norm": 0.2614293694496155, "learning_rate": 0.0002972072072072072, "loss": 0.5831, "step": 72 }, { "epoch": 0.021884954094060334, "grad_norm": 0.263944536447525, "learning_rate": 0.00029716216216216213, "loss": 0.5862, "step": 73 }, { "epoch": 0.02218474798575979, "grad_norm": 0.24802720546722412, "learning_rate": 0.00029711711711711707, "loss": 0.5788, "step": 74 }, { "epoch": 0.022484541877459248, "grad_norm": 0.2746400833129883, "learning_rate": 0.00029707207207207206, "loss": 0.591, "step": 75 }, { "epoch": 0.022784335769158705, "grad_norm": 0.2553657591342926, "learning_rate": 0.000297027027027027, "loss": 0.5735, "step": 76 }, { "epoch": 0.02308412966085816, "grad_norm": 0.2704835534095764, "learning_rate": 0.00029698198198198193, "loss": 0.5809, "step": 77 }, { "epoch": 0.023383923552557615, "grad_norm": 0.2556537985801697, "learning_rate": 0.0002969369369369369, "loss": 0.5496, "step": 78 }, { "epoch": 0.023683717444257072, "grad_norm": 0.2523523271083832, "learning_rate": 0.00029689189189189186, "loss": 0.5758, "step": 79 }, { "epoch": 0.02398351133595653, "grad_norm": 0.25214141607284546, "learning_rate": 0.0002968468468468468, "loss": 0.5844, "step": 80 }, { "epoch": 0.024283305227655986, "grad_norm": 0.2590884268283844, "learning_rate": 0.0002968018018018018, "loss": 0.5862, "step": 81 }, { "epoch": 0.024583099119355443, "grad_norm": 0.24399450421333313, "learning_rate": 0.0002967567567567567, "loss": 0.5685, "step": 82 }, { "epoch": 0.0248828930110549, "grad_norm": 0.24457746744155884, "learning_rate": 0.00029671171171171166, "loss": 0.5684, "step": 83 }, { "epoch": 0.025182686902754357, "grad_norm": 0.23214662075042725, "learning_rate": 0.00029666666666666665, "loss": 0.5682, "step": 84 }, { "epoch": 0.025482480794453814, "grad_norm": 0.24058258533477783, "learning_rate": 0.0002966216216216216, "loss": 0.5688, "step": 85 }, { "epoch": 0.02578227468615327, "grad_norm": 0.2338346391916275, "learning_rate": 0.0002965765765765765, "loss": 0.5485, "step": 86 }, { "epoch": 0.026082068577852727, "grad_norm": 0.2553529143333435, "learning_rate": 0.0002965315315315315, "loss": 0.5985, "step": 87 }, { "epoch": 0.026381862469552184, "grad_norm": 0.2518393099308014, "learning_rate": 0.00029648648648648645, "loss": 0.6079, "step": 88 }, { "epoch": 0.02668165636125164, "grad_norm": 0.23418371379375458, "learning_rate": 0.0002964414414414414, "loss": 0.5942, "step": 89 }, { "epoch": 0.026981450252951095, "grad_norm": 0.2454022616147995, "learning_rate": 0.0002963963963963964, "loss": 0.5519, "step": 90 }, { "epoch": 0.02728124414465055, "grad_norm": 0.25474220514297485, "learning_rate": 0.0002963513513513513, "loss": 0.5771, "step": 91 }, { "epoch": 0.02758103803635001, "grad_norm": 0.2332638055086136, "learning_rate": 0.00029630630630630625, "loss": 0.5477, "step": 92 }, { "epoch": 0.027880831928049465, "grad_norm": 0.23931057751178741, "learning_rate": 0.00029626126126126124, "loss": 0.5391, "step": 93 }, { "epoch": 0.028180625819748922, "grad_norm": 0.23103167116641998, "learning_rate": 0.0002962162162162162, "loss": 0.5622, "step": 94 }, { "epoch": 0.02848041971144838, "grad_norm": 0.2356046438217163, "learning_rate": 0.0002961711711711711, "loss": 0.5729, "step": 95 }, { "epoch": 0.028780213603147836, "grad_norm": 0.26015767455101013, "learning_rate": 0.0002961261261261261, "loss": 0.5878, "step": 96 }, { "epoch": 0.029080007494847293, "grad_norm": 0.2299613654613495, "learning_rate": 0.00029608108108108104, "loss": 0.5385, "step": 97 }, { "epoch": 0.02937980138654675, "grad_norm": 0.24516573548316956, "learning_rate": 0.000296036036036036, "loss": 0.5688, "step": 98 }, { "epoch": 0.029679595278246207, "grad_norm": 0.25019732117652893, "learning_rate": 0.00029599099099099097, "loss": 0.5562, "step": 99 }, { "epoch": 0.029979389169945664, "grad_norm": 0.23934195935726166, "learning_rate": 0.0002959459459459459, "loss": 0.5572, "step": 100 }, { "epoch": 0.030279183061645117, "grad_norm": 0.23651225864887238, "learning_rate": 0.00029590090090090085, "loss": 0.5517, "step": 101 }, { "epoch": 0.030578976953344574, "grad_norm": 0.2545163631439209, "learning_rate": 0.00029585585585585584, "loss": 0.5553, "step": 102 }, { "epoch": 0.03087877084504403, "grad_norm": 0.2259800285100937, "learning_rate": 0.0002958108108108108, "loss": 0.5216, "step": 103 }, { "epoch": 0.031178564736743488, "grad_norm": 0.25137075781822205, "learning_rate": 0.00029576576576576576, "loss": 0.559, "step": 104 }, { "epoch": 0.03147835862844295, "grad_norm": 0.2526615262031555, "learning_rate": 0.0002957207207207207, "loss": 0.6025, "step": 105 }, { "epoch": 0.031778152520142405, "grad_norm": 0.2406662553548813, "learning_rate": 0.00029567567567567564, "loss": 0.562, "step": 106 }, { "epoch": 0.032077946411841855, "grad_norm": 0.23598824441432953, "learning_rate": 0.00029563063063063063, "loss": 0.5749, "step": 107 }, { "epoch": 0.03237774030354131, "grad_norm": 0.24079738557338715, "learning_rate": 0.00029558558558558557, "loss": 0.5697, "step": 108 }, { "epoch": 0.03267753419524077, "grad_norm": 0.25064727663993835, "learning_rate": 0.0002955405405405405, "loss": 0.5821, "step": 109 }, { "epoch": 0.032977328086940226, "grad_norm": 0.22557033598423004, "learning_rate": 0.0002954954954954955, "loss": 0.5367, "step": 110 }, { "epoch": 0.03327712197863968, "grad_norm": 0.22542916238307953, "learning_rate": 0.00029545045045045043, "loss": 0.55, "step": 111 }, { "epoch": 0.03357691587033914, "grad_norm": 0.24270494282245636, "learning_rate": 0.00029540540540540537, "loss": 0.5773, "step": 112 }, { "epoch": 0.0338767097620386, "grad_norm": 0.23397719860076904, "learning_rate": 0.00029536036036036036, "loss": 0.5516, "step": 113 }, { "epoch": 0.034176503653738054, "grad_norm": 0.2322990894317627, "learning_rate": 0.0002953153153153153, "loss": 0.5307, "step": 114 }, { "epoch": 0.03447629754543751, "grad_norm": 0.2167540043592453, "learning_rate": 0.0002952702702702703, "loss": 0.5197, "step": 115 }, { "epoch": 0.03477609143713697, "grad_norm": 0.2705434262752533, "learning_rate": 0.0002952252252252252, "loss": 0.564, "step": 116 }, { "epoch": 0.035075885328836424, "grad_norm": 0.23195774853229523, "learning_rate": 0.00029518018018018016, "loss": 0.5397, "step": 117 }, { "epoch": 0.03537567922053588, "grad_norm": 0.22944559156894684, "learning_rate": 0.00029513513513513515, "loss": 0.5197, "step": 118 }, { "epoch": 0.03567547311223534, "grad_norm": 0.25140368938446045, "learning_rate": 0.0002950900900900901, "loss": 0.5661, "step": 119 }, { "epoch": 0.035975267003934795, "grad_norm": 0.23137278854846954, "learning_rate": 0.000295045045045045, "loss": 0.5284, "step": 120 }, { "epoch": 0.03627506089563425, "grad_norm": 0.2358487993478775, "learning_rate": 0.00029499999999999996, "loss": 0.551, "step": 121 }, { "epoch": 0.03657485478733371, "grad_norm": 0.23212139308452606, "learning_rate": 0.00029495495495495495, "loss": 0.5624, "step": 122 }, { "epoch": 0.036874648679033166, "grad_norm": 0.2504674196243286, "learning_rate": 0.0002949099099099099, "loss": 0.5649, "step": 123 }, { "epoch": 0.03717444257073262, "grad_norm": 0.24574224650859833, "learning_rate": 0.0002948648648648648, "loss": 0.5415, "step": 124 }, { "epoch": 0.03747423646243208, "grad_norm": 0.23007921874523163, "learning_rate": 0.0002948198198198198, "loss": 0.5624, "step": 125 }, { "epoch": 0.03777403035413154, "grad_norm": 0.2503686547279358, "learning_rate": 0.00029477477477477475, "loss": 0.5484, "step": 126 }, { "epoch": 0.038073824245830994, "grad_norm": 0.2467029094696045, "learning_rate": 0.0002947297297297297, "loss": 0.5442, "step": 127 }, { "epoch": 0.03837361813753045, "grad_norm": 0.23382776975631714, "learning_rate": 0.0002946846846846847, "loss": 0.5485, "step": 128 }, { "epoch": 0.03867341202922991, "grad_norm": 0.25080442428588867, "learning_rate": 0.0002946396396396396, "loss": 0.5726, "step": 129 }, { "epoch": 0.038973205920929364, "grad_norm": 0.2410043627023697, "learning_rate": 0.00029459459459459455, "loss": 0.5475, "step": 130 }, { "epoch": 0.039272999812628814, "grad_norm": 0.24639956653118134, "learning_rate": 0.00029454954954954955, "loss": 0.5461, "step": 131 }, { "epoch": 0.03957279370432827, "grad_norm": 0.23122674226760864, "learning_rate": 0.0002945045045045045, "loss": 0.5605, "step": 132 }, { "epoch": 0.03987258759602773, "grad_norm": 0.22207331657409668, "learning_rate": 0.0002944594594594594, "loss": 0.5045, "step": 133 }, { "epoch": 0.040172381487727185, "grad_norm": 0.2344479262828827, "learning_rate": 0.0002944144144144144, "loss": 0.5724, "step": 134 }, { "epoch": 0.04047217537942664, "grad_norm": 0.2410653978586197, "learning_rate": 0.00029436936936936935, "loss": 0.5266, "step": 135 }, { "epoch": 0.0407719692711261, "grad_norm": 0.23028722405433655, "learning_rate": 0.0002943243243243243, "loss": 0.545, "step": 136 }, { "epoch": 0.041071763162825556, "grad_norm": 0.23669356107711792, "learning_rate": 0.0002942792792792793, "loss": 0.5429, "step": 137 }, { "epoch": 0.04137155705452501, "grad_norm": 0.24232889711856842, "learning_rate": 0.0002942342342342342, "loss": 0.5815, "step": 138 }, { "epoch": 0.04167135094622447, "grad_norm": 0.22076158225536346, "learning_rate": 0.00029418918918918915, "loss": 0.5028, "step": 139 }, { "epoch": 0.04197114483792393, "grad_norm": 0.23322072625160217, "learning_rate": 0.00029414414414414414, "loss": 0.5293, "step": 140 }, { "epoch": 0.042270938729623384, "grad_norm": 0.24910598993301392, "learning_rate": 0.0002940990990990991, "loss": 0.5403, "step": 141 }, { "epoch": 0.04257073262132284, "grad_norm": 0.24588559567928314, "learning_rate": 0.000294054054054054, "loss": 0.538, "step": 142 }, { "epoch": 0.0428705265130223, "grad_norm": 0.20689445734024048, "learning_rate": 0.00029400900900900895, "loss": 0.4921, "step": 143 }, { "epoch": 0.043170320404721754, "grad_norm": 0.2518969178199768, "learning_rate": 0.00029396396396396394, "loss": 0.5625, "step": 144 }, { "epoch": 0.04347011429642121, "grad_norm": 0.25027644634246826, "learning_rate": 0.0002939189189189189, "loss": 0.5823, "step": 145 }, { "epoch": 0.04376990818812067, "grad_norm": 0.24390611052513123, "learning_rate": 0.0002938738738738738, "loss": 0.5205, "step": 146 }, { "epoch": 0.044069702079820125, "grad_norm": 0.24781368672847748, "learning_rate": 0.0002938288288288288, "loss": 0.5559, "step": 147 }, { "epoch": 0.04436949597151958, "grad_norm": 0.23696455359458923, "learning_rate": 0.00029378378378378374, "loss": 0.5443, "step": 148 }, { "epoch": 0.04466928986321904, "grad_norm": 0.23888202011585236, "learning_rate": 0.0002937387387387387, "loss": 0.5397, "step": 149 }, { "epoch": 0.044969083754918496, "grad_norm": 0.22896665334701538, "learning_rate": 0.00029369369369369367, "loss": 0.5462, "step": 150 }, { "epoch": 0.04526887764661795, "grad_norm": 0.23967792093753815, "learning_rate": 0.0002936486486486486, "loss": 0.5653, "step": 151 }, { "epoch": 0.04556867153831741, "grad_norm": 0.2244715392589569, "learning_rate": 0.00029360360360360354, "loss": 0.5251, "step": 152 }, { "epoch": 0.045868465430016866, "grad_norm": 0.2287164330482483, "learning_rate": 0.00029355855855855853, "loss": 0.5313, "step": 153 }, { "epoch": 0.04616825932171632, "grad_norm": 0.2419881671667099, "learning_rate": 0.00029351351351351347, "loss": 0.5397, "step": 154 }, { "epoch": 0.04646805321341577, "grad_norm": 0.2414129674434662, "learning_rate": 0.0002934684684684684, "loss": 0.574, "step": 155 }, { "epoch": 0.04676784710511523, "grad_norm": 0.237702414393425, "learning_rate": 0.0002934234234234234, "loss": 0.5377, "step": 156 }, { "epoch": 0.04706764099681469, "grad_norm": 0.24712331593036652, "learning_rate": 0.00029337837837837833, "loss": 0.5722, "step": 157 }, { "epoch": 0.047367434888514144, "grad_norm": 0.23250596225261688, "learning_rate": 0.00029333333333333327, "loss": 0.5663, "step": 158 }, { "epoch": 0.0476672287802136, "grad_norm": 0.22616314888000488, "learning_rate": 0.00029328828828828826, "loss": 0.4984, "step": 159 }, { "epoch": 0.04796702267191306, "grad_norm": 0.23835696280002594, "learning_rate": 0.0002932432432432432, "loss": 0.5512, "step": 160 }, { "epoch": 0.048266816563612515, "grad_norm": 0.23458854854106903, "learning_rate": 0.0002931981981981982, "loss": 0.5472, "step": 161 }, { "epoch": 0.04856661045531197, "grad_norm": 0.2530427873134613, "learning_rate": 0.0002931531531531531, "loss": 0.5807, "step": 162 }, { "epoch": 0.04886640434701143, "grad_norm": 0.23911581933498383, "learning_rate": 0.00029310810810810806, "loss": 0.5236, "step": 163 }, { "epoch": 0.049166198238710886, "grad_norm": 0.24235881865024567, "learning_rate": 0.00029306306306306305, "loss": 0.5509, "step": 164 }, { "epoch": 0.04946599213041034, "grad_norm": 0.23077982664108276, "learning_rate": 0.000293018018018018, "loss": 0.5265, "step": 165 }, { "epoch": 0.0497657860221098, "grad_norm": 0.23159649968147278, "learning_rate": 0.00029297297297297293, "loss": 0.5521, "step": 166 }, { "epoch": 0.050065579913809256, "grad_norm": 0.23901638388633728, "learning_rate": 0.0002929279279279279, "loss": 0.517, "step": 167 }, { "epoch": 0.05036537380550871, "grad_norm": 0.2441125512123108, "learning_rate": 0.00029288288288288286, "loss": 0.5376, "step": 168 }, { "epoch": 0.05066516769720817, "grad_norm": 0.2408357411623001, "learning_rate": 0.0002928378378378378, "loss": 0.5119, "step": 169 }, { "epoch": 0.05096496158890763, "grad_norm": 0.24620820581912994, "learning_rate": 0.0002927927927927928, "loss": 0.5454, "step": 170 }, { "epoch": 0.051264755480607084, "grad_norm": 0.23092059791088104, "learning_rate": 0.0002927477477477477, "loss": 0.5464, "step": 171 }, { "epoch": 0.05156454937230654, "grad_norm": 0.23076176643371582, "learning_rate": 0.0002927027027027027, "loss": 0.5153, "step": 172 }, { "epoch": 0.051864343264006, "grad_norm": 0.2518848478794098, "learning_rate": 0.00029265765765765765, "loss": 0.5416, "step": 173 }, { "epoch": 0.052164137155705455, "grad_norm": 0.24004964530467987, "learning_rate": 0.0002926126126126126, "loss": 0.5625, "step": 174 }, { "epoch": 0.05246393104740491, "grad_norm": 0.26169759035110474, "learning_rate": 0.0002925675675675676, "loss": 0.5322, "step": 175 }, { "epoch": 0.05276372493910437, "grad_norm": 0.23733891546726227, "learning_rate": 0.0002925225225225225, "loss": 0.5526, "step": 176 }, { "epoch": 0.053063518830803826, "grad_norm": 0.22720226645469666, "learning_rate": 0.00029247747747747745, "loss": 0.5108, "step": 177 }, { "epoch": 0.05336331272250328, "grad_norm": 0.2425220012664795, "learning_rate": 0.00029243243243243244, "loss": 0.5337, "step": 178 }, { "epoch": 0.05366310661420273, "grad_norm": 0.24859829246997833, "learning_rate": 0.0002923873873873874, "loss": 0.5239, "step": 179 }, { "epoch": 0.05396290050590219, "grad_norm": 0.23471564054489136, "learning_rate": 0.0002923423423423423, "loss": 0.5372, "step": 180 }, { "epoch": 0.054262694397601646, "grad_norm": 0.23243340849876404, "learning_rate": 0.0002922972972972973, "loss": 0.5283, "step": 181 }, { "epoch": 0.0545624882893011, "grad_norm": 0.240774467587471, "learning_rate": 0.00029225225225225224, "loss": 0.523, "step": 182 }, { "epoch": 0.05486228218100056, "grad_norm": 0.2518199384212494, "learning_rate": 0.0002922072072072072, "loss": 0.56, "step": 183 }, { "epoch": 0.05516207607270002, "grad_norm": 0.22320473194122314, "learning_rate": 0.00029216216216216217, "loss": 0.5442, "step": 184 }, { "epoch": 0.055461869964399474, "grad_norm": 0.24681055545806885, "learning_rate": 0.0002921171171171171, "loss": 0.5596, "step": 185 }, { "epoch": 0.05576166385609893, "grad_norm": 0.23595312237739563, "learning_rate": 0.00029207207207207204, "loss": 0.5343, "step": 186 }, { "epoch": 0.05606145774779839, "grad_norm": 0.2307872772216797, "learning_rate": 0.00029202702702702703, "loss": 0.5206, "step": 187 }, { "epoch": 0.056361251639497845, "grad_norm": 0.24639058113098145, "learning_rate": 0.00029198198198198197, "loss": 0.5449, "step": 188 }, { "epoch": 0.0566610455311973, "grad_norm": 0.24432207643985748, "learning_rate": 0.0002919369369369369, "loss": 0.5455, "step": 189 }, { "epoch": 0.05696083942289676, "grad_norm": 0.23387478291988373, "learning_rate": 0.0002918918918918919, "loss": 0.5402, "step": 190 }, { "epoch": 0.057260633314596215, "grad_norm": 0.2183016538619995, "learning_rate": 0.00029184684684684684, "loss": 0.4989, "step": 191 }, { "epoch": 0.05756042720629567, "grad_norm": 0.23916368186473846, "learning_rate": 0.00029180180180180177, "loss": 0.5397, "step": 192 }, { "epoch": 0.05786022109799513, "grad_norm": 0.24624724686145782, "learning_rate": 0.0002917567567567567, "loss": 0.5248, "step": 193 }, { "epoch": 0.058160014989694586, "grad_norm": 0.2571977376937866, "learning_rate": 0.0002917117117117117, "loss": 0.587, "step": 194 }, { "epoch": 0.05845980888139404, "grad_norm": 0.22828777134418488, "learning_rate": 0.00029166666666666664, "loss": 0.5424, "step": 195 }, { "epoch": 0.0587596027730935, "grad_norm": 0.23892144858837128, "learning_rate": 0.0002916216216216216, "loss": 0.5228, "step": 196 }, { "epoch": 0.05905939666479296, "grad_norm": 0.22904744744300842, "learning_rate": 0.00029157657657657656, "loss": 0.5052, "step": 197 }, { "epoch": 0.059359190556492414, "grad_norm": 0.2399347573518753, "learning_rate": 0.0002915315315315315, "loss": 0.541, "step": 198 }, { "epoch": 0.05965898444819187, "grad_norm": 0.2253408432006836, "learning_rate": 0.00029148648648648644, "loss": 0.5204, "step": 199 }, { "epoch": 0.05995877833989133, "grad_norm": 0.23211434483528137, "learning_rate": 0.00029144144144144143, "loss": 0.5248, "step": 200 }, { "epoch": 0.060258572231590785, "grad_norm": 0.24016128480434418, "learning_rate": 0.00029139639639639637, "loss": 0.5386, "step": 201 }, { "epoch": 0.060558366123290235, "grad_norm": 0.2374415397644043, "learning_rate": 0.0002913513513513513, "loss": 0.4904, "step": 202 }, { "epoch": 0.06085816001498969, "grad_norm": 0.257735013961792, "learning_rate": 0.0002913063063063063, "loss": 0.5253, "step": 203 }, { "epoch": 0.06115795390668915, "grad_norm": 0.23416665196418762, "learning_rate": 0.00029126126126126123, "loss": 0.5164, "step": 204 }, { "epoch": 0.061457747798388605, "grad_norm": 0.24382436275482178, "learning_rate": 0.00029121621621621617, "loss": 0.5507, "step": 205 }, { "epoch": 0.06175754169008806, "grad_norm": 0.2488170564174652, "learning_rate": 0.00029117117117117116, "loss": 0.543, "step": 206 }, { "epoch": 0.06205733558178752, "grad_norm": 0.24262337386608124, "learning_rate": 0.0002911261261261261, "loss": 0.5533, "step": 207 }, { "epoch": 0.062357129473486976, "grad_norm": 0.2239450216293335, "learning_rate": 0.00029108108108108103, "loss": 0.4955, "step": 208 }, { "epoch": 0.06265692336518644, "grad_norm": 0.22644869983196259, "learning_rate": 0.000291036036036036, "loss": 0.5362, "step": 209 }, { "epoch": 0.0629567172568859, "grad_norm": 0.23194913566112518, "learning_rate": 0.00029099099099099096, "loss": 0.5226, "step": 210 }, { "epoch": 0.06325651114858535, "grad_norm": 0.22114375233650208, "learning_rate": 0.0002909459459459459, "loss": 0.472, "step": 211 }, { "epoch": 0.06355630504028481, "grad_norm": 0.24242356419563293, "learning_rate": 0.0002909009009009009, "loss": 0.5041, "step": 212 }, { "epoch": 0.06385609893198427, "grad_norm": 0.24195095896720886, "learning_rate": 0.0002908558558558558, "loss": 0.511, "step": 213 }, { "epoch": 0.06415589282368371, "grad_norm": 0.23720628023147583, "learning_rate": 0.00029081081081081076, "loss": 0.5257, "step": 214 }, { "epoch": 0.06445568671538317, "grad_norm": 0.22656656801700592, "learning_rate": 0.00029076576576576575, "loss": 0.5336, "step": 215 }, { "epoch": 0.06475548060708262, "grad_norm": 0.23326708376407623, "learning_rate": 0.0002907207207207207, "loss": 0.5184, "step": 216 }, { "epoch": 0.06505527449878208, "grad_norm": 0.2327839583158493, "learning_rate": 0.0002906756756756756, "loss": 0.5434, "step": 217 }, { "epoch": 0.06535506839048154, "grad_norm": 0.24181734025478363, "learning_rate": 0.0002906306306306306, "loss": 0.534, "step": 218 }, { "epoch": 0.065654862282181, "grad_norm": 0.23325061798095703, "learning_rate": 0.00029058558558558555, "loss": 0.5438, "step": 219 }, { "epoch": 0.06595465617388045, "grad_norm": 0.2559988498687744, "learning_rate": 0.0002905405405405405, "loss": 0.5229, "step": 220 }, { "epoch": 0.06625445006557991, "grad_norm": 0.2432825118303299, "learning_rate": 0.0002904954954954955, "loss": 0.5223, "step": 221 }, { "epoch": 0.06655424395727937, "grad_norm": 0.23795264959335327, "learning_rate": 0.0002904504504504504, "loss": 0.5199, "step": 222 }, { "epoch": 0.06685403784897882, "grad_norm": 0.22999484837055206, "learning_rate": 0.00029040540540540535, "loss": 0.506, "step": 223 }, { "epoch": 0.06715383174067828, "grad_norm": 0.22168515622615814, "learning_rate": 0.00029036036036036034, "loss": 0.4873, "step": 224 }, { "epoch": 0.06745362563237774, "grad_norm": 0.23448102176189423, "learning_rate": 0.0002903153153153153, "loss": 0.5533, "step": 225 }, { "epoch": 0.0677534195240772, "grad_norm": 0.23822058737277985, "learning_rate": 0.0002902702702702702, "loss": 0.5221, "step": 226 }, { "epoch": 0.06805321341577665, "grad_norm": 0.22952431440353394, "learning_rate": 0.0002902252252252252, "loss": 0.5402, "step": 227 }, { "epoch": 0.06835300730747611, "grad_norm": 0.2197679728269577, "learning_rate": 0.00029018018018018015, "loss": 0.5115, "step": 228 }, { "epoch": 0.06865280119917556, "grad_norm": 0.23495204746723175, "learning_rate": 0.00029013513513513514, "loss": 0.4963, "step": 229 }, { "epoch": 0.06895259509087502, "grad_norm": 0.2233276218175888, "learning_rate": 0.0002900900900900901, "loss": 0.5257, "step": 230 }, { "epoch": 0.06925238898257448, "grad_norm": 0.23294100165367126, "learning_rate": 0.000290045045045045, "loss": 0.5231, "step": 231 }, { "epoch": 0.06955218287427394, "grad_norm": 0.24349254369735718, "learning_rate": 0.00029, "loss": 0.5198, "step": 232 }, { "epoch": 0.06985197676597339, "grad_norm": 0.22396647930145264, "learning_rate": 0.00028995495495495494, "loss": 0.5032, "step": 233 }, { "epoch": 0.07015177065767285, "grad_norm": 0.23737500607967377, "learning_rate": 0.0002899099099099099, "loss": 0.5071, "step": 234 }, { "epoch": 0.0704515645493723, "grad_norm": 0.22798973321914673, "learning_rate": 0.00028986486486486487, "loss": 0.4844, "step": 235 }, { "epoch": 0.07075135844107176, "grad_norm": 0.24095383286476135, "learning_rate": 0.0002898198198198198, "loss": 0.5351, "step": 236 }, { "epoch": 0.07105115233277122, "grad_norm": 0.23701000213623047, "learning_rate": 0.00028977477477477474, "loss": 0.5051, "step": 237 }, { "epoch": 0.07135094622447068, "grad_norm": 0.23588530719280243, "learning_rate": 0.00028972972972972973, "loss": 0.4992, "step": 238 }, { "epoch": 0.07165074011617013, "grad_norm": 0.27736473083496094, "learning_rate": 0.00028968468468468467, "loss": 0.5399, "step": 239 }, { "epoch": 0.07195053400786959, "grad_norm": 0.2486957311630249, "learning_rate": 0.0002896396396396396, "loss": 0.543, "step": 240 }, { "epoch": 0.07225032789956905, "grad_norm": 0.23474164307117462, "learning_rate": 0.0002895945945945946, "loss": 0.5275, "step": 241 }, { "epoch": 0.0725501217912685, "grad_norm": 0.26560667157173157, "learning_rate": 0.00028954954954954953, "loss": 0.5576, "step": 242 }, { "epoch": 0.07284991568296796, "grad_norm": 0.2260173261165619, "learning_rate": 0.00028950450450450447, "loss": 0.5046, "step": 243 }, { "epoch": 0.07314970957466742, "grad_norm": 0.24497725069522858, "learning_rate": 0.00028945945945945946, "loss": 0.5093, "step": 244 }, { "epoch": 0.07344950346636687, "grad_norm": 0.24871525168418884, "learning_rate": 0.0002894144144144144, "loss": 0.5353, "step": 245 }, { "epoch": 0.07374929735806633, "grad_norm": 0.2592950761318207, "learning_rate": 0.00028936936936936933, "loss": 0.553, "step": 246 }, { "epoch": 0.07404909124976579, "grad_norm": 0.23617064952850342, "learning_rate": 0.0002893243243243243, "loss": 0.5276, "step": 247 }, { "epoch": 0.07434888514146525, "grad_norm": 0.23108692467212677, "learning_rate": 0.00028927927927927926, "loss": 0.5027, "step": 248 }, { "epoch": 0.0746486790331647, "grad_norm": 0.2418566793203354, "learning_rate": 0.0002892342342342342, "loss": 0.5142, "step": 249 }, { "epoch": 0.07494847292486416, "grad_norm": 0.2367243766784668, "learning_rate": 0.0002891891891891892, "loss": 0.5062, "step": 250 }, { "epoch": 0.07524826681656362, "grad_norm": 0.2405940294265747, "learning_rate": 0.0002891441441441441, "loss": 0.4964, "step": 251 }, { "epoch": 0.07554806070826307, "grad_norm": 0.22956568002700806, "learning_rate": 0.00028909909909909906, "loss": 0.5081, "step": 252 }, { "epoch": 0.07584785459996253, "grad_norm": 0.24594880640506744, "learning_rate": 0.00028905405405405405, "loss": 0.5522, "step": 253 }, { "epoch": 0.07614764849166199, "grad_norm": 0.25554656982421875, "learning_rate": 0.000289009009009009, "loss": 0.5579, "step": 254 }, { "epoch": 0.07644744238336144, "grad_norm": 0.24296589195728302, "learning_rate": 0.0002889639639639639, "loss": 0.515, "step": 255 }, { "epoch": 0.0767472362750609, "grad_norm": 0.2128724306821823, "learning_rate": 0.0002889189189189189, "loss": 0.5046, "step": 256 }, { "epoch": 0.07704703016676036, "grad_norm": 0.2304687201976776, "learning_rate": 0.00028887387387387385, "loss": 0.5275, "step": 257 }, { "epoch": 0.07734682405845981, "grad_norm": 0.23740506172180176, "learning_rate": 0.0002888288288288288, "loss": 0.5308, "step": 258 }, { "epoch": 0.07764661795015927, "grad_norm": 0.2270033061504364, "learning_rate": 0.0002887837837837838, "loss": 0.5232, "step": 259 }, { "epoch": 0.07794641184185873, "grad_norm": 0.24655793607234955, "learning_rate": 0.0002887387387387387, "loss": 0.5087, "step": 260 }, { "epoch": 0.07824620573355819, "grad_norm": 0.25199684500694275, "learning_rate": 0.00028869369369369366, "loss": 0.5616, "step": 261 }, { "epoch": 0.07854599962525763, "grad_norm": 0.2466064840555191, "learning_rate": 0.00028864864864864865, "loss": 0.5343, "step": 262 }, { "epoch": 0.07884579351695709, "grad_norm": 0.2401546686887741, "learning_rate": 0.0002886036036036036, "loss": 0.5326, "step": 263 }, { "epoch": 0.07914558740865654, "grad_norm": 0.2478310465812683, "learning_rate": 0.0002885585585585585, "loss": 0.5377, "step": 264 }, { "epoch": 0.079445381300356, "grad_norm": 0.25078094005584717, "learning_rate": 0.00028851351351351346, "loss": 0.5065, "step": 265 }, { "epoch": 0.07974517519205546, "grad_norm": 0.24395832419395447, "learning_rate": 0.00028846846846846845, "loss": 0.5066, "step": 266 }, { "epoch": 0.08004496908375491, "grad_norm": 0.23809655010700226, "learning_rate": 0.0002884234234234234, "loss": 0.5032, "step": 267 }, { "epoch": 0.08034476297545437, "grad_norm": 0.25203442573547363, "learning_rate": 0.0002883783783783783, "loss": 0.5539, "step": 268 }, { "epoch": 0.08064455686715383, "grad_norm": 0.22731390595436096, "learning_rate": 0.0002883333333333333, "loss": 0.5056, "step": 269 }, { "epoch": 0.08094435075885328, "grad_norm": 0.26038557291030884, "learning_rate": 0.00028828828828828825, "loss": 0.5554, "step": 270 }, { "epoch": 0.08124414465055274, "grad_norm": 0.22735048830509186, "learning_rate": 0.0002882432432432432, "loss": 0.513, "step": 271 }, { "epoch": 0.0815439385422522, "grad_norm": 0.2459019273519516, "learning_rate": 0.0002881981981981982, "loss": 0.5074, "step": 272 }, { "epoch": 0.08184373243395165, "grad_norm": 0.23949427902698517, "learning_rate": 0.0002881531531531531, "loss": 0.5289, "step": 273 }, { "epoch": 0.08214352632565111, "grad_norm": 0.22845999896526337, "learning_rate": 0.00028810810810810805, "loss": 0.5046, "step": 274 }, { "epoch": 0.08244332021735057, "grad_norm": 0.23616977035999298, "learning_rate": 0.00028806306306306304, "loss": 0.4766, "step": 275 }, { "epoch": 0.08274311410905003, "grad_norm": 0.24239955842494965, "learning_rate": 0.000288018018018018, "loss": 0.5177, "step": 276 }, { "epoch": 0.08304290800074948, "grad_norm": 0.22668063640594482, "learning_rate": 0.0002879729729729729, "loss": 0.5343, "step": 277 }, { "epoch": 0.08334270189244894, "grad_norm": 0.23789212107658386, "learning_rate": 0.0002879279279279279, "loss": 0.5325, "step": 278 }, { "epoch": 0.0836424957841484, "grad_norm": 0.23370423913002014, "learning_rate": 0.00028788288288288284, "loss": 0.5054, "step": 279 }, { "epoch": 0.08394228967584785, "grad_norm": 0.2391810417175293, "learning_rate": 0.0002878378378378378, "loss": 0.5128, "step": 280 }, { "epoch": 0.08424208356754731, "grad_norm": 0.2511520981788635, "learning_rate": 0.00028779279279279277, "loss": 0.5205, "step": 281 }, { "epoch": 0.08454187745924677, "grad_norm": 0.25624001026153564, "learning_rate": 0.0002877477477477477, "loss": 0.5341, "step": 282 }, { "epoch": 0.08484167135094622, "grad_norm": 0.22454805672168732, "learning_rate": 0.00028770270270270264, "loss": 0.4776, "step": 283 }, { "epoch": 0.08514146524264568, "grad_norm": 0.24822303652763367, "learning_rate": 0.00028765765765765764, "loss": 0.5545, "step": 284 }, { "epoch": 0.08544125913434514, "grad_norm": 0.24899134039878845, "learning_rate": 0.00028761261261261257, "loss": 0.5081, "step": 285 }, { "epoch": 0.0857410530260446, "grad_norm": 0.22409197688102722, "learning_rate": 0.00028756756756756756, "loss": 0.4964, "step": 286 }, { "epoch": 0.08604084691774405, "grad_norm": 0.24713397026062012, "learning_rate": 0.0002875225225225225, "loss": 0.5604, "step": 287 }, { "epoch": 0.08634064080944351, "grad_norm": 0.2386777251958847, "learning_rate": 0.00028747747747747744, "loss": 0.5158, "step": 288 }, { "epoch": 0.08664043470114297, "grad_norm": 0.2695513963699341, "learning_rate": 0.00028743243243243243, "loss": 0.5289, "step": 289 }, { "epoch": 0.08694022859284242, "grad_norm": 0.23554596304893494, "learning_rate": 0.00028738738738738736, "loss": 0.5002, "step": 290 }, { "epoch": 0.08724002248454188, "grad_norm": 0.2348342090845108, "learning_rate": 0.0002873423423423423, "loss": 0.5026, "step": 291 }, { "epoch": 0.08753981637624134, "grad_norm": 0.24448086321353912, "learning_rate": 0.0002872972972972973, "loss": 0.508, "step": 292 }, { "epoch": 0.0878396102679408, "grad_norm": 0.22410009801387787, "learning_rate": 0.00028725225225225223, "loss": 0.4932, "step": 293 }, { "epoch": 0.08813940415964025, "grad_norm": 0.25097909569740295, "learning_rate": 0.00028720720720720717, "loss": 0.5196, "step": 294 }, { "epoch": 0.0884391980513397, "grad_norm": 0.23564878106117249, "learning_rate": 0.00028716216216216216, "loss": 0.4871, "step": 295 }, { "epoch": 0.08873899194303916, "grad_norm": 0.22870436310768127, "learning_rate": 0.0002871171171171171, "loss": 0.4717, "step": 296 }, { "epoch": 0.08903878583473862, "grad_norm": 0.24509447813034058, "learning_rate": 0.00028707207207207203, "loss": 0.5417, "step": 297 }, { "epoch": 0.08933857972643808, "grad_norm": 0.22554557025432587, "learning_rate": 0.000287027027027027, "loss": 0.5081, "step": 298 }, { "epoch": 0.08963837361813753, "grad_norm": 0.2863612473011017, "learning_rate": 0.00028698198198198196, "loss": 0.5453, "step": 299 }, { "epoch": 0.08993816750983699, "grad_norm": 0.23862136900424957, "learning_rate": 0.00028693693693693695, "loss": 0.4938, "step": 300 }, { "epoch": 0.09023796140153645, "grad_norm": 0.2628205120563507, "learning_rate": 0.0002868918918918919, "loss": 0.5392, "step": 301 }, { "epoch": 0.0905377552932359, "grad_norm": 0.23156946897506714, "learning_rate": 0.0002868468468468468, "loss": 0.4983, "step": 302 }, { "epoch": 0.09083754918493536, "grad_norm": 0.27579790353775024, "learning_rate": 0.0002868018018018018, "loss": 0.5343, "step": 303 }, { "epoch": 0.09113734307663482, "grad_norm": 0.22691994905471802, "learning_rate": 0.00028675675675675675, "loss": 0.4828, "step": 304 }, { "epoch": 0.09143713696833428, "grad_norm": 0.23544538021087646, "learning_rate": 0.0002867117117117117, "loss": 0.5143, "step": 305 }, { "epoch": 0.09173693086003373, "grad_norm": 0.22597934305667877, "learning_rate": 0.0002866666666666667, "loss": 0.5059, "step": 306 }, { "epoch": 0.09203672475173319, "grad_norm": 0.23851321637630463, "learning_rate": 0.0002866216216216216, "loss": 0.5127, "step": 307 }, { "epoch": 0.09233651864343265, "grad_norm": 0.2260097861289978, "learning_rate": 0.00028657657657657655, "loss": 0.4901, "step": 308 }, { "epoch": 0.09263631253513209, "grad_norm": 0.2568291127681732, "learning_rate": 0.00028653153153153154, "loss": 0.556, "step": 309 }, { "epoch": 0.09293610642683155, "grad_norm": 0.23510834574699402, "learning_rate": 0.0002864864864864865, "loss": 0.5254, "step": 310 }, { "epoch": 0.093235900318531, "grad_norm": 0.24556247889995575, "learning_rate": 0.0002864414414414414, "loss": 0.5292, "step": 311 }, { "epoch": 0.09353569421023046, "grad_norm": 0.2357538938522339, "learning_rate": 0.00028639639639639635, "loss": 0.5321, "step": 312 }, { "epoch": 0.09383548810192992, "grad_norm": 0.25484514236450195, "learning_rate": 0.00028635135135135134, "loss": 0.5449, "step": 313 }, { "epoch": 0.09413528199362937, "grad_norm": 0.243759885430336, "learning_rate": 0.0002863063063063063, "loss": 0.4766, "step": 314 }, { "epoch": 0.09443507588532883, "grad_norm": 0.232033833861351, "learning_rate": 0.0002862612612612612, "loss": 0.481, "step": 315 }, { "epoch": 0.09473486977702829, "grad_norm": 0.24136248230934143, "learning_rate": 0.0002862162162162162, "loss": 0.541, "step": 316 }, { "epoch": 0.09503466366872775, "grad_norm": 0.23708300292491913, "learning_rate": 0.00028617117117117114, "loss": 0.5298, "step": 317 }, { "epoch": 0.0953344575604272, "grad_norm": 0.22408358752727509, "learning_rate": 0.0002861261261261261, "loss": 0.4818, "step": 318 }, { "epoch": 0.09563425145212666, "grad_norm": 0.22969000041484833, "learning_rate": 0.00028608108108108107, "loss": 0.4668, "step": 319 }, { "epoch": 0.09593404534382612, "grad_norm": 0.233534574508667, "learning_rate": 0.000286036036036036, "loss": 0.506, "step": 320 }, { "epoch": 0.09623383923552557, "grad_norm": 0.2442328929901123, "learning_rate": 0.00028599099099099095, "loss": 0.5208, "step": 321 }, { "epoch": 0.09653363312722503, "grad_norm": 0.22215022146701813, "learning_rate": 0.00028594594594594594, "loss": 0.4963, "step": 322 }, { "epoch": 0.09683342701892449, "grad_norm": 0.23993152379989624, "learning_rate": 0.0002859009009009009, "loss": 0.4945, "step": 323 }, { "epoch": 0.09713322091062394, "grad_norm": 0.22987253963947296, "learning_rate": 0.0002858558558558558, "loss": 0.4895, "step": 324 }, { "epoch": 0.0974330148023234, "grad_norm": 0.25025674700737, "learning_rate": 0.0002858108108108108, "loss": 0.5053, "step": 325 }, { "epoch": 0.09773280869402286, "grad_norm": 0.21882835030555725, "learning_rate": 0.00028576576576576574, "loss": 0.4844, "step": 326 }, { "epoch": 0.09803260258572231, "grad_norm": 0.23361052572727203, "learning_rate": 0.0002857207207207207, "loss": 0.5159, "step": 327 }, { "epoch": 0.09833239647742177, "grad_norm": 0.23713140189647675, "learning_rate": 0.00028567567567567567, "loss": 0.5419, "step": 328 }, { "epoch": 0.09863219036912123, "grad_norm": 0.2439422458410263, "learning_rate": 0.0002856306306306306, "loss": 0.5068, "step": 329 }, { "epoch": 0.09893198426082069, "grad_norm": 0.23442302644252777, "learning_rate": 0.00028558558558558554, "loss": 0.5041, "step": 330 }, { "epoch": 0.09923177815252014, "grad_norm": 0.2295604944229126, "learning_rate": 0.00028554054054054053, "loss": 0.4838, "step": 331 }, { "epoch": 0.0995315720442196, "grad_norm": 0.2328769713640213, "learning_rate": 0.00028549549549549547, "loss": 0.5118, "step": 332 }, { "epoch": 0.09983136593591906, "grad_norm": 0.21605005860328674, "learning_rate": 0.0002854504504504504, "loss": 0.4786, "step": 333 }, { "epoch": 0.10013115982761851, "grad_norm": 0.22740024328231812, "learning_rate": 0.0002854054054054054, "loss": 0.5116, "step": 334 }, { "epoch": 0.10043095371931797, "grad_norm": 0.24437595903873444, "learning_rate": 0.00028536036036036033, "loss": 0.5467, "step": 335 }, { "epoch": 0.10073074761101743, "grad_norm": 0.2281450480222702, "learning_rate": 0.00028531531531531527, "loss": 0.5269, "step": 336 }, { "epoch": 0.10103054150271688, "grad_norm": 0.2343592643737793, "learning_rate": 0.0002852702702702702, "loss": 0.5176, "step": 337 }, { "epoch": 0.10133033539441634, "grad_norm": 0.24275358021259308, "learning_rate": 0.0002852252252252252, "loss": 0.5103, "step": 338 }, { "epoch": 0.1016301292861158, "grad_norm": 0.2498926818370819, "learning_rate": 0.00028518018018018013, "loss": 0.5282, "step": 339 }, { "epoch": 0.10192992317781525, "grad_norm": 0.23029617965221405, "learning_rate": 0.00028513513513513507, "loss": 0.512, "step": 340 }, { "epoch": 0.10222971706951471, "grad_norm": 0.2608179450035095, "learning_rate": 0.00028509009009009006, "loss": 0.5419, "step": 341 }, { "epoch": 0.10252951096121417, "grad_norm": 0.24309523403644562, "learning_rate": 0.000285045045045045, "loss": 0.5178, "step": 342 }, { "epoch": 0.10282930485291362, "grad_norm": 0.22357851266860962, "learning_rate": 0.000285, "loss": 0.487, "step": 343 }, { "epoch": 0.10312909874461308, "grad_norm": 0.2575152814388275, "learning_rate": 0.0002849549549549549, "loss": 0.5163, "step": 344 }, { "epoch": 0.10342889263631254, "grad_norm": 0.25661131739616394, "learning_rate": 0.00028490990990990986, "loss": 0.5229, "step": 345 }, { "epoch": 0.103728686528012, "grad_norm": 0.25328096747398376, "learning_rate": 0.00028486486486486485, "loss": 0.5456, "step": 346 }, { "epoch": 0.10402848041971145, "grad_norm": 0.2357887625694275, "learning_rate": 0.0002848198198198198, "loss": 0.4824, "step": 347 }, { "epoch": 0.10432827431141091, "grad_norm": 0.2275196760892868, "learning_rate": 0.0002847747747747747, "loss": 0.4899, "step": 348 }, { "epoch": 0.10462806820311037, "grad_norm": 0.2616022229194641, "learning_rate": 0.0002847297297297297, "loss": 0.543, "step": 349 }, { "epoch": 0.10492786209480982, "grad_norm": 0.23371827602386475, "learning_rate": 0.00028468468468468465, "loss": 0.4876, "step": 350 }, { "epoch": 0.10522765598650928, "grad_norm": 0.21652457118034363, "learning_rate": 0.0002846396396396396, "loss": 0.5051, "step": 351 }, { "epoch": 0.10552744987820874, "grad_norm": 0.2464732825756073, "learning_rate": 0.0002845945945945946, "loss": 0.4993, "step": 352 }, { "epoch": 0.1058272437699082, "grad_norm": 0.24194246530532837, "learning_rate": 0.0002845495495495495, "loss": 0.492, "step": 353 }, { "epoch": 0.10612703766160765, "grad_norm": 0.21986526250839233, "learning_rate": 0.00028450450450450446, "loss": 0.4668, "step": 354 }, { "epoch": 0.10642683155330711, "grad_norm": 0.2386123389005661, "learning_rate": 0.00028445945945945945, "loss": 0.4823, "step": 355 }, { "epoch": 0.10672662544500656, "grad_norm": 0.2368634194135666, "learning_rate": 0.0002844144144144144, "loss": 0.5119, "step": 356 }, { "epoch": 0.10702641933670601, "grad_norm": 0.24802769720554352, "learning_rate": 0.0002843693693693694, "loss": 0.5298, "step": 357 }, { "epoch": 0.10732621322840546, "grad_norm": 0.25534310936927795, "learning_rate": 0.0002843243243243243, "loss": 0.5107, "step": 358 }, { "epoch": 0.10762600712010492, "grad_norm": 0.24845390021800995, "learning_rate": 0.00028427927927927925, "loss": 0.539, "step": 359 }, { "epoch": 0.10792580101180438, "grad_norm": 0.22670955955982208, "learning_rate": 0.00028423423423423424, "loss": 0.5013, "step": 360 }, { "epoch": 0.10822559490350384, "grad_norm": 0.21695859730243683, "learning_rate": 0.0002841891891891892, "loss": 0.468, "step": 361 }, { "epoch": 0.10852538879520329, "grad_norm": 0.24691784381866455, "learning_rate": 0.0002841441441441441, "loss": 0.5011, "step": 362 }, { "epoch": 0.10882518268690275, "grad_norm": 0.23244348168373108, "learning_rate": 0.0002840990990990991, "loss": 0.481, "step": 363 }, { "epoch": 0.1091249765786022, "grad_norm": 0.23756597936153412, "learning_rate": 0.00028405405405405404, "loss": 0.4869, "step": 364 }, { "epoch": 0.10942477047030166, "grad_norm": 0.26336169242858887, "learning_rate": 0.000284009009009009, "loss": 0.5216, "step": 365 }, { "epoch": 0.10972456436200112, "grad_norm": 0.23734593391418457, "learning_rate": 0.00028396396396396397, "loss": 0.502, "step": 366 }, { "epoch": 0.11002435825370058, "grad_norm": 0.2188960462808609, "learning_rate": 0.0002839189189189189, "loss": 0.4784, "step": 367 }, { "epoch": 0.11032415214540003, "grad_norm": 0.26715338230133057, "learning_rate": 0.00028387387387387384, "loss": 0.4997, "step": 368 }, { "epoch": 0.11062394603709949, "grad_norm": 0.23621448874473572, "learning_rate": 0.00028382882882882883, "loss": 0.4819, "step": 369 }, { "epoch": 0.11092373992879895, "grad_norm": 0.23166733980178833, "learning_rate": 0.00028378378378378377, "loss": 0.4949, "step": 370 }, { "epoch": 0.1112235338204984, "grad_norm": 0.22275973856449127, "learning_rate": 0.0002837387387387387, "loss": 0.4893, "step": 371 }, { "epoch": 0.11152332771219786, "grad_norm": 0.22097566723823547, "learning_rate": 0.0002836936936936937, "loss": 0.5011, "step": 372 }, { "epoch": 0.11182312160389732, "grad_norm": 0.25464650988578796, "learning_rate": 0.00028364864864864863, "loss": 0.5369, "step": 373 }, { "epoch": 0.11212291549559678, "grad_norm": 0.2584247589111328, "learning_rate": 0.00028360360360360357, "loss": 0.5271, "step": 374 }, { "epoch": 0.11242270938729623, "grad_norm": 0.2229800671339035, "learning_rate": 0.00028355855855855856, "loss": 0.4919, "step": 375 }, { "epoch": 0.11272250327899569, "grad_norm": 0.25196340680122375, "learning_rate": 0.0002835135135135135, "loss": 0.4935, "step": 376 }, { "epoch": 0.11302229717069515, "grad_norm": 0.23945283889770508, "learning_rate": 0.00028346846846846843, "loss": 0.5158, "step": 377 }, { "epoch": 0.1133220910623946, "grad_norm": 0.22441525757312775, "learning_rate": 0.0002834234234234234, "loss": 0.5106, "step": 378 }, { "epoch": 0.11362188495409406, "grad_norm": 0.24109874665737152, "learning_rate": 0.00028337837837837836, "loss": 0.4646, "step": 379 }, { "epoch": 0.11392167884579352, "grad_norm": 0.23156128823757172, "learning_rate": 0.0002833333333333333, "loss": 0.4752, "step": 380 }, { "epoch": 0.11422147273749297, "grad_norm": 0.24328507483005524, "learning_rate": 0.0002832882882882883, "loss": 0.5225, "step": 381 }, { "epoch": 0.11452126662919243, "grad_norm": 0.2364337146282196, "learning_rate": 0.00028324324324324323, "loss": 0.4965, "step": 382 }, { "epoch": 0.11482106052089189, "grad_norm": 0.24130620062351227, "learning_rate": 0.00028319819819819816, "loss": 0.5058, "step": 383 }, { "epoch": 0.11512085441259134, "grad_norm": 0.2482600063085556, "learning_rate": 0.00028315315315315315, "loss": 0.5142, "step": 384 }, { "epoch": 0.1154206483042908, "grad_norm": 0.2426941990852356, "learning_rate": 0.0002831081081081081, "loss": 0.5185, "step": 385 }, { "epoch": 0.11572044219599026, "grad_norm": 0.24431252479553223, "learning_rate": 0.00028306306306306303, "loss": 0.4946, "step": 386 }, { "epoch": 0.11602023608768972, "grad_norm": 0.22974024713039398, "learning_rate": 0.00028301801801801797, "loss": 0.4845, "step": 387 }, { "epoch": 0.11632002997938917, "grad_norm": 0.26395878195762634, "learning_rate": 0.00028297297297297296, "loss": 0.512, "step": 388 }, { "epoch": 0.11661982387108863, "grad_norm": 0.2300662398338318, "learning_rate": 0.0002829279279279279, "loss": 0.5161, "step": 389 }, { "epoch": 0.11691961776278809, "grad_norm": 0.23376531898975372, "learning_rate": 0.00028288288288288283, "loss": 0.5165, "step": 390 }, { "epoch": 0.11721941165448754, "grad_norm": 0.23775231838226318, "learning_rate": 0.0002828378378378378, "loss": 0.5028, "step": 391 }, { "epoch": 0.117519205546187, "grad_norm": 0.22368961572647095, "learning_rate": 0.00028279279279279276, "loss": 0.4724, "step": 392 }, { "epoch": 0.11781899943788646, "grad_norm": 0.25789904594421387, "learning_rate": 0.0002827477477477477, "loss": 0.5545, "step": 393 }, { "epoch": 0.11811879332958591, "grad_norm": 0.22045502066612244, "learning_rate": 0.0002827027027027027, "loss": 0.4739, "step": 394 }, { "epoch": 0.11841858722128537, "grad_norm": 0.24297311902046204, "learning_rate": 0.0002826576576576576, "loss": 0.5284, "step": 395 }, { "epoch": 0.11871838111298483, "grad_norm": 0.22944821417331696, "learning_rate": 0.00028261261261261256, "loss": 0.5071, "step": 396 }, { "epoch": 0.11901817500468428, "grad_norm": 0.22093501687049866, "learning_rate": 0.00028256756756756755, "loss": 0.5016, "step": 397 }, { "epoch": 0.11931796889638374, "grad_norm": 0.24359877407550812, "learning_rate": 0.0002825225225225225, "loss": 0.494, "step": 398 }, { "epoch": 0.1196177627880832, "grad_norm": 0.23019848763942719, "learning_rate": 0.0002824774774774774, "loss": 0.4774, "step": 399 }, { "epoch": 0.11991755667978266, "grad_norm": 0.22894737124443054, "learning_rate": 0.0002824324324324324, "loss": 0.4804, "step": 400 }, { "epoch": 0.12021735057148211, "grad_norm": 0.23049385845661163, "learning_rate": 0.00028238738738738735, "loss": 0.4816, "step": 401 }, { "epoch": 0.12051714446318157, "grad_norm": 0.21988703310489655, "learning_rate": 0.0002823423423423423, "loss": 0.479, "step": 402 }, { "epoch": 0.12081693835488103, "grad_norm": 0.2389378845691681, "learning_rate": 0.0002822972972972973, "loss": 0.4993, "step": 403 }, { "epoch": 0.12111673224658047, "grad_norm": 0.2271224707365036, "learning_rate": 0.0002822522522522522, "loss": 0.479, "step": 404 }, { "epoch": 0.12141652613827993, "grad_norm": 0.253499835729599, "learning_rate": 0.00028220720720720715, "loss": 0.5289, "step": 405 }, { "epoch": 0.12171632002997938, "grad_norm": 0.2213011533021927, "learning_rate": 0.00028216216216216214, "loss": 0.4759, "step": 406 }, { "epoch": 0.12201611392167884, "grad_norm": 0.2327415943145752, "learning_rate": 0.0002821171171171171, "loss": 0.4822, "step": 407 }, { "epoch": 0.1223159078133783, "grad_norm": 0.2592346668243408, "learning_rate": 0.000282072072072072, "loss": 0.5331, "step": 408 }, { "epoch": 0.12261570170507775, "grad_norm": 0.23039428889751434, "learning_rate": 0.000282027027027027, "loss": 0.4882, "step": 409 }, { "epoch": 0.12291549559677721, "grad_norm": 0.2345624566078186, "learning_rate": 0.00028198198198198194, "loss": 0.4859, "step": 410 }, { "epoch": 0.12321528948847667, "grad_norm": 0.22759179770946503, "learning_rate": 0.0002819369369369369, "loss": 0.484, "step": 411 }, { "epoch": 0.12351508338017612, "grad_norm": 0.24670295417308807, "learning_rate": 0.00028189189189189187, "loss": 0.5047, "step": 412 }, { "epoch": 0.12381487727187558, "grad_norm": 0.2306670844554901, "learning_rate": 0.0002818468468468468, "loss": 0.5041, "step": 413 }, { "epoch": 0.12411467116357504, "grad_norm": 0.2440173178911209, "learning_rate": 0.0002818018018018018, "loss": 0.4942, "step": 414 }, { "epoch": 0.1244144650552745, "grad_norm": 0.2239232212305069, "learning_rate": 0.00028175675675675674, "loss": 0.4677, "step": 415 }, { "epoch": 0.12471425894697395, "grad_norm": 0.22838331758975983, "learning_rate": 0.0002817117117117117, "loss": 0.5061, "step": 416 }, { "epoch": 0.12501405283867342, "grad_norm": 0.23874101042747498, "learning_rate": 0.00028166666666666666, "loss": 0.5261, "step": 417 }, { "epoch": 0.12531384673037288, "grad_norm": 0.2522803544998169, "learning_rate": 0.0002816216216216216, "loss": 0.5525, "step": 418 }, { "epoch": 0.12561364062207234, "grad_norm": 0.2339329719543457, "learning_rate": 0.00028157657657657654, "loss": 0.5157, "step": 419 }, { "epoch": 0.1259134345137718, "grad_norm": 0.23074860870838165, "learning_rate": 0.00028153153153153153, "loss": 0.5062, "step": 420 }, { "epoch": 0.12621322840547125, "grad_norm": 0.22877013683319092, "learning_rate": 0.00028148648648648647, "loss": 0.4827, "step": 421 }, { "epoch": 0.1265130222971707, "grad_norm": 0.2384471893310547, "learning_rate": 0.0002814414414414414, "loss": 0.5035, "step": 422 }, { "epoch": 0.12681281618887016, "grad_norm": 0.24479855597019196, "learning_rate": 0.0002813963963963964, "loss": 0.4645, "step": 423 }, { "epoch": 0.12711261008056962, "grad_norm": 0.23424111306667328, "learning_rate": 0.00028135135135135133, "loss": 0.4927, "step": 424 }, { "epoch": 0.12741240397226908, "grad_norm": 0.24012430012226105, "learning_rate": 0.0002813063063063063, "loss": 0.4904, "step": 425 }, { "epoch": 0.12771219786396854, "grad_norm": 0.24606235325336456, "learning_rate": 0.00028126126126126126, "loss": 0.5143, "step": 426 }, { "epoch": 0.128011991755668, "grad_norm": 0.24893403053283691, "learning_rate": 0.0002812162162162162, "loss": 0.5232, "step": 427 }, { "epoch": 0.12831178564736742, "grad_norm": 0.2424110770225525, "learning_rate": 0.0002811711711711712, "loss": 0.5232, "step": 428 }, { "epoch": 0.12861157953906688, "grad_norm": 0.2312486171722412, "learning_rate": 0.0002811261261261261, "loss": 0.4645, "step": 429 }, { "epoch": 0.12891137343076634, "grad_norm": 0.22639836370944977, "learning_rate": 0.00028108108108108106, "loss": 0.4642, "step": 430 }, { "epoch": 0.1292111673224658, "grad_norm": 0.23626941442489624, "learning_rate": 0.00028103603603603605, "loss": 0.5038, "step": 431 }, { "epoch": 0.12951096121416525, "grad_norm": 0.2625383138656616, "learning_rate": 0.000280990990990991, "loss": 0.4867, "step": 432 }, { "epoch": 0.1298107551058647, "grad_norm": 0.24292655289173126, "learning_rate": 0.0002809459459459459, "loss": 0.5081, "step": 433 }, { "epoch": 0.13011054899756416, "grad_norm": 0.23609769344329834, "learning_rate": 0.00028090090090090086, "loss": 0.4832, "step": 434 }, { "epoch": 0.13041034288926362, "grad_norm": 0.22934478521347046, "learning_rate": 0.00028085585585585585, "loss": 0.4872, "step": 435 }, { "epoch": 0.13071013678096308, "grad_norm": 0.22949008643627167, "learning_rate": 0.0002808108108108108, "loss": 0.5129, "step": 436 }, { "epoch": 0.13100993067266253, "grad_norm": 0.2302381694316864, "learning_rate": 0.0002807657657657657, "loss": 0.4793, "step": 437 }, { "epoch": 0.131309724564362, "grad_norm": 0.23368242383003235, "learning_rate": 0.0002807207207207207, "loss": 0.4989, "step": 438 }, { "epoch": 0.13160951845606145, "grad_norm": 0.21572020649909973, "learning_rate": 0.00028067567567567565, "loss": 0.4546, "step": 439 }, { "epoch": 0.1319093123477609, "grad_norm": 0.2268449366092682, "learning_rate": 0.0002806306306306306, "loss": 0.4954, "step": 440 }, { "epoch": 0.13220910623946036, "grad_norm": 0.23617544770240784, "learning_rate": 0.0002805855855855856, "loss": 0.4865, "step": 441 }, { "epoch": 0.13250890013115982, "grad_norm": 0.24015142023563385, "learning_rate": 0.0002805405405405405, "loss": 0.5211, "step": 442 }, { "epoch": 0.13280869402285927, "grad_norm": 0.21798421442508698, "learning_rate": 0.00028049549549549545, "loss": 0.4482, "step": 443 }, { "epoch": 0.13310848791455873, "grad_norm": 0.23476584255695343, "learning_rate": 0.00028045045045045045, "loss": 0.4871, "step": 444 }, { "epoch": 0.1334082818062582, "grad_norm": 0.2404216080904007, "learning_rate": 0.0002804054054054054, "loss": 0.488, "step": 445 }, { "epoch": 0.13370807569795765, "grad_norm": 0.25073060393333435, "learning_rate": 0.0002803603603603603, "loss": 0.5194, "step": 446 }, { "epoch": 0.1340078695896571, "grad_norm": 0.24332286417484283, "learning_rate": 0.0002803153153153153, "loss": 0.5347, "step": 447 }, { "epoch": 0.13430766348135656, "grad_norm": 0.2420525699853897, "learning_rate": 0.00028027027027027025, "loss": 0.5109, "step": 448 }, { "epoch": 0.13460745737305602, "grad_norm": 0.22389326989650726, "learning_rate": 0.0002802252252252252, "loss": 0.5075, "step": 449 }, { "epoch": 0.13490725126475547, "grad_norm": 0.23522739112377167, "learning_rate": 0.0002801801801801802, "loss": 0.4794, "step": 450 }, { "epoch": 0.13520704515645493, "grad_norm": 0.221591517329216, "learning_rate": 0.0002801351351351351, "loss": 0.4821, "step": 451 }, { "epoch": 0.1355068390481544, "grad_norm": 0.24136802554130554, "learning_rate": 0.00028009009009009005, "loss": 0.4949, "step": 452 }, { "epoch": 0.13580663293985384, "grad_norm": 0.23207104206085205, "learning_rate": 0.00028004504504504504, "loss": 0.4803, "step": 453 }, { "epoch": 0.1361064268315533, "grad_norm": 0.25318193435668945, "learning_rate": 0.00028, "loss": 0.5345, "step": 454 }, { "epoch": 0.13640622072325276, "grad_norm": 0.25274619460105896, "learning_rate": 0.0002799549549549549, "loss": 0.5159, "step": 455 }, { "epoch": 0.13670601461495221, "grad_norm": 0.22540399432182312, "learning_rate": 0.0002799099099099099, "loss": 0.4772, "step": 456 }, { "epoch": 0.13700580850665167, "grad_norm": 0.2346925288438797, "learning_rate": 0.00027986486486486484, "loss": 0.4916, "step": 457 }, { "epoch": 0.13730560239835113, "grad_norm": 0.2226891815662384, "learning_rate": 0.0002798198198198198, "loss": 0.4651, "step": 458 }, { "epoch": 0.13760539629005059, "grad_norm": 0.25540515780448914, "learning_rate": 0.0002797747747747747, "loss": 0.5316, "step": 459 }, { "epoch": 0.13790519018175004, "grad_norm": 0.22934426367282867, "learning_rate": 0.0002797297297297297, "loss": 0.477, "step": 460 }, { "epoch": 0.1382049840734495, "grad_norm": 0.22268570959568024, "learning_rate": 0.00027968468468468464, "loss": 0.4611, "step": 461 }, { "epoch": 0.13850477796514896, "grad_norm": 0.23548570275306702, "learning_rate": 0.0002796396396396396, "loss": 0.5012, "step": 462 }, { "epoch": 0.1388045718568484, "grad_norm": 0.22782792150974274, "learning_rate": 0.00027959459459459457, "loss": 0.4878, "step": 463 }, { "epoch": 0.13910436574854787, "grad_norm": 0.24569828808307648, "learning_rate": 0.0002795495495495495, "loss": 0.4882, "step": 464 }, { "epoch": 0.13940415964024733, "grad_norm": 0.23523476719856262, "learning_rate": 0.00027950450450450444, "loss": 0.498, "step": 465 }, { "epoch": 0.13970395353194678, "grad_norm": 0.24249842762947083, "learning_rate": 0.00027945945945945943, "loss": 0.5156, "step": 466 }, { "epoch": 0.14000374742364624, "grad_norm": 0.22582505643367767, "learning_rate": 0.00027941441441441437, "loss": 0.4428, "step": 467 }, { "epoch": 0.1403035413153457, "grad_norm": 0.2527635395526886, "learning_rate": 0.0002793693693693693, "loss": 0.507, "step": 468 }, { "epoch": 0.14060333520704515, "grad_norm": 0.2490163892507553, "learning_rate": 0.0002793243243243243, "loss": 0.5119, "step": 469 }, { "epoch": 0.1409031290987446, "grad_norm": 0.26502713561058044, "learning_rate": 0.00027927927927927923, "loss": 0.4994, "step": 470 }, { "epoch": 0.14120292299044407, "grad_norm": 0.25225281715393066, "learning_rate": 0.0002792342342342342, "loss": 0.5029, "step": 471 }, { "epoch": 0.14150271688214353, "grad_norm": 0.235763818025589, "learning_rate": 0.00027918918918918916, "loss": 0.5154, "step": 472 }, { "epoch": 0.14180251077384298, "grad_norm": 0.24403534829616547, "learning_rate": 0.0002791441441441441, "loss": 0.5241, "step": 473 }, { "epoch": 0.14210230466554244, "grad_norm": 0.24656488001346588, "learning_rate": 0.0002790990990990991, "loss": 0.4767, "step": 474 }, { "epoch": 0.1424020985572419, "grad_norm": 0.2506386935710907, "learning_rate": 0.000279054054054054, "loss": 0.5066, "step": 475 }, { "epoch": 0.14270189244894135, "grad_norm": 0.24634157121181488, "learning_rate": 0.00027900900900900896, "loss": 0.4815, "step": 476 }, { "epoch": 0.1430016863406408, "grad_norm": 0.23619256913661957, "learning_rate": 0.00027896396396396395, "loss": 0.4929, "step": 477 }, { "epoch": 0.14330148023234027, "grad_norm": 0.23421134054660797, "learning_rate": 0.0002789189189189189, "loss": 0.4841, "step": 478 }, { "epoch": 0.14360127412403972, "grad_norm": 0.2287687510251999, "learning_rate": 0.00027887387387387383, "loss": 0.4976, "step": 479 }, { "epoch": 0.14390106801573918, "grad_norm": 0.2362293004989624, "learning_rate": 0.0002788288288288288, "loss": 0.5054, "step": 480 }, { "epoch": 0.14420086190743864, "grad_norm": 0.23907198011875153, "learning_rate": 0.00027878378378378376, "loss": 0.4879, "step": 481 }, { "epoch": 0.1445006557991381, "grad_norm": 0.21802479028701782, "learning_rate": 0.00027873873873873875, "loss": 0.4807, "step": 482 }, { "epoch": 0.14480044969083755, "grad_norm": 0.2445833832025528, "learning_rate": 0.0002786936936936937, "loss": 0.519, "step": 483 }, { "epoch": 0.145100243582537, "grad_norm": 0.24606822431087494, "learning_rate": 0.0002786486486486486, "loss": 0.4956, "step": 484 }, { "epoch": 0.14540003747423647, "grad_norm": 0.24663852155208588, "learning_rate": 0.0002786036036036036, "loss": 0.5029, "step": 485 }, { "epoch": 0.14569983136593592, "grad_norm": 0.22668293118476868, "learning_rate": 0.00027855855855855855, "loss": 0.4987, "step": 486 }, { "epoch": 0.14599962525763538, "grad_norm": 0.2292596995830536, "learning_rate": 0.0002785135135135135, "loss": 0.4808, "step": 487 }, { "epoch": 0.14629941914933484, "grad_norm": 0.2296249270439148, "learning_rate": 0.0002784684684684685, "loss": 0.4733, "step": 488 }, { "epoch": 0.1465992130410343, "grad_norm": 0.2463514357805252, "learning_rate": 0.0002784234234234234, "loss": 0.4856, "step": 489 }, { "epoch": 0.14689900693273375, "grad_norm": 0.2578481435775757, "learning_rate": 0.00027837837837837835, "loss": 0.52, "step": 490 }, { "epoch": 0.1471988008244332, "grad_norm": 0.22988936305046082, "learning_rate": 0.00027833333333333334, "loss": 0.469, "step": 491 }, { "epoch": 0.14749859471613266, "grad_norm": 0.24489805102348328, "learning_rate": 0.0002782882882882883, "loss": 0.5108, "step": 492 }, { "epoch": 0.14779838860783212, "grad_norm": 0.24594642221927643, "learning_rate": 0.0002782432432432432, "loss": 0.4977, "step": 493 }, { "epoch": 0.14809818249953158, "grad_norm": 0.23341059684753418, "learning_rate": 0.0002781981981981982, "loss": 0.4784, "step": 494 }, { "epoch": 0.14839797639123103, "grad_norm": 0.24211278557777405, "learning_rate": 0.00027815315315315314, "loss": 0.4966, "step": 495 }, { "epoch": 0.1486977702829305, "grad_norm": 0.24049176275730133, "learning_rate": 0.0002781081081081081, "loss": 0.4807, "step": 496 }, { "epoch": 0.14899756417462995, "grad_norm": 0.2326640784740448, "learning_rate": 0.00027806306306306307, "loss": 0.4571, "step": 497 }, { "epoch": 0.1492973580663294, "grad_norm": 0.23826268315315247, "learning_rate": 0.000278018018018018, "loss": 0.4656, "step": 498 }, { "epoch": 0.14959715195802886, "grad_norm": 0.2514077425003052, "learning_rate": 0.00027797297297297294, "loss": 0.5057, "step": 499 }, { "epoch": 0.14989694584972832, "grad_norm": 0.22455474734306335, "learning_rate": 0.00027792792792792793, "loss": 0.4754, "step": 500 }, { "epoch": 0.14989694584972832, "eval_loss": 0.49213707447052, "eval_runtime": 564.4362, "eval_samples_per_second": 3.825, "eval_steps_per_second": 0.478, "step": 500 }, { "epoch": 0.15019673974142778, "grad_norm": 0.24720892310142517, "learning_rate": 0.00027788288288288287, "loss": 0.5022, "step": 501 }, { "epoch": 0.15049653363312723, "grad_norm": 0.24081699550151825, "learning_rate": 0.0002778378378378378, "loss": 0.4928, "step": 502 }, { "epoch": 0.1507963275248267, "grad_norm": 0.24245139956474304, "learning_rate": 0.0002777927927927928, "loss": 0.5005, "step": 503 }, { "epoch": 0.15109612141652615, "grad_norm": 0.23554831743240356, "learning_rate": 0.00027774774774774774, "loss": 0.4914, "step": 504 }, { "epoch": 0.1513959153082256, "grad_norm": 0.2291078418493271, "learning_rate": 0.00027770270270270267, "loss": 0.4807, "step": 505 }, { "epoch": 0.15169570919992506, "grad_norm": 0.23393088579177856, "learning_rate": 0.0002776576576576576, "loss": 0.496, "step": 506 }, { "epoch": 0.15199550309162452, "grad_norm": 0.24490569531917572, "learning_rate": 0.0002776126126126126, "loss": 0.5, "step": 507 }, { "epoch": 0.15229529698332397, "grad_norm": 0.21680289506912231, "learning_rate": 0.00027756756756756754, "loss": 0.4549, "step": 508 }, { "epoch": 0.15259509087502343, "grad_norm": 0.22479134798049927, "learning_rate": 0.0002775225225225225, "loss": 0.471, "step": 509 }, { "epoch": 0.1528948847667229, "grad_norm": 0.2381131947040558, "learning_rate": 0.00027747747747747746, "loss": 0.4949, "step": 510 }, { "epoch": 0.15319467865842235, "grad_norm": 0.22718404233455658, "learning_rate": 0.0002774324324324324, "loss": 0.4883, "step": 511 }, { "epoch": 0.1534944725501218, "grad_norm": 0.2359694391489029, "learning_rate": 0.00027738738738738734, "loss": 0.5052, "step": 512 }, { "epoch": 0.15379426644182126, "grad_norm": 0.229795902967453, "learning_rate": 0.00027734234234234233, "loss": 0.4814, "step": 513 }, { "epoch": 0.15409406033352072, "grad_norm": 0.2197142243385315, "learning_rate": 0.00027729729729729727, "loss": 0.4617, "step": 514 }, { "epoch": 0.15439385422522017, "grad_norm": 0.23570996522903442, "learning_rate": 0.0002772522522522522, "loss": 0.4731, "step": 515 }, { "epoch": 0.15469364811691963, "grad_norm": 0.23566411435604095, "learning_rate": 0.0002772072072072072, "loss": 0.4921, "step": 516 }, { "epoch": 0.1549934420086191, "grad_norm": 0.21966999769210815, "learning_rate": 0.00027716216216216213, "loss": 0.4683, "step": 517 }, { "epoch": 0.15529323590031854, "grad_norm": 0.2531338036060333, "learning_rate": 0.00027711711711711707, "loss": 0.5254, "step": 518 }, { "epoch": 0.155593029792018, "grad_norm": 0.2375670224428177, "learning_rate": 0.00027707207207207206, "loss": 0.4988, "step": 519 }, { "epoch": 0.15589282368371746, "grad_norm": 0.2455272376537323, "learning_rate": 0.000277027027027027, "loss": 0.501, "step": 520 }, { "epoch": 0.15619261757541691, "grad_norm": 0.21289831399917603, "learning_rate": 0.00027698198198198193, "loss": 0.4575, "step": 521 }, { "epoch": 0.15649241146711637, "grad_norm": 0.2653936743736267, "learning_rate": 0.0002769369369369369, "loss": 0.5251, "step": 522 }, { "epoch": 0.1567922053588158, "grad_norm": 0.23822923004627228, "learning_rate": 0.00027689189189189186, "loss": 0.5095, "step": 523 }, { "epoch": 0.15709199925051526, "grad_norm": 0.25067201256752014, "learning_rate": 0.0002768468468468468, "loss": 0.4841, "step": 524 }, { "epoch": 0.15739179314221471, "grad_norm": 0.2340254783630371, "learning_rate": 0.0002768018018018018, "loss": 0.4959, "step": 525 }, { "epoch": 0.15769158703391417, "grad_norm": 0.2431899458169937, "learning_rate": 0.0002767567567567567, "loss": 0.5035, "step": 526 }, { "epoch": 0.15799138092561363, "grad_norm": 0.22817112505435944, "learning_rate": 0.00027671171171171166, "loss": 0.49, "step": 527 }, { "epoch": 0.15829117481731309, "grad_norm": 0.21927404403686523, "learning_rate": 0.00027666666666666665, "loss": 0.4785, "step": 528 }, { "epoch": 0.15859096870901254, "grad_norm": 0.2402762919664383, "learning_rate": 0.0002766216216216216, "loss": 0.4799, "step": 529 }, { "epoch": 0.158890762600712, "grad_norm": 0.2559228241443634, "learning_rate": 0.0002765765765765765, "loss": 0.5065, "step": 530 }, { "epoch": 0.15919055649241146, "grad_norm": 0.22883668541908264, "learning_rate": 0.0002765315315315315, "loss": 0.4797, "step": 531 }, { "epoch": 0.1594903503841109, "grad_norm": 0.24328212440013885, "learning_rate": 0.00027648648648648645, "loss": 0.4796, "step": 532 }, { "epoch": 0.15979014427581037, "grad_norm": 0.2543148398399353, "learning_rate": 0.0002764414414414414, "loss": 0.4785, "step": 533 }, { "epoch": 0.16008993816750983, "grad_norm": 0.24784719944000244, "learning_rate": 0.0002763963963963964, "loss": 0.4981, "step": 534 }, { "epoch": 0.16038973205920928, "grad_norm": 0.24210456013679504, "learning_rate": 0.0002763513513513513, "loss": 0.4939, "step": 535 }, { "epoch": 0.16068952595090874, "grad_norm": 0.22924496233463287, "learning_rate": 0.00027630630630630625, "loss": 0.4524, "step": 536 }, { "epoch": 0.1609893198426082, "grad_norm": 0.270022451877594, "learning_rate": 0.00027626126126126124, "loss": 0.5184, "step": 537 }, { "epoch": 0.16128911373430765, "grad_norm": 0.2689591646194458, "learning_rate": 0.0002762162162162162, "loss": 0.4966, "step": 538 }, { "epoch": 0.1615889076260071, "grad_norm": 0.23465842008590698, "learning_rate": 0.00027617117117117117, "loss": 0.4865, "step": 539 }, { "epoch": 0.16188870151770657, "grad_norm": 0.23281508684158325, "learning_rate": 0.0002761261261261261, "loss": 0.4859, "step": 540 }, { "epoch": 0.16218849540940602, "grad_norm": 0.25370529294013977, "learning_rate": 0.00027608108108108105, "loss": 0.4609, "step": 541 }, { "epoch": 0.16248828930110548, "grad_norm": 0.2646511495113373, "learning_rate": 0.00027603603603603604, "loss": 0.4967, "step": 542 }, { "epoch": 0.16278808319280494, "grad_norm": 0.22836188971996307, "learning_rate": 0.000275990990990991, "loss": 0.4982, "step": 543 }, { "epoch": 0.1630878770845044, "grad_norm": 0.22948142886161804, "learning_rate": 0.0002759459459459459, "loss": 0.471, "step": 544 }, { "epoch": 0.16338767097620385, "grad_norm": 0.2623734474182129, "learning_rate": 0.0002759009009009009, "loss": 0.4938, "step": 545 }, { "epoch": 0.1636874648679033, "grad_norm": 0.2337695211172104, "learning_rate": 0.00027585585585585584, "loss": 0.4584, "step": 546 }, { "epoch": 0.16398725875960277, "grad_norm": 0.2507021129131317, "learning_rate": 0.0002758108108108108, "loss": 0.5002, "step": 547 }, { "epoch": 0.16428705265130222, "grad_norm": 0.23930178582668304, "learning_rate": 0.00027576576576576577, "loss": 0.4724, "step": 548 }, { "epoch": 0.16458684654300168, "grad_norm": 0.24984320998191833, "learning_rate": 0.0002757207207207207, "loss": 0.4768, "step": 549 }, { "epoch": 0.16488664043470114, "grad_norm": 0.2434365600347519, "learning_rate": 0.00027567567567567564, "loss": 0.4667, "step": 550 }, { "epoch": 0.1651864343264006, "grad_norm": 0.22952896356582642, "learning_rate": 0.00027563063063063063, "loss": 0.4624, "step": 551 }, { "epoch": 0.16548622821810005, "grad_norm": 0.2372165471315384, "learning_rate": 0.00027558558558558557, "loss": 0.477, "step": 552 }, { "epoch": 0.1657860221097995, "grad_norm": 0.24741259217262268, "learning_rate": 0.0002755405405405405, "loss": 0.5077, "step": 553 }, { "epoch": 0.16608581600149896, "grad_norm": 0.2387109249830246, "learning_rate": 0.0002754954954954955, "loss": 0.4817, "step": 554 }, { "epoch": 0.16638560989319842, "grad_norm": 0.24962367117404938, "learning_rate": 0.00027545045045045043, "loss": 0.4775, "step": 555 }, { "epoch": 0.16668540378489788, "grad_norm": 0.2375505119562149, "learning_rate": 0.00027540540540540537, "loss": 0.4843, "step": 556 }, { "epoch": 0.16698519767659734, "grad_norm": 0.24189910292625427, "learning_rate": 0.00027536036036036036, "loss": 0.4952, "step": 557 }, { "epoch": 0.1672849915682968, "grad_norm": 0.2314407229423523, "learning_rate": 0.0002753153153153153, "loss": 0.4676, "step": 558 }, { "epoch": 0.16758478545999625, "grad_norm": 0.24112465977668762, "learning_rate": 0.00027527027027027023, "loss": 0.479, "step": 559 }, { "epoch": 0.1678845793516957, "grad_norm": 0.22687260806560516, "learning_rate": 0.0002752252252252252, "loss": 0.4651, "step": 560 }, { "epoch": 0.16818437324339516, "grad_norm": 0.23146574199199677, "learning_rate": 0.00027518018018018016, "loss": 0.4724, "step": 561 }, { "epoch": 0.16848416713509462, "grad_norm": 0.23164650797843933, "learning_rate": 0.0002751351351351351, "loss": 0.4595, "step": 562 }, { "epoch": 0.16878396102679408, "grad_norm": 0.2290349006652832, "learning_rate": 0.0002750900900900901, "loss": 0.469, "step": 563 }, { "epoch": 0.16908375491849353, "grad_norm": 0.22324185073375702, "learning_rate": 0.000275045045045045, "loss": 0.4616, "step": 564 }, { "epoch": 0.169383548810193, "grad_norm": 0.2320687472820282, "learning_rate": 0.00027499999999999996, "loss": 0.4706, "step": 565 }, { "epoch": 0.16968334270189245, "grad_norm": 0.2461112141609192, "learning_rate": 0.00027495495495495495, "loss": 0.4886, "step": 566 }, { "epoch": 0.1699831365935919, "grad_norm": 0.22541654109954834, "learning_rate": 0.0002749099099099099, "loss": 0.4676, "step": 567 }, { "epoch": 0.17028293048529136, "grad_norm": 0.24664641916751862, "learning_rate": 0.0002748648648648648, "loss": 0.509, "step": 568 }, { "epoch": 0.17058272437699082, "grad_norm": 0.23051698505878448, "learning_rate": 0.0002748198198198198, "loss": 0.4742, "step": 569 }, { "epoch": 0.17088251826869028, "grad_norm": 0.21268148720264435, "learning_rate": 0.00027477477477477475, "loss": 0.4536, "step": 570 }, { "epoch": 0.17118231216038973, "grad_norm": 0.25143638253211975, "learning_rate": 0.0002747297297297297, "loss": 0.5233, "step": 571 }, { "epoch": 0.1714821060520892, "grad_norm": 0.21673695743083954, "learning_rate": 0.0002746846846846847, "loss": 0.445, "step": 572 }, { "epoch": 0.17178189994378865, "grad_norm": 0.24307781457901, "learning_rate": 0.0002746396396396396, "loss": 0.493, "step": 573 }, { "epoch": 0.1720816938354881, "grad_norm": 0.24256987869739532, "learning_rate": 0.00027459459459459456, "loss": 0.5249, "step": 574 }, { "epoch": 0.17238148772718756, "grad_norm": 0.23426513373851776, "learning_rate": 0.00027454954954954955, "loss": 0.4956, "step": 575 }, { "epoch": 0.17268128161888702, "grad_norm": 0.23137056827545166, "learning_rate": 0.0002745045045045045, "loss": 0.4909, "step": 576 }, { "epoch": 0.17298107551058647, "grad_norm": 0.22946982085704803, "learning_rate": 0.0002744594594594594, "loss": 0.4733, "step": 577 }, { "epoch": 0.17328086940228593, "grad_norm": 0.23843489587306976, "learning_rate": 0.00027441441441441436, "loss": 0.4944, "step": 578 }, { "epoch": 0.1735806632939854, "grad_norm": 0.21571891009807587, "learning_rate": 0.00027436936936936935, "loss": 0.4488, "step": 579 }, { "epoch": 0.17388045718568484, "grad_norm": 0.25007542967796326, "learning_rate": 0.0002743243243243243, "loss": 0.488, "step": 580 }, { "epoch": 0.1741802510773843, "grad_norm": 0.24017852544784546, "learning_rate": 0.0002742792792792792, "loss": 0.4969, "step": 581 }, { "epoch": 0.17448004496908376, "grad_norm": 0.23361638188362122, "learning_rate": 0.0002742342342342342, "loss": 0.5348, "step": 582 }, { "epoch": 0.17477983886078322, "grad_norm": 0.22652795910835266, "learning_rate": 0.00027418918918918915, "loss": 0.4776, "step": 583 }, { "epoch": 0.17507963275248267, "grad_norm": 0.23973309993743896, "learning_rate": 0.0002741441441441441, "loss": 0.4803, "step": 584 }, { "epoch": 0.17537942664418213, "grad_norm": 0.23487752676010132, "learning_rate": 0.0002740990990990991, "loss": 0.482, "step": 585 }, { "epoch": 0.1756792205358816, "grad_norm": 0.2464274764060974, "learning_rate": 0.000274054054054054, "loss": 0.5167, "step": 586 }, { "epoch": 0.17597901442758104, "grad_norm": 0.2280922681093216, "learning_rate": 0.00027400900900900895, "loss": 0.4941, "step": 587 }, { "epoch": 0.1762788083192805, "grad_norm": 0.24017813801765442, "learning_rate": 0.00027396396396396394, "loss": 0.4721, "step": 588 }, { "epoch": 0.17657860221097996, "grad_norm": 0.24262118339538574, "learning_rate": 0.0002739189189189189, "loss": 0.5053, "step": 589 }, { "epoch": 0.1768783961026794, "grad_norm": 0.24060070514678955, "learning_rate": 0.0002738738738738738, "loss": 0.4932, "step": 590 }, { "epoch": 0.17717818999437887, "grad_norm": 0.2486894428730011, "learning_rate": 0.0002738288288288288, "loss": 0.4963, "step": 591 }, { "epoch": 0.17747798388607833, "grad_norm": 0.22934255003929138, "learning_rate": 0.00027378378378378374, "loss": 0.4911, "step": 592 }, { "epoch": 0.17777777777777778, "grad_norm": 0.23473136126995087, "learning_rate": 0.0002737387387387387, "loss": 0.4967, "step": 593 }, { "epoch": 0.17807757166947724, "grad_norm": 0.24307146668434143, "learning_rate": 0.00027369369369369367, "loss": 0.5051, "step": 594 }, { "epoch": 0.1783773655611767, "grad_norm": 0.25658494234085083, "learning_rate": 0.0002736486486486486, "loss": 0.4959, "step": 595 }, { "epoch": 0.17867715945287616, "grad_norm": 0.23326924443244934, "learning_rate": 0.0002736036036036036, "loss": 0.5024, "step": 596 }, { "epoch": 0.1789769533445756, "grad_norm": 0.2539668083190918, "learning_rate": 0.00027355855855855854, "loss": 0.5059, "step": 597 }, { "epoch": 0.17927674723627507, "grad_norm": 0.24097499251365662, "learning_rate": 0.00027351351351351347, "loss": 0.4888, "step": 598 }, { "epoch": 0.17957654112797453, "grad_norm": 0.24816173315048218, "learning_rate": 0.00027346846846846846, "loss": 0.4943, "step": 599 }, { "epoch": 0.17987633501967398, "grad_norm": 0.25391021370887756, "learning_rate": 0.0002734234234234234, "loss": 0.4909, "step": 600 }, { "epoch": 0.18017612891137344, "grad_norm": 0.25449302792549133, "learning_rate": 0.00027337837837837834, "loss": 0.499, "step": 601 }, { "epoch": 0.1804759228030729, "grad_norm": 0.2581718862056732, "learning_rate": 0.00027333333333333333, "loss": 0.5093, "step": 602 }, { "epoch": 0.18077571669477235, "grad_norm": 0.2509480118751526, "learning_rate": 0.00027328828828828826, "loss": 0.4974, "step": 603 }, { "epoch": 0.1810755105864718, "grad_norm": 0.23391370475292206, "learning_rate": 0.0002732432432432432, "loss": 0.5176, "step": 604 }, { "epoch": 0.18137530447817127, "grad_norm": 0.2365102618932724, "learning_rate": 0.0002731981981981982, "loss": 0.5021, "step": 605 }, { "epoch": 0.18167509836987072, "grad_norm": 0.23687691986560822, "learning_rate": 0.00027315315315315313, "loss": 0.4728, "step": 606 }, { "epoch": 0.18197489226157018, "grad_norm": 0.24404986202716827, "learning_rate": 0.00027310810810810807, "loss": 0.4842, "step": 607 }, { "epoch": 0.18227468615326964, "grad_norm": 0.2473643720149994, "learning_rate": 0.00027306306306306306, "loss": 0.4823, "step": 608 }, { "epoch": 0.1825744800449691, "grad_norm": 0.22777344286441803, "learning_rate": 0.000273018018018018, "loss": 0.4995, "step": 609 }, { "epoch": 0.18287427393666855, "grad_norm": 0.22545696794986725, "learning_rate": 0.000272972972972973, "loss": 0.4634, "step": 610 }, { "epoch": 0.183174067828368, "grad_norm": 0.2380336970090866, "learning_rate": 0.0002729279279279279, "loss": 0.4831, "step": 611 }, { "epoch": 0.18347386172006747, "grad_norm": 0.22387194633483887, "learning_rate": 0.00027288288288288286, "loss": 0.4567, "step": 612 }, { "epoch": 0.18377365561176692, "grad_norm": 0.2482718676328659, "learning_rate": 0.00027283783783783785, "loss": 0.4974, "step": 613 }, { "epoch": 0.18407344950346638, "grad_norm": 0.2622338533401489, "learning_rate": 0.0002727927927927928, "loss": 0.5004, "step": 614 }, { "epoch": 0.18437324339516584, "grad_norm": 0.22443141043186188, "learning_rate": 0.0002727477477477477, "loss": 0.4717, "step": 615 }, { "epoch": 0.1846730372868653, "grad_norm": 0.23443573713302612, "learning_rate": 0.0002727027027027027, "loss": 0.5023, "step": 616 }, { "epoch": 0.18497283117856475, "grad_norm": 0.2201087921857834, "learning_rate": 0.00027265765765765765, "loss": 0.4812, "step": 617 }, { "epoch": 0.18527262507026418, "grad_norm": 0.24011440575122833, "learning_rate": 0.0002726126126126126, "loss": 0.4865, "step": 618 }, { "epoch": 0.18557241896196364, "grad_norm": 0.2697189450263977, "learning_rate": 0.0002725675675675676, "loss": 0.5256, "step": 619 }, { "epoch": 0.1858722128536631, "grad_norm": 0.22377456724643707, "learning_rate": 0.0002725225225225225, "loss": 0.4701, "step": 620 }, { "epoch": 0.18617200674536255, "grad_norm": 0.2551979720592499, "learning_rate": 0.00027247747747747745, "loss": 0.4878, "step": 621 }, { "epoch": 0.186471800637062, "grad_norm": 0.2368023544549942, "learning_rate": 0.00027243243243243244, "loss": 0.4768, "step": 622 }, { "epoch": 0.18677159452876146, "grad_norm": 0.23817569017410278, "learning_rate": 0.0002723873873873874, "loss": 0.4914, "step": 623 }, { "epoch": 0.18707138842046092, "grad_norm": 0.23484331369400024, "learning_rate": 0.0002723423423423423, "loss": 0.4896, "step": 624 }, { "epoch": 0.18737118231216038, "grad_norm": 0.2473037838935852, "learning_rate": 0.0002722972972972973, "loss": 0.4841, "step": 625 }, { "epoch": 0.18767097620385984, "grad_norm": 0.2387157678604126, "learning_rate": 0.00027225225225225224, "loss": 0.4913, "step": 626 }, { "epoch": 0.1879707700955593, "grad_norm": 0.2485678642988205, "learning_rate": 0.0002722072072072072, "loss": 0.5155, "step": 627 }, { "epoch": 0.18827056398725875, "grad_norm": 0.22908784449100494, "learning_rate": 0.0002721621621621621, "loss": 0.4874, "step": 628 }, { "epoch": 0.1885703578789582, "grad_norm": 0.22057555615901947, "learning_rate": 0.0002721171171171171, "loss": 0.4846, "step": 629 }, { "epoch": 0.18887015177065766, "grad_norm": 0.23972582817077637, "learning_rate": 0.00027207207207207204, "loss": 0.4785, "step": 630 }, { "epoch": 0.18916994566235712, "grad_norm": 0.2453726977109909, "learning_rate": 0.000272027027027027, "loss": 0.4855, "step": 631 }, { "epoch": 0.18946973955405658, "grad_norm": 0.23710183799266815, "learning_rate": 0.00027198198198198197, "loss": 0.4979, "step": 632 }, { "epoch": 0.18976953344575603, "grad_norm": 0.24524274468421936, "learning_rate": 0.0002719369369369369, "loss": 0.4806, "step": 633 }, { "epoch": 0.1900693273374555, "grad_norm": 0.22835515439510345, "learning_rate": 0.00027189189189189185, "loss": 0.4746, "step": 634 }, { "epoch": 0.19036912122915495, "grad_norm": 0.23380154371261597, "learning_rate": 0.00027184684684684684, "loss": 0.4931, "step": 635 }, { "epoch": 0.1906689151208544, "grad_norm": 0.22659477591514587, "learning_rate": 0.0002718018018018018, "loss": 0.4443, "step": 636 }, { "epoch": 0.19096870901255386, "grad_norm": 0.22725367546081543, "learning_rate": 0.0002717567567567567, "loss": 0.4888, "step": 637 }, { "epoch": 0.19126850290425332, "grad_norm": 0.233082115650177, "learning_rate": 0.0002717117117117117, "loss": 0.4752, "step": 638 }, { "epoch": 0.19156829679595277, "grad_norm": 0.22560617327690125, "learning_rate": 0.00027166666666666664, "loss": 0.4585, "step": 639 }, { "epoch": 0.19186809068765223, "grad_norm": 0.22963936626911163, "learning_rate": 0.0002716216216216216, "loss": 0.4774, "step": 640 }, { "epoch": 0.1921678845793517, "grad_norm": 0.2543715238571167, "learning_rate": 0.00027157657657657657, "loss": 0.4896, "step": 641 }, { "epoch": 0.19246767847105115, "grad_norm": 0.24594075977802277, "learning_rate": 0.0002715315315315315, "loss": 0.4753, "step": 642 }, { "epoch": 0.1927674723627506, "grad_norm": 0.2333337366580963, "learning_rate": 0.00027148648648648644, "loss": 0.4759, "step": 643 }, { "epoch": 0.19306726625445006, "grad_norm": 0.23100800812244415, "learning_rate": 0.00027144144144144143, "loss": 0.4536, "step": 644 }, { "epoch": 0.19336706014614952, "grad_norm": 0.26426073908805847, "learning_rate": 0.00027139639639639637, "loss": 0.4882, "step": 645 }, { "epoch": 0.19366685403784897, "grad_norm": 0.22670197486877441, "learning_rate": 0.0002713513513513513, "loss": 0.473, "step": 646 }, { "epoch": 0.19396664792954843, "grad_norm": 0.23645341396331787, "learning_rate": 0.0002713063063063063, "loss": 0.4964, "step": 647 }, { "epoch": 0.1942664418212479, "grad_norm": 0.2535463869571686, "learning_rate": 0.00027126126126126123, "loss": 0.5001, "step": 648 }, { "epoch": 0.19456623571294734, "grad_norm": 0.25858014822006226, "learning_rate": 0.00027121621621621617, "loss": 0.5007, "step": 649 }, { "epoch": 0.1948660296046468, "grad_norm": 0.2457359880208969, "learning_rate": 0.0002711711711711711, "loss": 0.4849, "step": 650 }, { "epoch": 0.19516582349634626, "grad_norm": 0.25070154666900635, "learning_rate": 0.0002711261261261261, "loss": 0.4888, "step": 651 }, { "epoch": 0.19546561738804571, "grad_norm": 0.22972378134727478, "learning_rate": 0.00027108108108108103, "loss": 0.4719, "step": 652 }, { "epoch": 0.19576541127974517, "grad_norm": 0.23261764645576477, "learning_rate": 0.000271036036036036, "loss": 0.4633, "step": 653 }, { "epoch": 0.19606520517144463, "grad_norm": 0.25028276443481445, "learning_rate": 0.00027099099099099096, "loss": 0.5209, "step": 654 }, { "epoch": 0.19636499906314409, "grad_norm": 0.23486949503421783, "learning_rate": 0.0002709459459459459, "loss": 0.4757, "step": 655 }, { "epoch": 0.19666479295484354, "grad_norm": 0.2393907606601715, "learning_rate": 0.0002709009009009009, "loss": 0.464, "step": 656 }, { "epoch": 0.196964586846543, "grad_norm": 0.24317681789398193, "learning_rate": 0.0002708558558558558, "loss": 0.4843, "step": 657 }, { "epoch": 0.19726438073824246, "grad_norm": 0.23742252588272095, "learning_rate": 0.00027081081081081076, "loss": 0.5025, "step": 658 }, { "epoch": 0.1975641746299419, "grad_norm": 0.23426878452301025, "learning_rate": 0.00027076576576576575, "loss": 0.4776, "step": 659 }, { "epoch": 0.19786396852164137, "grad_norm": 0.250949501991272, "learning_rate": 0.0002707207207207207, "loss": 0.4968, "step": 660 }, { "epoch": 0.19816376241334083, "grad_norm": 0.23000746965408325, "learning_rate": 0.0002706756756756756, "loss": 0.4895, "step": 661 }, { "epoch": 0.19846355630504028, "grad_norm": 0.26243215799331665, "learning_rate": 0.0002706306306306306, "loss": 0.5076, "step": 662 }, { "epoch": 0.19876335019673974, "grad_norm": 0.22115159034729004, "learning_rate": 0.00027058558558558555, "loss": 0.4823, "step": 663 }, { "epoch": 0.1990631440884392, "grad_norm": 0.22618655860424042, "learning_rate": 0.0002705405405405405, "loss": 0.4994, "step": 664 }, { "epoch": 0.19936293798013865, "grad_norm": 0.22989815473556519, "learning_rate": 0.0002704954954954955, "loss": 0.4701, "step": 665 }, { "epoch": 0.1996627318718381, "grad_norm": 0.24214977025985718, "learning_rate": 0.0002704504504504504, "loss": 0.4806, "step": 666 }, { "epoch": 0.19996252576353757, "grad_norm": 0.21489010751247406, "learning_rate": 0.0002704054054054054, "loss": 0.4511, "step": 667 }, { "epoch": 0.20026231965523703, "grad_norm": 0.2397059053182602, "learning_rate": 0.00027036036036036035, "loss": 0.4427, "step": 668 }, { "epoch": 0.20056211354693648, "grad_norm": 0.2419203370809555, "learning_rate": 0.0002703153153153153, "loss": 0.4993, "step": 669 }, { "epoch": 0.20086190743863594, "grad_norm": 0.24709810316562653, "learning_rate": 0.0002702702702702703, "loss": 0.5195, "step": 670 }, { "epoch": 0.2011617013303354, "grad_norm": 0.25068792700767517, "learning_rate": 0.0002702252252252252, "loss": 0.508, "step": 671 }, { "epoch": 0.20146149522203485, "grad_norm": 0.210756316781044, "learning_rate": 0.00027018018018018015, "loss": 0.4413, "step": 672 }, { "epoch": 0.2017612891137343, "grad_norm": 0.2557854652404785, "learning_rate": 0.00027013513513513514, "loss": 0.4971, "step": 673 }, { "epoch": 0.20206108300543377, "grad_norm": 0.23103776574134827, "learning_rate": 0.0002700900900900901, "loss": 0.4683, "step": 674 }, { "epoch": 0.20236087689713322, "grad_norm": 0.21560733020305634, "learning_rate": 0.000270045045045045, "loss": 0.4443, "step": 675 }, { "epoch": 0.20266067078883268, "grad_norm": 0.2477121204137802, "learning_rate": 0.00027, "loss": 0.5131, "step": 676 }, { "epoch": 0.20296046468053214, "grad_norm": 0.24966078996658325, "learning_rate": 0.00026995495495495494, "loss": 0.4856, "step": 677 }, { "epoch": 0.2032602585722316, "grad_norm": 0.23841539025306702, "learning_rate": 0.0002699099099099099, "loss": 0.4658, "step": 678 }, { "epoch": 0.20356005246393105, "grad_norm": 0.2685762047767639, "learning_rate": 0.00026986486486486487, "loss": 0.5096, "step": 679 }, { "epoch": 0.2038598463556305, "grad_norm": 0.25834083557128906, "learning_rate": 0.0002698198198198198, "loss": 0.5036, "step": 680 }, { "epoch": 0.20415964024732997, "grad_norm": 0.2324528843164444, "learning_rate": 0.00026977477477477474, "loss": 0.4618, "step": 681 }, { "epoch": 0.20445943413902942, "grad_norm": 0.22903920710086823, "learning_rate": 0.00026972972972972973, "loss": 0.4662, "step": 682 }, { "epoch": 0.20475922803072888, "grad_norm": 0.24908147752285004, "learning_rate": 0.00026968468468468467, "loss": 0.4684, "step": 683 }, { "epoch": 0.20505902192242834, "grad_norm": 0.2278299629688263, "learning_rate": 0.0002696396396396396, "loss": 0.4447, "step": 684 }, { "epoch": 0.2053588158141278, "grad_norm": 0.2315731793642044, "learning_rate": 0.0002695945945945946, "loss": 0.4875, "step": 685 }, { "epoch": 0.20565860970582725, "grad_norm": 0.23152673244476318, "learning_rate": 0.00026954954954954953, "loss": 0.4796, "step": 686 }, { "epoch": 0.2059584035975267, "grad_norm": 0.23902982473373413, "learning_rate": 0.00026950450450450447, "loss": 0.5169, "step": 687 }, { "epoch": 0.20625819748922616, "grad_norm": 0.23636193573474884, "learning_rate": 0.00026945945945945946, "loss": 0.493, "step": 688 }, { "epoch": 0.20655799138092562, "grad_norm": 0.21632736921310425, "learning_rate": 0.0002694144144144144, "loss": 0.4594, "step": 689 }, { "epoch": 0.20685778527262508, "grad_norm": 0.2258147895336151, "learning_rate": 0.00026936936936936934, "loss": 0.4625, "step": 690 }, { "epoch": 0.20715757916432453, "grad_norm": 0.21552099287509918, "learning_rate": 0.0002693243243243243, "loss": 0.4481, "step": 691 }, { "epoch": 0.207457373056024, "grad_norm": 0.23030760884284973, "learning_rate": 0.00026927927927927926, "loss": 0.4644, "step": 692 }, { "epoch": 0.20775716694772345, "grad_norm": 0.23163190484046936, "learning_rate": 0.0002692342342342342, "loss": 0.4483, "step": 693 }, { "epoch": 0.2080569608394229, "grad_norm": 0.2412249743938446, "learning_rate": 0.0002691891891891892, "loss": 0.4886, "step": 694 }, { "epoch": 0.20835675473112236, "grad_norm": 0.23279330134391785, "learning_rate": 0.00026914414414414413, "loss": 0.4733, "step": 695 }, { "epoch": 0.20865654862282182, "grad_norm": 0.2269987165927887, "learning_rate": 0.00026909909909909906, "loss": 0.4866, "step": 696 }, { "epoch": 0.20895634251452128, "grad_norm": 0.23355835676193237, "learning_rate": 0.00026905405405405406, "loss": 0.4918, "step": 697 }, { "epoch": 0.20925613640622073, "grad_norm": 0.26988187432289124, "learning_rate": 0.000269009009009009, "loss": 0.4953, "step": 698 }, { "epoch": 0.2095559302979202, "grad_norm": 0.22978806495666504, "learning_rate": 0.00026896396396396393, "loss": 0.4622, "step": 699 }, { "epoch": 0.20985572418961965, "grad_norm": 0.2823212146759033, "learning_rate": 0.00026891891891891887, "loss": 0.5105, "step": 700 }, { "epoch": 0.2101555180813191, "grad_norm": 0.23818424344062805, "learning_rate": 0.00026887387387387386, "loss": 0.479, "step": 701 }, { "epoch": 0.21045531197301856, "grad_norm": 0.23730318248271942, "learning_rate": 0.0002688288288288288, "loss": 0.4991, "step": 702 }, { "epoch": 0.21075510586471802, "grad_norm": 0.23275551199913025, "learning_rate": 0.00026878378378378373, "loss": 0.466, "step": 703 }, { "epoch": 0.21105489975641747, "grad_norm": 0.2296077013015747, "learning_rate": 0.0002687387387387387, "loss": 0.4964, "step": 704 }, { "epoch": 0.21135469364811693, "grad_norm": 0.24341174960136414, "learning_rate": 0.00026869369369369366, "loss": 0.505, "step": 705 }, { "epoch": 0.2116544875398164, "grad_norm": 0.22542104125022888, "learning_rate": 0.0002686486486486486, "loss": 0.4533, "step": 706 }, { "epoch": 0.21195428143151585, "grad_norm": 0.22414691746234894, "learning_rate": 0.0002686036036036036, "loss": 0.4504, "step": 707 }, { "epoch": 0.2122540753232153, "grad_norm": 0.2119213044643402, "learning_rate": 0.0002685585585585585, "loss": 0.4457, "step": 708 }, { "epoch": 0.21255386921491476, "grad_norm": 0.21771720051765442, "learning_rate": 0.00026851351351351346, "loss": 0.4411, "step": 709 }, { "epoch": 0.21285366310661422, "grad_norm": 0.2484733760356903, "learning_rate": 0.00026846846846846845, "loss": 0.5016, "step": 710 }, { "epoch": 0.21315345699831367, "grad_norm": 0.23043473064899445, "learning_rate": 0.0002684234234234234, "loss": 0.4717, "step": 711 }, { "epoch": 0.21345325089001313, "grad_norm": 0.2769162952899933, "learning_rate": 0.0002683783783783783, "loss": 0.5135, "step": 712 }, { "epoch": 0.21375304478171256, "grad_norm": 0.22561033070087433, "learning_rate": 0.0002683333333333333, "loss": 0.4928, "step": 713 }, { "epoch": 0.21405283867341202, "grad_norm": 0.2365075945854187, "learning_rate": 0.00026828828828828825, "loss": 0.4843, "step": 714 }, { "epoch": 0.21435263256511147, "grad_norm": 0.23628349602222443, "learning_rate": 0.0002682432432432432, "loss": 0.4782, "step": 715 }, { "epoch": 0.21465242645681093, "grad_norm": 0.22449685633182526, "learning_rate": 0.0002681981981981982, "loss": 0.4533, "step": 716 }, { "epoch": 0.2149522203485104, "grad_norm": 0.23336151242256165, "learning_rate": 0.0002681531531531531, "loss": 0.4818, "step": 717 }, { "epoch": 0.21525201424020984, "grad_norm": 0.2387206107378006, "learning_rate": 0.00026810810810810805, "loss": 0.4537, "step": 718 }, { "epoch": 0.2155518081319093, "grad_norm": 0.23359011113643646, "learning_rate": 0.00026806306306306304, "loss": 0.4895, "step": 719 }, { "epoch": 0.21585160202360876, "grad_norm": 0.240494042634964, "learning_rate": 0.000268018018018018, "loss": 0.4878, "step": 720 }, { "epoch": 0.21615139591530821, "grad_norm": 0.23335424065589905, "learning_rate": 0.0002679729729729729, "loss": 0.4747, "step": 721 }, { "epoch": 0.21645118980700767, "grad_norm": 0.2620643079280853, "learning_rate": 0.0002679279279279279, "loss": 0.4968, "step": 722 }, { "epoch": 0.21675098369870713, "grad_norm": 0.2350034862756729, "learning_rate": 0.00026788288288288284, "loss": 0.4801, "step": 723 }, { "epoch": 0.21705077759040659, "grad_norm": 0.2358752340078354, "learning_rate": 0.00026783783783783784, "loss": 0.5265, "step": 724 }, { "epoch": 0.21735057148210604, "grad_norm": 0.2392471730709076, "learning_rate": 0.00026779279279279277, "loss": 0.4667, "step": 725 }, { "epoch": 0.2176503653738055, "grad_norm": 0.23733973503112793, "learning_rate": 0.0002677477477477477, "loss": 0.4712, "step": 726 }, { "epoch": 0.21795015926550496, "grad_norm": 0.2224283516407013, "learning_rate": 0.0002677027027027027, "loss": 0.4519, "step": 727 }, { "epoch": 0.2182499531572044, "grad_norm": 0.22749215364456177, "learning_rate": 0.00026765765765765764, "loss": 0.4656, "step": 728 }, { "epoch": 0.21854974704890387, "grad_norm": 0.29321786761283875, "learning_rate": 0.0002676126126126126, "loss": 0.5085, "step": 729 }, { "epoch": 0.21884954094060333, "grad_norm": 0.23674741387367249, "learning_rate": 0.00026756756756756756, "loss": 0.4958, "step": 730 }, { "epoch": 0.21914933483230278, "grad_norm": 0.21558566391468048, "learning_rate": 0.0002675225225225225, "loss": 0.4679, "step": 731 }, { "epoch": 0.21944912872400224, "grad_norm": 0.2383924126625061, "learning_rate": 0.00026747747747747744, "loss": 0.4827, "step": 732 }, { "epoch": 0.2197489226157017, "grad_norm": 0.23788924515247345, "learning_rate": 0.00026743243243243243, "loss": 0.486, "step": 733 }, { "epoch": 0.22004871650740115, "grad_norm": 0.23550404608249664, "learning_rate": 0.00026738738738738737, "loss": 0.4603, "step": 734 }, { "epoch": 0.2203485103991006, "grad_norm": 0.2342066466808319, "learning_rate": 0.00026734234234234236, "loss": 0.4591, "step": 735 }, { "epoch": 0.22064830429080007, "grad_norm": 0.25759053230285645, "learning_rate": 0.0002672972972972973, "loss": 0.483, "step": 736 }, { "epoch": 0.22094809818249952, "grad_norm": 0.22325725853443146, "learning_rate": 0.00026725225225225223, "loss": 0.4609, "step": 737 }, { "epoch": 0.22124789207419898, "grad_norm": 0.22235055267810822, "learning_rate": 0.0002672072072072072, "loss": 0.4512, "step": 738 }, { "epoch": 0.22154768596589844, "grad_norm": 0.23441246151924133, "learning_rate": 0.00026716216216216216, "loss": 0.4517, "step": 739 }, { "epoch": 0.2218474798575979, "grad_norm": 0.2520740330219269, "learning_rate": 0.0002671171171171171, "loss": 0.4712, "step": 740 }, { "epoch": 0.22214727374929735, "grad_norm": 0.22782452404499054, "learning_rate": 0.0002670720720720721, "loss": 0.4723, "step": 741 }, { "epoch": 0.2224470676409968, "grad_norm": 0.2406499981880188, "learning_rate": 0.000267027027027027, "loss": 0.4909, "step": 742 }, { "epoch": 0.22274686153269627, "grad_norm": 0.21733756363391876, "learning_rate": 0.00026698198198198196, "loss": 0.4402, "step": 743 }, { "epoch": 0.22304665542439572, "grad_norm": 0.2329728901386261, "learning_rate": 0.00026693693693693695, "loss": 0.4659, "step": 744 }, { "epoch": 0.22334644931609518, "grad_norm": 0.23359104990959167, "learning_rate": 0.0002668918918918919, "loss": 0.4848, "step": 745 }, { "epoch": 0.22364624320779464, "grad_norm": 0.23723845183849335, "learning_rate": 0.0002668468468468468, "loss": 0.4674, "step": 746 }, { "epoch": 0.2239460370994941, "grad_norm": 0.2128835916519165, "learning_rate": 0.00026680180180180176, "loss": 0.4617, "step": 747 }, { "epoch": 0.22424583099119355, "grad_norm": 0.2343822568655014, "learning_rate": 0.00026675675675675675, "loss": 0.4624, "step": 748 }, { "epoch": 0.224545624882893, "grad_norm": 0.24932916462421417, "learning_rate": 0.0002667117117117117, "loss": 0.476, "step": 749 }, { "epoch": 0.22484541877459246, "grad_norm": 0.24181415140628815, "learning_rate": 0.0002666666666666666, "loss": 0.4681, "step": 750 }, { "epoch": 0.22514521266629192, "grad_norm": 0.23665620386600494, "learning_rate": 0.0002666216216216216, "loss": 0.4858, "step": 751 }, { "epoch": 0.22544500655799138, "grad_norm": 0.24904295802116394, "learning_rate": 0.00026657657657657655, "loss": 0.4957, "step": 752 }, { "epoch": 0.22574480044969084, "grad_norm": 0.2285979986190796, "learning_rate": 0.0002665315315315315, "loss": 0.4515, "step": 753 }, { "epoch": 0.2260445943413903, "grad_norm": 0.2505464553833008, "learning_rate": 0.0002664864864864865, "loss": 0.4884, "step": 754 }, { "epoch": 0.22634438823308975, "grad_norm": 0.22328858077526093, "learning_rate": 0.0002664414414414414, "loss": 0.4463, "step": 755 }, { "epoch": 0.2266441821247892, "grad_norm": 0.2543044984340668, "learning_rate": 0.00026639639639639635, "loss": 0.493, "step": 756 }, { "epoch": 0.22694397601648866, "grad_norm": 0.2348204255104065, "learning_rate": 0.00026635135135135135, "loss": 0.4511, "step": 757 }, { "epoch": 0.22724376990818812, "grad_norm": 0.25663718581199646, "learning_rate": 0.0002663063063063063, "loss": 0.5006, "step": 758 }, { "epoch": 0.22754356379988758, "grad_norm": 0.24245639145374298, "learning_rate": 0.0002662612612612612, "loss": 0.4687, "step": 759 }, { "epoch": 0.22784335769158703, "grad_norm": 0.2461511641740799, "learning_rate": 0.0002662162162162162, "loss": 0.5136, "step": 760 }, { "epoch": 0.2281431515832865, "grad_norm": 0.22325679659843445, "learning_rate": 0.00026617117117117115, "loss": 0.472, "step": 761 }, { "epoch": 0.22844294547498595, "grad_norm": 0.2652730345726013, "learning_rate": 0.0002661261261261261, "loss": 0.4732, "step": 762 }, { "epoch": 0.2287427393666854, "grad_norm": 0.24870134890079498, "learning_rate": 0.0002660810810810811, "loss": 0.467, "step": 763 }, { "epoch": 0.22904253325838486, "grad_norm": 0.23315280675888062, "learning_rate": 0.000266036036036036, "loss": 0.4994, "step": 764 }, { "epoch": 0.22934232715008432, "grad_norm": 0.23781219124794006, "learning_rate": 0.00026599099099099095, "loss": 0.4923, "step": 765 }, { "epoch": 0.22964212104178378, "grad_norm": 0.2272898256778717, "learning_rate": 0.00026594594594594594, "loss": 0.4683, "step": 766 }, { "epoch": 0.22994191493348323, "grad_norm": 0.2321631759405136, "learning_rate": 0.0002659009009009009, "loss": 0.5018, "step": 767 }, { "epoch": 0.2302417088251827, "grad_norm": 0.22698219120502472, "learning_rate": 0.0002658558558558558, "loss": 0.479, "step": 768 }, { "epoch": 0.23054150271688215, "grad_norm": 0.23421627283096313, "learning_rate": 0.0002658108108108108, "loss": 0.4613, "step": 769 }, { "epoch": 0.2308412966085816, "grad_norm": 0.21950644254684448, "learning_rate": 0.00026576576576576574, "loss": 0.4497, "step": 770 }, { "epoch": 0.23114109050028106, "grad_norm": 0.2207535058259964, "learning_rate": 0.0002657207207207207, "loss": 0.4777, "step": 771 }, { "epoch": 0.23144088439198052, "grad_norm": 0.22216112911701202, "learning_rate": 0.0002656756756756756, "loss": 0.4682, "step": 772 }, { "epoch": 0.23174067828367997, "grad_norm": 0.2619054317474365, "learning_rate": 0.0002656306306306306, "loss": 0.4791, "step": 773 }, { "epoch": 0.23204047217537943, "grad_norm": 0.2443225234746933, "learning_rate": 0.00026558558558558554, "loss": 0.4777, "step": 774 }, { "epoch": 0.2323402660670789, "grad_norm": 0.21427664160728455, "learning_rate": 0.0002655405405405405, "loss": 0.4407, "step": 775 }, { "epoch": 0.23264005995877834, "grad_norm": 0.21477638185024261, "learning_rate": 0.00026549549549549547, "loss": 0.4333, "step": 776 }, { "epoch": 0.2329398538504778, "grad_norm": 0.23390546441078186, "learning_rate": 0.0002654504504504504, "loss": 0.4839, "step": 777 }, { "epoch": 0.23323964774217726, "grad_norm": 0.2529938220977783, "learning_rate": 0.00026540540540540534, "loss": 0.4701, "step": 778 }, { "epoch": 0.23353944163387672, "grad_norm": 0.24290771782398224, "learning_rate": 0.00026536036036036033, "loss": 0.492, "step": 779 }, { "epoch": 0.23383923552557617, "grad_norm": 0.2573592960834503, "learning_rate": 0.00026531531531531527, "loss": 0.5105, "step": 780 }, { "epoch": 0.23413902941727563, "grad_norm": 0.25404611229896545, "learning_rate": 0.00026527027027027026, "loss": 0.5054, "step": 781 }, { "epoch": 0.2344388233089751, "grad_norm": 0.2394997775554657, "learning_rate": 0.0002652252252252252, "loss": 0.4786, "step": 782 }, { "epoch": 0.23473861720067454, "grad_norm": 0.2353266179561615, "learning_rate": 0.00026518018018018013, "loss": 0.4613, "step": 783 }, { "epoch": 0.235038411092374, "grad_norm": 0.22989432513713837, "learning_rate": 0.0002651351351351351, "loss": 0.4505, "step": 784 }, { "epoch": 0.23533820498407346, "grad_norm": 0.21917951107025146, "learning_rate": 0.00026509009009009006, "loss": 0.438, "step": 785 }, { "epoch": 0.2356379988757729, "grad_norm": 0.23011858761310577, "learning_rate": 0.000265045045045045, "loss": 0.4477, "step": 786 }, { "epoch": 0.23593779276747237, "grad_norm": 0.22732798755168915, "learning_rate": 0.000265, "loss": 0.454, "step": 787 }, { "epoch": 0.23623758665917183, "grad_norm": 0.22975054383277893, "learning_rate": 0.0002649549549549549, "loss": 0.4419, "step": 788 }, { "epoch": 0.23653738055087128, "grad_norm": 0.25520968437194824, "learning_rate": 0.00026490990990990986, "loss": 0.4987, "step": 789 }, { "epoch": 0.23683717444257074, "grad_norm": 0.2375541776418686, "learning_rate": 0.00026486486486486485, "loss": 0.473, "step": 790 }, { "epoch": 0.2371369683342702, "grad_norm": 0.2304588407278061, "learning_rate": 0.0002648198198198198, "loss": 0.4734, "step": 791 }, { "epoch": 0.23743676222596966, "grad_norm": 0.22878654301166534, "learning_rate": 0.0002647747747747748, "loss": 0.4684, "step": 792 }, { "epoch": 0.2377365561176691, "grad_norm": 0.25825339555740356, "learning_rate": 0.0002647297297297297, "loss": 0.5006, "step": 793 }, { "epoch": 0.23803635000936857, "grad_norm": 0.2332850843667984, "learning_rate": 0.00026468468468468466, "loss": 0.4603, "step": 794 }, { "epoch": 0.23833614390106803, "grad_norm": 0.23115694522857666, "learning_rate": 0.00026463963963963965, "loss": 0.4879, "step": 795 }, { "epoch": 0.23863593779276748, "grad_norm": 0.2409309297800064, "learning_rate": 0.0002645945945945946, "loss": 0.4662, "step": 796 }, { "epoch": 0.23893573168446694, "grad_norm": 0.23094283044338226, "learning_rate": 0.0002645495495495495, "loss": 0.4477, "step": 797 }, { "epoch": 0.2392355255761664, "grad_norm": 0.2324245125055313, "learning_rate": 0.0002645045045045045, "loss": 0.4539, "step": 798 }, { "epoch": 0.23953531946786585, "grad_norm": 0.23273488879203796, "learning_rate": 0.00026445945945945945, "loss": 0.4926, "step": 799 }, { "epoch": 0.2398351133595653, "grad_norm": 0.2254081666469574, "learning_rate": 0.0002644144144144144, "loss": 0.4487, "step": 800 }, { "epoch": 0.24013490725126477, "grad_norm": 0.22009852528572083, "learning_rate": 0.0002643693693693694, "loss": 0.4373, "step": 801 }, { "epoch": 0.24043470114296422, "grad_norm": 0.24840936064720154, "learning_rate": 0.0002643243243243243, "loss": 0.4803, "step": 802 }, { "epoch": 0.24073449503466368, "grad_norm": 0.2305980920791626, "learning_rate": 0.00026427927927927925, "loss": 0.4727, "step": 803 }, { "epoch": 0.24103428892636314, "grad_norm": 0.23277850449085236, "learning_rate": 0.00026423423423423424, "loss": 0.4775, "step": 804 }, { "epoch": 0.2413340828180626, "grad_norm": 0.24016259610652924, "learning_rate": 0.0002641891891891892, "loss": 0.5002, "step": 805 }, { "epoch": 0.24163387670976205, "grad_norm": 0.239017054438591, "learning_rate": 0.0002641441441441441, "loss": 0.4586, "step": 806 }, { "epoch": 0.2419336706014615, "grad_norm": 0.23575717210769653, "learning_rate": 0.0002640990990990991, "loss": 0.481, "step": 807 }, { "epoch": 0.24223346449316094, "grad_norm": 0.23028531670570374, "learning_rate": 0.00026405405405405404, "loss": 0.4546, "step": 808 }, { "epoch": 0.2425332583848604, "grad_norm": 0.23798401653766632, "learning_rate": 0.000264009009009009, "loss": 0.4806, "step": 809 }, { "epoch": 0.24283305227655985, "grad_norm": 0.23191827535629272, "learning_rate": 0.00026396396396396397, "loss": 0.4509, "step": 810 }, { "epoch": 0.2431328461682593, "grad_norm": 0.2182149440050125, "learning_rate": 0.0002639189189189189, "loss": 0.4448, "step": 811 }, { "epoch": 0.24343264005995877, "grad_norm": 0.2463945746421814, "learning_rate": 0.00026387387387387384, "loss": 0.5088, "step": 812 }, { "epoch": 0.24373243395165822, "grad_norm": 0.2388424575328827, "learning_rate": 0.00026382882882882883, "loss": 0.4931, "step": 813 }, { "epoch": 0.24403222784335768, "grad_norm": 0.2515762746334076, "learning_rate": 0.00026378378378378377, "loss": 0.4742, "step": 814 }, { "epoch": 0.24433202173505714, "grad_norm": 0.23625001311302185, "learning_rate": 0.0002637387387387387, "loss": 0.4937, "step": 815 }, { "epoch": 0.2446318156267566, "grad_norm": 0.2393738478422165, "learning_rate": 0.0002636936936936937, "loss": 0.4588, "step": 816 }, { "epoch": 0.24493160951845605, "grad_norm": 0.23316219449043274, "learning_rate": 0.00026364864864864864, "loss": 0.4577, "step": 817 }, { "epoch": 0.2452314034101555, "grad_norm": 0.2306746244430542, "learning_rate": 0.00026360360360360357, "loss": 0.4454, "step": 818 }, { "epoch": 0.24553119730185496, "grad_norm": 0.26293689012527466, "learning_rate": 0.0002635585585585585, "loss": 0.4723, "step": 819 }, { "epoch": 0.24583099119355442, "grad_norm": 0.23483715951442719, "learning_rate": 0.0002635135135135135, "loss": 0.4704, "step": 820 }, { "epoch": 0.24613078508525388, "grad_norm": 0.2556680738925934, "learning_rate": 0.00026346846846846844, "loss": 0.5077, "step": 821 }, { "epoch": 0.24643057897695334, "grad_norm": 0.25275811553001404, "learning_rate": 0.0002634234234234234, "loss": 0.483, "step": 822 }, { "epoch": 0.2467303728686528, "grad_norm": 0.22292107343673706, "learning_rate": 0.00026337837837837836, "loss": 0.4684, "step": 823 }, { "epoch": 0.24703016676035225, "grad_norm": 0.23125959932804108, "learning_rate": 0.0002633333333333333, "loss": 0.4618, "step": 824 }, { "epoch": 0.2473299606520517, "grad_norm": 0.2515474259853363, "learning_rate": 0.00026328828828828824, "loss": 0.5111, "step": 825 }, { "epoch": 0.24762975454375116, "grad_norm": 0.23193036019802094, "learning_rate": 0.00026324324324324323, "loss": 0.451, "step": 826 }, { "epoch": 0.24792954843545062, "grad_norm": 0.22238105535507202, "learning_rate": 0.00026319819819819817, "loss": 0.4472, "step": 827 }, { "epoch": 0.24822934232715008, "grad_norm": 0.23125764727592468, "learning_rate": 0.0002631531531531531, "loss": 0.4711, "step": 828 }, { "epoch": 0.24852913621884953, "grad_norm": 0.23620037734508514, "learning_rate": 0.0002631081081081081, "loss": 0.4723, "step": 829 }, { "epoch": 0.248828930110549, "grad_norm": 0.22470439970493317, "learning_rate": 0.00026306306306306303, "loss": 0.4814, "step": 830 }, { "epoch": 0.24912872400224845, "grad_norm": 0.23267348110675812, "learning_rate": 0.00026301801801801797, "loss": 0.4669, "step": 831 }, { "epoch": 0.2494285178939479, "grad_norm": 0.23558740317821503, "learning_rate": 0.00026297297297297296, "loss": 0.4407, "step": 832 }, { "epoch": 0.24972831178564736, "grad_norm": 0.2202112227678299, "learning_rate": 0.0002629279279279279, "loss": 0.4546, "step": 833 }, { "epoch": 0.25002810567734685, "grad_norm": 0.2349451333284378, "learning_rate": 0.00026288288288288283, "loss": 0.4741, "step": 834 }, { "epoch": 0.2503278995690463, "grad_norm": 0.2210862636566162, "learning_rate": 0.0002628378378378378, "loss": 0.4624, "step": 835 }, { "epoch": 0.25062769346074576, "grad_norm": 0.25249290466308594, "learning_rate": 0.00026279279279279276, "loss": 0.5213, "step": 836 }, { "epoch": 0.2509274873524452, "grad_norm": 0.2458237111568451, "learning_rate": 0.0002627477477477477, "loss": 0.4937, "step": 837 }, { "epoch": 0.2512272812441447, "grad_norm": 0.22827856242656708, "learning_rate": 0.0002627027027027027, "loss": 0.4286, "step": 838 }, { "epoch": 0.2515270751358441, "grad_norm": 0.22871458530426025, "learning_rate": 0.0002626576576576576, "loss": 0.4181, "step": 839 }, { "epoch": 0.2518268690275436, "grad_norm": 0.24196332693099976, "learning_rate": 0.00026261261261261256, "loss": 0.4704, "step": 840 }, { "epoch": 0.252126662919243, "grad_norm": 0.24222321808338165, "learning_rate": 0.00026256756756756755, "loss": 0.474, "step": 841 }, { "epoch": 0.2524264568109425, "grad_norm": 0.2258533090353012, "learning_rate": 0.0002625225225225225, "loss": 0.4644, "step": 842 }, { "epoch": 0.25272625070264193, "grad_norm": 0.2234419882297516, "learning_rate": 0.0002624774774774774, "loss": 0.4506, "step": 843 }, { "epoch": 0.2530260445943414, "grad_norm": 0.24231363832950592, "learning_rate": 0.0002624324324324324, "loss": 0.4975, "step": 844 }, { "epoch": 0.25332583848604084, "grad_norm": 0.2430192083120346, "learning_rate": 0.00026238738738738735, "loss": 0.462, "step": 845 }, { "epoch": 0.25362563237774033, "grad_norm": 0.23717942833900452, "learning_rate": 0.0002623423423423423, "loss": 0.48, "step": 846 }, { "epoch": 0.25392542626943976, "grad_norm": 0.23983192443847656, "learning_rate": 0.0002622972972972973, "loss": 0.491, "step": 847 }, { "epoch": 0.25422522016113924, "grad_norm": 0.24544605612754822, "learning_rate": 0.0002622522522522522, "loss": 0.4525, "step": 848 }, { "epoch": 0.25452501405283867, "grad_norm": 0.25106650590896606, "learning_rate": 0.0002622072072072072, "loss": 0.4716, "step": 849 }, { "epoch": 0.25482480794453816, "grad_norm": 0.2644721269607544, "learning_rate": 0.00026216216216216215, "loss": 0.4888, "step": 850 }, { "epoch": 0.2551246018362376, "grad_norm": 0.2338344007730484, "learning_rate": 0.0002621171171171171, "loss": 0.4779, "step": 851 }, { "epoch": 0.25542439572793707, "grad_norm": 0.2368081659078598, "learning_rate": 0.00026207207207207207, "loss": 0.458, "step": 852 }, { "epoch": 0.2557241896196365, "grad_norm": 0.2628321051597595, "learning_rate": 0.000262027027027027, "loss": 0.4786, "step": 853 }, { "epoch": 0.256023983511336, "grad_norm": 0.23109877109527588, "learning_rate": 0.00026198198198198195, "loss": 0.4396, "step": 854 }, { "epoch": 0.2563237774030354, "grad_norm": 0.23273521661758423, "learning_rate": 0.00026193693693693694, "loss": 0.4579, "step": 855 }, { "epoch": 0.25662357129473484, "grad_norm": 0.229389026761055, "learning_rate": 0.0002618918918918919, "loss": 0.4515, "step": 856 }, { "epoch": 0.2569233651864343, "grad_norm": 0.24866041541099548, "learning_rate": 0.0002618468468468468, "loss": 0.4402, "step": 857 }, { "epoch": 0.25722315907813376, "grad_norm": 0.24374257028102875, "learning_rate": 0.0002618018018018018, "loss": 0.5025, "step": 858 }, { "epoch": 0.25752295296983324, "grad_norm": 0.2753133177757263, "learning_rate": 0.00026175675675675674, "loss": 0.5058, "step": 859 }, { "epoch": 0.25782274686153267, "grad_norm": 0.236386239528656, "learning_rate": 0.0002617117117117117, "loss": 0.4741, "step": 860 }, { "epoch": 0.25812254075323215, "grad_norm": 0.21907605230808258, "learning_rate": 0.00026166666666666667, "loss": 0.463, "step": 861 }, { "epoch": 0.2584223346449316, "grad_norm": 0.25744542479515076, "learning_rate": 0.0002616216216216216, "loss": 0.4585, "step": 862 }, { "epoch": 0.25872212853663107, "grad_norm": 0.25060373544692993, "learning_rate": 0.0002615765765765766, "loss": 0.476, "step": 863 }, { "epoch": 0.2590219224283305, "grad_norm": 0.21545180678367615, "learning_rate": 0.00026153153153153153, "loss": 0.4379, "step": 864 }, { "epoch": 0.25932171632003, "grad_norm": 0.2536545991897583, "learning_rate": 0.00026148648648648647, "loss": 0.4965, "step": 865 }, { "epoch": 0.2596215102117294, "grad_norm": 0.22960424423217773, "learning_rate": 0.00026144144144144146, "loss": 0.4441, "step": 866 }, { "epoch": 0.2599213041034289, "grad_norm": 0.22601282596588135, "learning_rate": 0.0002613963963963964, "loss": 0.4462, "step": 867 }, { "epoch": 0.2602210979951283, "grad_norm": 0.23997683823108673, "learning_rate": 0.00026135135135135133, "loss": 0.4496, "step": 868 }, { "epoch": 0.2605208918868278, "grad_norm": 0.24241064488887787, "learning_rate": 0.00026130630630630627, "loss": 0.4538, "step": 869 }, { "epoch": 0.26082068577852724, "grad_norm": 0.2302720993757248, "learning_rate": 0.00026126126126126126, "loss": 0.4547, "step": 870 }, { "epoch": 0.2611204796702267, "grad_norm": 0.2532520294189453, "learning_rate": 0.0002612162162162162, "loss": 0.4919, "step": 871 }, { "epoch": 0.26142027356192615, "grad_norm": 0.2543450891971588, "learning_rate": 0.00026117117117117113, "loss": 0.4638, "step": 872 }, { "epoch": 0.26172006745362564, "grad_norm": 0.24088874459266663, "learning_rate": 0.0002611261261261261, "loss": 0.4603, "step": 873 }, { "epoch": 0.26201986134532507, "grad_norm": 0.22305645048618317, "learning_rate": 0.00026108108108108106, "loss": 0.4394, "step": 874 }, { "epoch": 0.26231965523702455, "grad_norm": 0.261001318693161, "learning_rate": 0.000261036036036036, "loss": 0.5244, "step": 875 }, { "epoch": 0.262619449128724, "grad_norm": 0.2550908029079437, "learning_rate": 0.000260990990990991, "loss": 0.4716, "step": 876 }, { "epoch": 0.26291924302042347, "grad_norm": 0.2264460027217865, "learning_rate": 0.0002609459459459459, "loss": 0.4527, "step": 877 }, { "epoch": 0.2632190369121229, "grad_norm": 0.2598486542701721, "learning_rate": 0.00026090090090090086, "loss": 0.51, "step": 878 }, { "epoch": 0.2635188308038224, "grad_norm": 0.2528247833251953, "learning_rate": 0.00026085585585585585, "loss": 0.4619, "step": 879 }, { "epoch": 0.2638186246955218, "grad_norm": 0.22703434526920319, "learning_rate": 0.0002608108108108108, "loss": 0.4519, "step": 880 }, { "epoch": 0.2641184185872213, "grad_norm": 0.24291987717151642, "learning_rate": 0.0002607657657657657, "loss": 0.4588, "step": 881 }, { "epoch": 0.2644182124789207, "grad_norm": 0.265899121761322, "learning_rate": 0.0002607207207207207, "loss": 0.4741, "step": 882 }, { "epoch": 0.2647180063706202, "grad_norm": 0.24852798879146576, "learning_rate": 0.00026067567567567565, "loss": 0.4772, "step": 883 }, { "epoch": 0.26501780026231964, "grad_norm": 0.24373799562454224, "learning_rate": 0.0002606306306306306, "loss": 0.475, "step": 884 }, { "epoch": 0.2653175941540191, "grad_norm": 0.24994871020317078, "learning_rate": 0.0002605855855855856, "loss": 0.5094, "step": 885 }, { "epoch": 0.26561738804571855, "grad_norm": 0.23686103522777557, "learning_rate": 0.0002605405405405405, "loss": 0.4583, "step": 886 }, { "epoch": 0.26591718193741803, "grad_norm": 0.2280004322528839, "learning_rate": 0.00026049549549549546, "loss": 0.4474, "step": 887 }, { "epoch": 0.26621697582911746, "grad_norm": 0.25110939145088196, "learning_rate": 0.00026045045045045045, "loss": 0.4722, "step": 888 }, { "epoch": 0.26651676972081695, "grad_norm": 0.25370022654533386, "learning_rate": 0.0002604054054054054, "loss": 0.4651, "step": 889 }, { "epoch": 0.2668165636125164, "grad_norm": 0.24179215729236603, "learning_rate": 0.0002603603603603603, "loss": 0.4727, "step": 890 }, { "epoch": 0.26711635750421586, "grad_norm": 0.2525777518749237, "learning_rate": 0.00026031531531531526, "loss": 0.4681, "step": 891 }, { "epoch": 0.2674161513959153, "grad_norm": 0.21325957775115967, "learning_rate": 0.00026027027027027025, "loss": 0.4374, "step": 892 }, { "epoch": 0.2677159452876148, "grad_norm": 0.2358642816543579, "learning_rate": 0.0002602252252252252, "loss": 0.4492, "step": 893 }, { "epoch": 0.2680157391793142, "grad_norm": 0.2625977694988251, "learning_rate": 0.0002601801801801801, "loss": 0.522, "step": 894 }, { "epoch": 0.2683155330710137, "grad_norm": 0.22606413066387177, "learning_rate": 0.0002601351351351351, "loss": 0.4539, "step": 895 }, { "epoch": 0.2686153269627131, "grad_norm": 0.24337491393089294, "learning_rate": 0.00026009009009009005, "loss": 0.4988, "step": 896 }, { "epoch": 0.2689151208544126, "grad_norm": 0.23522725701332092, "learning_rate": 0.000260045045045045, "loss": 0.4665, "step": 897 }, { "epoch": 0.26921491474611203, "grad_norm": 0.25222131609916687, "learning_rate": 0.00026, "loss": 0.4715, "step": 898 }, { "epoch": 0.2695147086378115, "grad_norm": 0.22760646045207977, "learning_rate": 0.0002599549549549549, "loss": 0.4633, "step": 899 }, { "epoch": 0.26981450252951095, "grad_norm": 0.2398597002029419, "learning_rate": 0.00025990990990990985, "loss": 0.4682, "step": 900 }, { "epoch": 0.27011429642121043, "grad_norm": 0.24494816362857819, "learning_rate": 0.00025986486486486484, "loss": 0.4833, "step": 901 }, { "epoch": 0.27041409031290986, "grad_norm": 0.23173199594020844, "learning_rate": 0.0002598198198198198, "loss": 0.4583, "step": 902 }, { "epoch": 0.27071388420460935, "grad_norm": 0.242969810962677, "learning_rate": 0.0002597747747747747, "loss": 0.4748, "step": 903 }, { "epoch": 0.2710136780963088, "grad_norm": 0.2286025583744049, "learning_rate": 0.0002597297297297297, "loss": 0.4802, "step": 904 }, { "epoch": 0.27131347198800826, "grad_norm": 0.241167351603508, "learning_rate": 0.00025968468468468464, "loss": 0.4947, "step": 905 }, { "epoch": 0.2716132658797077, "grad_norm": 0.2599638104438782, "learning_rate": 0.00025963963963963963, "loss": 0.5028, "step": 906 }, { "epoch": 0.2719130597714072, "grad_norm": 0.22766104340553284, "learning_rate": 0.00025959459459459457, "loss": 0.4586, "step": 907 }, { "epoch": 0.2722128536631066, "grad_norm": 0.24524454772472382, "learning_rate": 0.0002595495495495495, "loss": 0.4918, "step": 908 }, { "epoch": 0.2725126475548061, "grad_norm": 0.24995583295822144, "learning_rate": 0.0002595045045045045, "loss": 0.4851, "step": 909 }, { "epoch": 0.2728124414465055, "grad_norm": 0.24542704224586487, "learning_rate": 0.00025945945945945944, "loss": 0.4891, "step": 910 }, { "epoch": 0.273112235338205, "grad_norm": 0.2262720763683319, "learning_rate": 0.00025941441441441437, "loss": 0.457, "step": 911 }, { "epoch": 0.27341202922990443, "grad_norm": 0.2677282989025116, "learning_rate": 0.00025936936936936936, "loss": 0.4781, "step": 912 }, { "epoch": 0.2737118231216039, "grad_norm": 0.22617483139038086, "learning_rate": 0.0002593243243243243, "loss": 0.4666, "step": 913 }, { "epoch": 0.27401161701330334, "grad_norm": 0.24579745531082153, "learning_rate": 0.00025927927927927924, "loss": 0.4991, "step": 914 }, { "epoch": 0.27431141090500283, "grad_norm": 0.22964158654212952, "learning_rate": 0.00025923423423423423, "loss": 0.4757, "step": 915 }, { "epoch": 0.27461120479670226, "grad_norm": 0.24435275793075562, "learning_rate": 0.00025918918918918916, "loss": 0.4636, "step": 916 }, { "epoch": 0.27491099868840174, "grad_norm": 0.23039105534553528, "learning_rate": 0.0002591441441441441, "loss": 0.4591, "step": 917 }, { "epoch": 0.27521079258010117, "grad_norm": 0.24856770038604736, "learning_rate": 0.0002590990990990991, "loss": 0.4866, "step": 918 }, { "epoch": 0.27551058647180066, "grad_norm": 0.22115269303321838, "learning_rate": 0.00025905405405405403, "loss": 0.4322, "step": 919 }, { "epoch": 0.2758103803635001, "grad_norm": 0.2645402252674103, "learning_rate": 0.000259009009009009, "loss": 0.4891, "step": 920 }, { "epoch": 0.27611017425519957, "grad_norm": 0.24427354335784912, "learning_rate": 0.00025896396396396396, "loss": 0.4636, "step": 921 }, { "epoch": 0.276409968146899, "grad_norm": 0.23059400916099548, "learning_rate": 0.0002589189189189189, "loss": 0.4502, "step": 922 }, { "epoch": 0.2767097620385985, "grad_norm": 0.21996812522411346, "learning_rate": 0.0002588738738738739, "loss": 0.4461, "step": 923 }, { "epoch": 0.2770095559302979, "grad_norm": 0.24204552173614502, "learning_rate": 0.0002588288288288288, "loss": 0.4678, "step": 924 }, { "epoch": 0.2773093498219974, "grad_norm": 0.26428595185279846, "learning_rate": 0.00025878378378378376, "loss": 0.476, "step": 925 }, { "epoch": 0.2776091437136968, "grad_norm": 0.2542773187160492, "learning_rate": 0.00025873873873873875, "loss": 0.464, "step": 926 }, { "epoch": 0.2779089376053963, "grad_norm": 0.2621975839138031, "learning_rate": 0.0002586936936936937, "loss": 0.4735, "step": 927 }, { "epoch": 0.27820873149709574, "grad_norm": 0.24359507858753204, "learning_rate": 0.0002586486486486486, "loss": 0.4768, "step": 928 }, { "epoch": 0.2785085253887952, "grad_norm": 0.24825096130371094, "learning_rate": 0.0002586036036036036, "loss": 0.471, "step": 929 }, { "epoch": 0.27880831928049465, "grad_norm": 0.2950778007507324, "learning_rate": 0.00025855855855855855, "loss": 0.4625, "step": 930 }, { "epoch": 0.27910811317219414, "grad_norm": 0.23210273683071136, "learning_rate": 0.0002585135135135135, "loss": 0.4325, "step": 931 }, { "epoch": 0.27940790706389357, "grad_norm": 0.26627883315086365, "learning_rate": 0.0002584684684684685, "loss": 0.4903, "step": 932 }, { "epoch": 0.27970770095559305, "grad_norm": 0.2619936466217041, "learning_rate": 0.0002584234234234234, "loss": 0.4777, "step": 933 }, { "epoch": 0.2800074948472925, "grad_norm": 0.23771123588085175, "learning_rate": 0.00025837837837837835, "loss": 0.4319, "step": 934 }, { "epoch": 0.28030728873899197, "grad_norm": 0.2495034784078598, "learning_rate": 0.00025833333333333334, "loss": 0.4503, "step": 935 }, { "epoch": 0.2806070826306914, "grad_norm": 0.26627448201179504, "learning_rate": 0.0002582882882882883, "loss": 0.4501, "step": 936 }, { "epoch": 0.2809068765223909, "grad_norm": 0.22482304275035858, "learning_rate": 0.0002582432432432432, "loss": 0.4615, "step": 937 }, { "epoch": 0.2812066704140903, "grad_norm": 0.23891392350196838, "learning_rate": 0.0002581981981981982, "loss": 0.4732, "step": 938 }, { "epoch": 0.2815064643057898, "grad_norm": 0.2233395278453827, "learning_rate": 0.00025815315315315314, "loss": 0.4521, "step": 939 }, { "epoch": 0.2818062581974892, "grad_norm": 0.22510269284248352, "learning_rate": 0.0002581081081081081, "loss": 0.4307, "step": 940 }, { "epoch": 0.2821060520891887, "grad_norm": 0.24909009039402008, "learning_rate": 0.000258063063063063, "loss": 0.4574, "step": 941 }, { "epoch": 0.28240584598088814, "grad_norm": 0.2461954653263092, "learning_rate": 0.000258018018018018, "loss": 0.4782, "step": 942 }, { "epoch": 0.2827056398725876, "grad_norm": 0.23391996324062347, "learning_rate": 0.00025797297297297294, "loss": 0.4692, "step": 943 }, { "epoch": 0.28300543376428705, "grad_norm": 0.2419288158416748, "learning_rate": 0.0002579279279279279, "loss": 0.4742, "step": 944 }, { "epoch": 0.28330522765598654, "grad_norm": 0.24654226005077362, "learning_rate": 0.00025788288288288287, "loss": 0.4793, "step": 945 }, { "epoch": 0.28360502154768596, "grad_norm": 0.23362454771995544, "learning_rate": 0.0002578378378378378, "loss": 0.4398, "step": 946 }, { "epoch": 0.28390481543938545, "grad_norm": 0.23269514739513397, "learning_rate": 0.00025779279279279275, "loss": 0.4642, "step": 947 }, { "epoch": 0.2842046093310849, "grad_norm": 0.22531503438949585, "learning_rate": 0.00025774774774774774, "loss": 0.436, "step": 948 }, { "epoch": 0.28450440322278436, "grad_norm": 0.24250438809394836, "learning_rate": 0.0002577027027027027, "loss": 0.4708, "step": 949 }, { "epoch": 0.2848041971144838, "grad_norm": 0.2370329648256302, "learning_rate": 0.0002576576576576576, "loss": 0.4737, "step": 950 }, { "epoch": 0.2851039910061832, "grad_norm": 0.2553395628929138, "learning_rate": 0.0002576126126126126, "loss": 0.4927, "step": 951 }, { "epoch": 0.2854037848978827, "grad_norm": 0.24398140609264374, "learning_rate": 0.00025756756756756754, "loss": 0.4471, "step": 952 }, { "epoch": 0.28570357878958214, "grad_norm": 0.2420070916414261, "learning_rate": 0.0002575225225225225, "loss": 0.4573, "step": 953 }, { "epoch": 0.2860033726812816, "grad_norm": 0.22280406951904297, "learning_rate": 0.00025747747747747747, "loss": 0.4637, "step": 954 }, { "epoch": 0.28630316657298105, "grad_norm": 0.268107146024704, "learning_rate": 0.0002574324324324324, "loss": 0.4751, "step": 955 }, { "epoch": 0.28660296046468053, "grad_norm": 0.224797785282135, "learning_rate": 0.00025738738738738734, "loss": 0.4567, "step": 956 }, { "epoch": 0.28690275435637996, "grad_norm": 0.2350010722875595, "learning_rate": 0.00025734234234234233, "loss": 0.4669, "step": 957 }, { "epoch": 0.28720254824807945, "grad_norm": 0.23346953094005585, "learning_rate": 0.00025729729729729727, "loss": 0.4711, "step": 958 }, { "epoch": 0.2875023421397789, "grad_norm": 0.26031309366226196, "learning_rate": 0.0002572522522522522, "loss": 0.4921, "step": 959 }, { "epoch": 0.28780213603147836, "grad_norm": 0.21255329251289368, "learning_rate": 0.0002572072072072072, "loss": 0.444, "step": 960 }, { "epoch": 0.2881019299231778, "grad_norm": 0.24799884855747223, "learning_rate": 0.00025716216216216213, "loss": 0.469, "step": 961 }, { "epoch": 0.2884017238148773, "grad_norm": 0.2208838164806366, "learning_rate": 0.00025711711711711707, "loss": 0.442, "step": 962 }, { "epoch": 0.2887015177065767, "grad_norm": 0.2880913317203522, "learning_rate": 0.00025707207207207206, "loss": 0.4424, "step": 963 }, { "epoch": 0.2890013115982762, "grad_norm": 0.26574239134788513, "learning_rate": 0.000257027027027027, "loss": 0.452, "step": 964 }, { "epoch": 0.2893011054899756, "grad_norm": 0.23267340660095215, "learning_rate": 0.00025698198198198193, "loss": 0.455, "step": 965 }, { "epoch": 0.2896008993816751, "grad_norm": 0.26304900646209717, "learning_rate": 0.0002569369369369369, "loss": 0.5211, "step": 966 }, { "epoch": 0.28990069327337453, "grad_norm": 0.2575905919075012, "learning_rate": 0.00025689189189189186, "loss": 0.483, "step": 967 }, { "epoch": 0.290200487165074, "grad_norm": 0.22459660470485687, "learning_rate": 0.0002568468468468468, "loss": 0.4636, "step": 968 }, { "epoch": 0.29050028105677345, "grad_norm": 0.220341295003891, "learning_rate": 0.0002568018018018018, "loss": 0.4615, "step": 969 }, { "epoch": 0.29080007494847293, "grad_norm": 0.239531472325325, "learning_rate": 0.0002567567567567567, "loss": 0.4446, "step": 970 }, { "epoch": 0.29109986884017236, "grad_norm": 0.23338812589645386, "learning_rate": 0.00025671171171171166, "loss": 0.4704, "step": 971 }, { "epoch": 0.29139966273187184, "grad_norm": 0.24035978317260742, "learning_rate": 0.00025666666666666665, "loss": 0.4717, "step": 972 }, { "epoch": 0.2916994566235713, "grad_norm": 0.23094506561756134, "learning_rate": 0.0002566216216216216, "loss": 0.4493, "step": 973 }, { "epoch": 0.29199925051527076, "grad_norm": 0.25101473927497864, "learning_rate": 0.0002565765765765765, "loss": 0.4979, "step": 974 }, { "epoch": 0.2922990444069702, "grad_norm": 0.24172839522361755, "learning_rate": 0.0002565315315315315, "loss": 0.4666, "step": 975 }, { "epoch": 0.2925988382986697, "grad_norm": 0.2213078737258911, "learning_rate": 0.00025648648648648645, "loss": 0.4335, "step": 976 }, { "epoch": 0.2928986321903691, "grad_norm": 0.2230203002691269, "learning_rate": 0.00025644144144144145, "loss": 0.4183, "step": 977 }, { "epoch": 0.2931984260820686, "grad_norm": 0.24735258519649506, "learning_rate": 0.0002563963963963964, "loss": 0.4612, "step": 978 }, { "epoch": 0.293498219973768, "grad_norm": 0.24861575663089752, "learning_rate": 0.0002563513513513513, "loss": 0.4713, "step": 979 }, { "epoch": 0.2937980138654675, "grad_norm": 0.2333897352218628, "learning_rate": 0.0002563063063063063, "loss": 0.4502, "step": 980 }, { "epoch": 0.29409780775716693, "grad_norm": 0.23923064768314362, "learning_rate": 0.00025626126126126125, "loss": 0.4574, "step": 981 }, { "epoch": 0.2943976016488664, "grad_norm": 0.24568355083465576, "learning_rate": 0.0002562162162162162, "loss": 0.4358, "step": 982 }, { "epoch": 0.29469739554056584, "grad_norm": 0.24993112683296204, "learning_rate": 0.0002561711711711712, "loss": 0.4663, "step": 983 }, { "epoch": 0.2949971894322653, "grad_norm": 0.2531440854072571, "learning_rate": 0.0002561261261261261, "loss": 0.4964, "step": 984 }, { "epoch": 0.29529698332396476, "grad_norm": 0.26995500922203064, "learning_rate": 0.00025608108108108105, "loss": 0.4971, "step": 985 }, { "epoch": 0.29559677721566424, "grad_norm": 0.23319192230701447, "learning_rate": 0.00025603603603603604, "loss": 0.4605, "step": 986 }, { "epoch": 0.29589657110736367, "grad_norm": 0.2496713548898697, "learning_rate": 0.000255990990990991, "loss": 0.4468, "step": 987 }, { "epoch": 0.29619636499906316, "grad_norm": 0.23224860429763794, "learning_rate": 0.0002559459459459459, "loss": 0.4519, "step": 988 }, { "epoch": 0.2964961588907626, "grad_norm": 0.24383842945098877, "learning_rate": 0.0002559009009009009, "loss": 0.467, "step": 989 }, { "epoch": 0.29679595278246207, "grad_norm": 0.2240372598171234, "learning_rate": 0.00025585585585585584, "loss": 0.4541, "step": 990 }, { "epoch": 0.2970957466741615, "grad_norm": 0.23554278910160065, "learning_rate": 0.0002558108108108108, "loss": 0.4627, "step": 991 }, { "epoch": 0.297395540565861, "grad_norm": 0.24655288457870483, "learning_rate": 0.00025576576576576577, "loss": 0.4697, "step": 992 }, { "epoch": 0.2976953344575604, "grad_norm": 0.2397495061159134, "learning_rate": 0.0002557207207207207, "loss": 0.4821, "step": 993 }, { "epoch": 0.2979951283492599, "grad_norm": 0.24440963566303253, "learning_rate": 0.00025567567567567564, "loss": 0.4293, "step": 994 }, { "epoch": 0.2982949222409593, "grad_norm": 0.230440154671669, "learning_rate": 0.00025563063063063063, "loss": 0.4592, "step": 995 }, { "epoch": 0.2985947161326588, "grad_norm": 0.22211557626724243, "learning_rate": 0.00025558558558558557, "loss": 0.4324, "step": 996 }, { "epoch": 0.29889451002435824, "grad_norm": 0.22826789319515228, "learning_rate": 0.0002555405405405405, "loss": 0.447, "step": 997 }, { "epoch": 0.2991943039160577, "grad_norm": 0.24060975015163422, "learning_rate": 0.0002554954954954955, "loss": 0.4954, "step": 998 }, { "epoch": 0.29949409780775715, "grad_norm": 0.2227400243282318, "learning_rate": 0.00025545045045045043, "loss": 0.4503, "step": 999 }, { "epoch": 0.29979389169945664, "grad_norm": 0.23061898350715637, "learning_rate": 0.00025540540540540537, "loss": 0.4356, "step": 1000 }, { "epoch": 0.29979389169945664, "eval_loss": 0.46561944484710693, "eval_runtime": 566.7946, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.476, "step": 1000 }, { "epoch": 0.30009368559115607, "grad_norm": 0.25651443004608154, "learning_rate": 0.00025536036036036036, "loss": 0.4813, "step": 1001 }, { "epoch": 0.30039347948285555, "grad_norm": 0.23068277537822723, "learning_rate": 0.0002553153153153153, "loss": 0.4563, "step": 1002 }, { "epoch": 0.300693273374555, "grad_norm": 0.2346974015235901, "learning_rate": 0.00025527027027027024, "loss": 0.4471, "step": 1003 }, { "epoch": 0.30099306726625447, "grad_norm": 0.22291089594364166, "learning_rate": 0.0002552252252252252, "loss": 0.4399, "step": 1004 }, { "epoch": 0.3012928611579539, "grad_norm": 0.24533626437187195, "learning_rate": 0.00025518018018018016, "loss": 0.4949, "step": 1005 }, { "epoch": 0.3015926550496534, "grad_norm": 0.2337205857038498, "learning_rate": 0.0002551351351351351, "loss": 0.4815, "step": 1006 }, { "epoch": 0.3018924489413528, "grad_norm": 0.23926278948783875, "learning_rate": 0.0002550900900900901, "loss": 0.4742, "step": 1007 }, { "epoch": 0.3021922428330523, "grad_norm": 0.23630262911319733, "learning_rate": 0.00025504504504504503, "loss": 0.4643, "step": 1008 }, { "epoch": 0.3024920367247517, "grad_norm": 0.2534981071949005, "learning_rate": 0.00025499999999999996, "loss": 0.4806, "step": 1009 }, { "epoch": 0.3027918306164512, "grad_norm": 0.22765369713306427, "learning_rate": 0.00025495495495495496, "loss": 0.4562, "step": 1010 }, { "epoch": 0.30309162450815064, "grad_norm": 0.23825423419475555, "learning_rate": 0.0002549099099099099, "loss": 0.4642, "step": 1011 }, { "epoch": 0.3033914183998501, "grad_norm": 0.2275952249765396, "learning_rate": 0.00025486486486486483, "loss": 0.4716, "step": 1012 }, { "epoch": 0.30369121229154955, "grad_norm": 0.23309756815433502, "learning_rate": 0.00025481981981981977, "loss": 0.467, "step": 1013 }, { "epoch": 0.30399100618324904, "grad_norm": 0.2582738995552063, "learning_rate": 0.00025477477477477476, "loss": 0.4756, "step": 1014 }, { "epoch": 0.30429080007494846, "grad_norm": 0.21543192863464355, "learning_rate": 0.0002547297297297297, "loss": 0.4184, "step": 1015 }, { "epoch": 0.30459059396664795, "grad_norm": 0.22537867724895477, "learning_rate": 0.00025468468468468463, "loss": 0.451, "step": 1016 }, { "epoch": 0.3048903878583474, "grad_norm": 0.22783374786376953, "learning_rate": 0.0002546396396396396, "loss": 0.4595, "step": 1017 }, { "epoch": 0.30519018175004686, "grad_norm": 0.2382606863975525, "learning_rate": 0.00025459459459459456, "loss": 0.4465, "step": 1018 }, { "epoch": 0.3054899756417463, "grad_norm": 0.23583681881427765, "learning_rate": 0.0002545495495495495, "loss": 0.453, "step": 1019 }, { "epoch": 0.3057897695334458, "grad_norm": 0.24536754190921783, "learning_rate": 0.0002545045045045045, "loss": 0.4829, "step": 1020 }, { "epoch": 0.3060895634251452, "grad_norm": 0.21961447596549988, "learning_rate": 0.0002544594594594594, "loss": 0.4555, "step": 1021 }, { "epoch": 0.3063893573168447, "grad_norm": 0.22678659856319427, "learning_rate": 0.00025441441441441436, "loss": 0.4423, "step": 1022 }, { "epoch": 0.3066891512085441, "grad_norm": 0.23871202766895294, "learning_rate": 0.00025436936936936935, "loss": 0.4657, "step": 1023 }, { "epoch": 0.3069889451002436, "grad_norm": 0.23637500405311584, "learning_rate": 0.0002543243243243243, "loss": 0.4665, "step": 1024 }, { "epoch": 0.30728873899194303, "grad_norm": 0.22538509964942932, "learning_rate": 0.0002542792792792792, "loss": 0.4349, "step": 1025 }, { "epoch": 0.3075885328836425, "grad_norm": 0.2449694573879242, "learning_rate": 0.0002542342342342342, "loss": 0.465, "step": 1026 }, { "epoch": 0.30788832677534195, "grad_norm": 0.22211913764476776, "learning_rate": 0.00025418918918918915, "loss": 0.4276, "step": 1027 }, { "epoch": 0.30818812066704143, "grad_norm": 0.25039684772491455, "learning_rate": 0.0002541441441441441, "loss": 0.4799, "step": 1028 }, { "epoch": 0.30848791455874086, "grad_norm": 0.22903893887996674, "learning_rate": 0.0002540990990990991, "loss": 0.4679, "step": 1029 }, { "epoch": 0.30878770845044035, "grad_norm": 0.2479073703289032, "learning_rate": 0.000254054054054054, "loss": 0.4722, "step": 1030 }, { "epoch": 0.3090875023421398, "grad_norm": 0.22166845202445984, "learning_rate": 0.00025400900900900895, "loss": 0.4354, "step": 1031 }, { "epoch": 0.30938729623383926, "grad_norm": 0.22735866904258728, "learning_rate": 0.00025396396396396394, "loss": 0.4344, "step": 1032 }, { "epoch": 0.3096870901255387, "grad_norm": 0.2471131831407547, "learning_rate": 0.0002539189189189189, "loss": 0.4751, "step": 1033 }, { "epoch": 0.3099868840172382, "grad_norm": 0.22858619689941406, "learning_rate": 0.00025387387387387387, "loss": 0.4589, "step": 1034 }, { "epoch": 0.3102866779089376, "grad_norm": 0.2418094426393509, "learning_rate": 0.0002538288288288288, "loss": 0.4873, "step": 1035 }, { "epoch": 0.3105864718006371, "grad_norm": 0.2313918024301529, "learning_rate": 0.00025378378378378374, "loss": 0.4819, "step": 1036 }, { "epoch": 0.3108862656923365, "grad_norm": 0.22201773524284363, "learning_rate": 0.00025373873873873874, "loss": 0.4553, "step": 1037 }, { "epoch": 0.311186059584036, "grad_norm": 0.2284671664237976, "learning_rate": 0.00025369369369369367, "loss": 0.4485, "step": 1038 }, { "epoch": 0.31148585347573543, "grad_norm": 0.2529887855052948, "learning_rate": 0.0002536486486486486, "loss": 0.4674, "step": 1039 }, { "epoch": 0.3117856473674349, "grad_norm": 0.23350189626216888, "learning_rate": 0.0002536036036036036, "loss": 0.4835, "step": 1040 }, { "epoch": 0.31208544125913434, "grad_norm": 0.23258428275585175, "learning_rate": 0.00025355855855855854, "loss": 0.4458, "step": 1041 }, { "epoch": 0.31238523515083383, "grad_norm": 0.23113112151622772, "learning_rate": 0.0002535135135135135, "loss": 0.4581, "step": 1042 }, { "epoch": 0.31268502904253326, "grad_norm": 0.22711153328418732, "learning_rate": 0.00025346846846846846, "loss": 0.4485, "step": 1043 }, { "epoch": 0.31298482293423274, "grad_norm": 0.23305024206638336, "learning_rate": 0.0002534234234234234, "loss": 0.4316, "step": 1044 }, { "epoch": 0.31328461682593217, "grad_norm": 0.24723917245864868, "learning_rate": 0.0002533783783783784, "loss": 0.4512, "step": 1045 }, { "epoch": 0.3135844107176316, "grad_norm": 0.21640846133232117, "learning_rate": 0.00025333333333333333, "loss": 0.4485, "step": 1046 }, { "epoch": 0.3138842046093311, "grad_norm": 0.25021156668663025, "learning_rate": 0.00025328828828828827, "loss": 0.4708, "step": 1047 }, { "epoch": 0.3141839985010305, "grad_norm": 0.24005773663520813, "learning_rate": 0.00025324324324324326, "loss": 0.4698, "step": 1048 }, { "epoch": 0.31448379239273, "grad_norm": 0.24885396659374237, "learning_rate": 0.0002531981981981982, "loss": 0.4899, "step": 1049 }, { "epoch": 0.31478358628442943, "grad_norm": 0.2413524091243744, "learning_rate": 0.00025315315315315313, "loss": 0.4776, "step": 1050 }, { "epoch": 0.3150833801761289, "grad_norm": 0.25239062309265137, "learning_rate": 0.0002531081081081081, "loss": 0.4788, "step": 1051 }, { "epoch": 0.31538317406782834, "grad_norm": 0.23389939963817596, "learning_rate": 0.00025306306306306306, "loss": 0.4299, "step": 1052 }, { "epoch": 0.3156829679595278, "grad_norm": 0.2468218207359314, "learning_rate": 0.000253018018018018, "loss": 0.4759, "step": 1053 }, { "epoch": 0.31598276185122726, "grad_norm": 0.2298142910003662, "learning_rate": 0.000252972972972973, "loss": 0.4658, "step": 1054 }, { "epoch": 0.31628255574292674, "grad_norm": 0.22888216376304626, "learning_rate": 0.0002529279279279279, "loss": 0.4419, "step": 1055 }, { "epoch": 0.31658234963462617, "grad_norm": 0.23855063319206238, "learning_rate": 0.00025288288288288286, "loss": 0.4642, "step": 1056 }, { "epoch": 0.31688214352632565, "grad_norm": 0.24454447627067566, "learning_rate": 0.00025283783783783785, "loss": 0.4495, "step": 1057 }, { "epoch": 0.3171819374180251, "grad_norm": 0.22794046998023987, "learning_rate": 0.0002527927927927928, "loss": 0.4474, "step": 1058 }, { "epoch": 0.31748173130972457, "grad_norm": 0.248634934425354, "learning_rate": 0.0002527477477477477, "loss": 0.4694, "step": 1059 }, { "epoch": 0.317781525201424, "grad_norm": 0.24363334476947784, "learning_rate": 0.00025270270270270266, "loss": 0.4549, "step": 1060 }, { "epoch": 0.3180813190931235, "grad_norm": 0.23220765590667725, "learning_rate": 0.00025265765765765765, "loss": 0.4478, "step": 1061 }, { "epoch": 0.3183811129848229, "grad_norm": 0.22161665558815002, "learning_rate": 0.0002526126126126126, "loss": 0.4549, "step": 1062 }, { "epoch": 0.3186809068765224, "grad_norm": 0.24613521993160248, "learning_rate": 0.0002525675675675675, "loss": 0.4505, "step": 1063 }, { "epoch": 0.3189807007682218, "grad_norm": 0.26228928565979004, "learning_rate": 0.0002525225225225225, "loss": 0.4878, "step": 1064 }, { "epoch": 0.3192804946599213, "grad_norm": 0.2279721200466156, "learning_rate": 0.00025247747747747745, "loss": 0.4481, "step": 1065 }, { "epoch": 0.31958028855162074, "grad_norm": 0.24583470821380615, "learning_rate": 0.0002524324324324324, "loss": 0.4643, "step": 1066 }, { "epoch": 0.3198800824433202, "grad_norm": 0.24150992929935455, "learning_rate": 0.0002523873873873874, "loss": 0.4705, "step": 1067 }, { "epoch": 0.32017987633501965, "grad_norm": 0.2419997900724411, "learning_rate": 0.0002523423423423423, "loss": 0.4648, "step": 1068 }, { "epoch": 0.32047967022671914, "grad_norm": 0.26776254177093506, "learning_rate": 0.00025229729729729725, "loss": 0.5013, "step": 1069 }, { "epoch": 0.32077946411841857, "grad_norm": 0.24678350985050201, "learning_rate": 0.00025225225225225225, "loss": 0.449, "step": 1070 }, { "epoch": 0.32107925801011805, "grad_norm": 0.24101199209690094, "learning_rate": 0.0002522072072072072, "loss": 0.469, "step": 1071 }, { "epoch": 0.3213790519018175, "grad_norm": 0.24230711162090302, "learning_rate": 0.0002521621621621621, "loss": 0.481, "step": 1072 }, { "epoch": 0.32167884579351697, "grad_norm": 0.22988542914390564, "learning_rate": 0.0002521171171171171, "loss": 0.4442, "step": 1073 }, { "epoch": 0.3219786396852164, "grad_norm": 0.23284588754177094, "learning_rate": 0.00025207207207207205, "loss": 0.4392, "step": 1074 }, { "epoch": 0.3222784335769159, "grad_norm": 0.24554894864559174, "learning_rate": 0.000252027027027027, "loss": 0.4544, "step": 1075 }, { "epoch": 0.3225782274686153, "grad_norm": 0.22776508331298828, "learning_rate": 0.000251981981981982, "loss": 0.4503, "step": 1076 }, { "epoch": 0.3228780213603148, "grad_norm": 0.2508374750614166, "learning_rate": 0.0002519369369369369, "loss": 0.4779, "step": 1077 }, { "epoch": 0.3231778152520142, "grad_norm": 0.22543244063854218, "learning_rate": 0.00025189189189189185, "loss": 0.4672, "step": 1078 }, { "epoch": 0.3234776091437137, "grad_norm": 0.2409958839416504, "learning_rate": 0.00025184684684684684, "loss": 0.4631, "step": 1079 }, { "epoch": 0.32377740303541314, "grad_norm": 0.2308938056230545, "learning_rate": 0.0002518018018018018, "loss": 0.4244, "step": 1080 }, { "epoch": 0.3240771969271126, "grad_norm": 0.2354745715856552, "learning_rate": 0.0002517567567567567, "loss": 0.4499, "step": 1081 }, { "epoch": 0.32437699081881205, "grad_norm": 0.24564653635025024, "learning_rate": 0.0002517117117117117, "loss": 0.4655, "step": 1082 }, { "epoch": 0.32467678471051153, "grad_norm": 0.2388393133878708, "learning_rate": 0.00025166666666666664, "loss": 0.4747, "step": 1083 }, { "epoch": 0.32497657860221096, "grad_norm": 0.23941588401794434, "learning_rate": 0.0002516216216216216, "loss": 0.4646, "step": 1084 }, { "epoch": 0.32527637249391045, "grad_norm": 0.24191126227378845, "learning_rate": 0.0002515765765765765, "loss": 0.4686, "step": 1085 }, { "epoch": 0.3255761663856099, "grad_norm": 0.2466372847557068, "learning_rate": 0.0002515315315315315, "loss": 0.4989, "step": 1086 }, { "epoch": 0.32587596027730936, "grad_norm": 0.2441006302833557, "learning_rate": 0.00025148648648648644, "loss": 0.4562, "step": 1087 }, { "epoch": 0.3261757541690088, "grad_norm": 0.26483842730522156, "learning_rate": 0.0002514414414414414, "loss": 0.4872, "step": 1088 }, { "epoch": 0.3264755480607083, "grad_norm": 0.2481161653995514, "learning_rate": 0.00025139639639639637, "loss": 0.4486, "step": 1089 }, { "epoch": 0.3267753419524077, "grad_norm": 0.23705454170703888, "learning_rate": 0.0002513513513513513, "loss": 0.4407, "step": 1090 }, { "epoch": 0.3270751358441072, "grad_norm": 0.25678539276123047, "learning_rate": 0.0002513063063063063, "loss": 0.4899, "step": 1091 }, { "epoch": 0.3273749297358066, "grad_norm": 0.22578591108322144, "learning_rate": 0.00025126126126126123, "loss": 0.4326, "step": 1092 }, { "epoch": 0.3276747236275061, "grad_norm": 0.23661458492279053, "learning_rate": 0.00025121621621621617, "loss": 0.4649, "step": 1093 }, { "epoch": 0.32797451751920553, "grad_norm": 0.2496035248041153, "learning_rate": 0.00025117117117117116, "loss": 0.4978, "step": 1094 }, { "epoch": 0.328274311410905, "grad_norm": 0.225214421749115, "learning_rate": 0.0002511261261261261, "loss": 0.4608, "step": 1095 }, { "epoch": 0.32857410530260445, "grad_norm": 0.24089965224266052, "learning_rate": 0.00025108108108108103, "loss": 0.4854, "step": 1096 }, { "epoch": 0.32887389919430393, "grad_norm": 0.23737536370754242, "learning_rate": 0.000251036036036036, "loss": 0.4692, "step": 1097 }, { "epoch": 0.32917369308600336, "grad_norm": 0.23569715023040771, "learning_rate": 0.00025099099099099096, "loss": 0.44, "step": 1098 }, { "epoch": 0.32947348697770285, "grad_norm": 0.22477473318576813, "learning_rate": 0.0002509459459459459, "loss": 0.4168, "step": 1099 }, { "epoch": 0.3297732808694023, "grad_norm": 0.25336310267448425, "learning_rate": 0.0002509009009009009, "loss": 0.4493, "step": 1100 }, { "epoch": 0.33007307476110176, "grad_norm": 0.2452186793088913, "learning_rate": 0.00025085585585585583, "loss": 0.484, "step": 1101 }, { "epoch": 0.3303728686528012, "grad_norm": 0.23870813846588135, "learning_rate": 0.0002508108108108108, "loss": 0.4228, "step": 1102 }, { "epoch": 0.3306726625445007, "grad_norm": 0.2262556552886963, "learning_rate": 0.00025076576576576575, "loss": 0.4525, "step": 1103 }, { "epoch": 0.3309724564362001, "grad_norm": 0.2720157504081726, "learning_rate": 0.0002507207207207207, "loss": 0.4961, "step": 1104 }, { "epoch": 0.3312722503278996, "grad_norm": 0.23624925315380096, "learning_rate": 0.0002506756756756757, "loss": 0.4524, "step": 1105 }, { "epoch": 0.331572044219599, "grad_norm": 0.2634661793708801, "learning_rate": 0.0002506306306306306, "loss": 0.4603, "step": 1106 }, { "epoch": 0.3318718381112985, "grad_norm": 0.24629558622837067, "learning_rate": 0.00025058558558558556, "loss": 0.453, "step": 1107 }, { "epoch": 0.33217163200299793, "grad_norm": 0.2503926157951355, "learning_rate": 0.00025054054054054055, "loss": 0.4756, "step": 1108 }, { "epoch": 0.3324714258946974, "grad_norm": 0.22624404728412628, "learning_rate": 0.0002504954954954955, "loss": 0.4391, "step": 1109 }, { "epoch": 0.33277121978639684, "grad_norm": 0.22146141529083252, "learning_rate": 0.0002504504504504504, "loss": 0.4496, "step": 1110 }, { "epoch": 0.33307101367809633, "grad_norm": 0.23911216855049133, "learning_rate": 0.0002504054054054054, "loss": 0.4584, "step": 1111 }, { "epoch": 0.33337080756979576, "grad_norm": 0.23537759482860565, "learning_rate": 0.00025036036036036035, "loss": 0.4735, "step": 1112 }, { "epoch": 0.33367060146149524, "grad_norm": 0.23113307356834412, "learning_rate": 0.0002503153153153153, "loss": 0.4393, "step": 1113 }, { "epoch": 0.33397039535319467, "grad_norm": 0.24185238778591156, "learning_rate": 0.0002502702702702703, "loss": 0.4661, "step": 1114 }, { "epoch": 0.33427018924489416, "grad_norm": 0.25618115067481995, "learning_rate": 0.0002502252252252252, "loss": 0.4777, "step": 1115 }, { "epoch": 0.3345699831365936, "grad_norm": 0.24486567080020905, "learning_rate": 0.00025018018018018015, "loss": 0.4589, "step": 1116 }, { "epoch": 0.33486977702829307, "grad_norm": 0.2608473300933838, "learning_rate": 0.00025013513513513514, "loss": 0.4716, "step": 1117 }, { "epoch": 0.3351695709199925, "grad_norm": 0.2427588254213333, "learning_rate": 0.0002500900900900901, "loss": 0.4856, "step": 1118 }, { "epoch": 0.335469364811692, "grad_norm": 0.2493797391653061, "learning_rate": 0.000250045045045045, "loss": 0.4925, "step": 1119 }, { "epoch": 0.3357691587033914, "grad_norm": 0.2610112428665161, "learning_rate": 0.00025, "loss": 0.4685, "step": 1120 }, { "epoch": 0.3360689525950909, "grad_norm": 0.2435634434223175, "learning_rate": 0.00024995495495495494, "loss": 0.4678, "step": 1121 }, { "epoch": 0.3363687464867903, "grad_norm": 0.24447932839393616, "learning_rate": 0.0002499099099099099, "loss": 0.4656, "step": 1122 }, { "epoch": 0.3366685403784898, "grad_norm": 0.24005521833896637, "learning_rate": 0.00024986486486486487, "loss": 0.4712, "step": 1123 }, { "epoch": 0.33696833427018924, "grad_norm": 0.2303832322359085, "learning_rate": 0.0002498198198198198, "loss": 0.4527, "step": 1124 }, { "epoch": 0.3372681281618887, "grad_norm": 0.221012681722641, "learning_rate": 0.00024977477477477474, "loss": 0.4376, "step": 1125 }, { "epoch": 0.33756792205358815, "grad_norm": 0.23421809077262878, "learning_rate": 0.00024972972972972973, "loss": 0.4449, "step": 1126 }, { "epoch": 0.33786771594528764, "grad_norm": 0.23941418528556824, "learning_rate": 0.00024968468468468467, "loss": 0.4679, "step": 1127 }, { "epoch": 0.33816750983698707, "grad_norm": 0.2479025423526764, "learning_rate": 0.0002496396396396396, "loss": 0.4872, "step": 1128 }, { "epoch": 0.33846730372868655, "grad_norm": 0.24516451358795166, "learning_rate": 0.0002495945945945946, "loss": 0.4488, "step": 1129 }, { "epoch": 0.338767097620386, "grad_norm": 0.2436760663986206, "learning_rate": 0.00024954954954954954, "loss": 0.4477, "step": 1130 }, { "epoch": 0.33906689151208547, "grad_norm": 0.23894813656806946, "learning_rate": 0.00024950450450450447, "loss": 0.4295, "step": 1131 }, { "epoch": 0.3393666854037849, "grad_norm": 0.24569731950759888, "learning_rate": 0.00024945945945945946, "loss": 0.46, "step": 1132 }, { "epoch": 0.3396664792954844, "grad_norm": 0.24807578325271606, "learning_rate": 0.0002494144144144144, "loss": 0.4512, "step": 1133 }, { "epoch": 0.3399662731871838, "grad_norm": 0.23641791939735413, "learning_rate": 0.00024936936936936934, "loss": 0.454, "step": 1134 }, { "epoch": 0.3402660670788833, "grad_norm": 0.26076388359069824, "learning_rate": 0.0002493243243243243, "loss": 0.4771, "step": 1135 }, { "epoch": 0.3405658609705827, "grad_norm": 0.24686305224895477, "learning_rate": 0.00024927927927927926, "loss": 0.4783, "step": 1136 }, { "epoch": 0.3408656548622822, "grad_norm": 0.2262791097164154, "learning_rate": 0.0002492342342342342, "loss": 0.4272, "step": 1137 }, { "epoch": 0.34116544875398164, "grad_norm": 0.23418664932250977, "learning_rate": 0.00024918918918918914, "loss": 0.4666, "step": 1138 }, { "epoch": 0.3414652426456811, "grad_norm": 0.23737958073616028, "learning_rate": 0.00024914414414414413, "loss": 0.4433, "step": 1139 }, { "epoch": 0.34176503653738055, "grad_norm": 0.2579478919506073, "learning_rate": 0.00024909909909909907, "loss": 0.4702, "step": 1140 }, { "epoch": 0.34206483042908, "grad_norm": 0.2627730667591095, "learning_rate": 0.000249054054054054, "loss": 0.4607, "step": 1141 }, { "epoch": 0.34236462432077946, "grad_norm": 0.2283281534910202, "learning_rate": 0.000249009009009009, "loss": 0.4462, "step": 1142 }, { "epoch": 0.3426644182124789, "grad_norm": 0.24083632230758667, "learning_rate": 0.00024896396396396393, "loss": 0.4663, "step": 1143 }, { "epoch": 0.3429642121041784, "grad_norm": 0.24331289529800415, "learning_rate": 0.00024891891891891887, "loss": 0.4414, "step": 1144 }, { "epoch": 0.3432640059958778, "grad_norm": 0.24059659242630005, "learning_rate": 0.00024887387387387386, "loss": 0.4728, "step": 1145 }, { "epoch": 0.3435637998875773, "grad_norm": 0.23175103962421417, "learning_rate": 0.0002488288288288288, "loss": 0.437, "step": 1146 }, { "epoch": 0.3438635937792767, "grad_norm": 0.23247724771499634, "learning_rate": 0.00024878378378378373, "loss": 0.4454, "step": 1147 }, { "epoch": 0.3441633876709762, "grad_norm": 0.22808894515037537, "learning_rate": 0.0002487387387387387, "loss": 0.4379, "step": 1148 }, { "epoch": 0.34446318156267564, "grad_norm": 0.2515697777271271, "learning_rate": 0.00024869369369369366, "loss": 0.4938, "step": 1149 }, { "epoch": 0.3447629754543751, "grad_norm": 0.22830873727798462, "learning_rate": 0.0002486486486486486, "loss": 0.4335, "step": 1150 }, { "epoch": 0.34506276934607455, "grad_norm": 0.2352674901485443, "learning_rate": 0.0002486036036036036, "loss": 0.4661, "step": 1151 }, { "epoch": 0.34536256323777403, "grad_norm": 0.233395054936409, "learning_rate": 0.0002485585585585585, "loss": 0.4429, "step": 1152 }, { "epoch": 0.34566235712947346, "grad_norm": 0.2488911747932434, "learning_rate": 0.00024851351351351346, "loss": 0.4832, "step": 1153 }, { "epoch": 0.34596215102117295, "grad_norm": 0.2402927577495575, "learning_rate": 0.00024846846846846845, "loss": 0.4633, "step": 1154 }, { "epoch": 0.3462619449128724, "grad_norm": 0.23628897964954376, "learning_rate": 0.0002484234234234234, "loss": 0.4683, "step": 1155 }, { "epoch": 0.34656173880457186, "grad_norm": 0.23966571688652039, "learning_rate": 0.0002483783783783783, "loss": 0.4716, "step": 1156 }, { "epoch": 0.3468615326962713, "grad_norm": 0.23786477744579315, "learning_rate": 0.0002483333333333333, "loss": 0.4829, "step": 1157 }, { "epoch": 0.3471613265879708, "grad_norm": 0.24706590175628662, "learning_rate": 0.00024828828828828825, "loss": 0.4741, "step": 1158 }, { "epoch": 0.3474611204796702, "grad_norm": 0.25192269682884216, "learning_rate": 0.00024824324324324324, "loss": 0.4725, "step": 1159 }, { "epoch": 0.3477609143713697, "grad_norm": 0.2672080099582672, "learning_rate": 0.0002481981981981982, "loss": 0.4677, "step": 1160 }, { "epoch": 0.3480607082630691, "grad_norm": 0.23710590600967407, "learning_rate": 0.0002481531531531531, "loss": 0.4629, "step": 1161 }, { "epoch": 0.3483605021547686, "grad_norm": 0.2306007295846939, "learning_rate": 0.0002481081081081081, "loss": 0.4425, "step": 1162 }, { "epoch": 0.34866029604646803, "grad_norm": 0.2416974902153015, "learning_rate": 0.00024806306306306305, "loss": 0.4452, "step": 1163 }, { "epoch": 0.3489600899381675, "grad_norm": 0.2495068907737732, "learning_rate": 0.000248018018018018, "loss": 0.4875, "step": 1164 }, { "epoch": 0.34925988382986695, "grad_norm": 0.26994964480400085, "learning_rate": 0.00024797297297297297, "loss": 0.4957, "step": 1165 }, { "epoch": 0.34955967772156643, "grad_norm": 0.2546437978744507, "learning_rate": 0.0002479279279279279, "loss": 0.498, "step": 1166 }, { "epoch": 0.34985947161326586, "grad_norm": 0.2271340936422348, "learning_rate": 0.00024788288288288285, "loss": 0.4615, "step": 1167 }, { "epoch": 0.35015926550496534, "grad_norm": 0.24870999157428741, "learning_rate": 0.00024783783783783784, "loss": 0.4688, "step": 1168 }, { "epoch": 0.3504590593966648, "grad_norm": 0.23978911340236664, "learning_rate": 0.0002477927927927928, "loss": 0.4882, "step": 1169 }, { "epoch": 0.35075885328836426, "grad_norm": 0.2773337662220001, "learning_rate": 0.0002477477477477477, "loss": 0.4695, "step": 1170 }, { "epoch": 0.3510586471800637, "grad_norm": 0.24570350348949432, "learning_rate": 0.0002477027027027027, "loss": 0.4679, "step": 1171 }, { "epoch": 0.3513584410717632, "grad_norm": 0.25563982129096985, "learning_rate": 0.00024765765765765764, "loss": 0.4731, "step": 1172 }, { "epoch": 0.3516582349634626, "grad_norm": 0.23189115524291992, "learning_rate": 0.00024761261261261263, "loss": 0.4476, "step": 1173 }, { "epoch": 0.3519580288551621, "grad_norm": 0.24074453115463257, "learning_rate": 0.00024756756756756757, "loss": 0.4419, "step": 1174 }, { "epoch": 0.3522578227468615, "grad_norm": 0.2376662790775299, "learning_rate": 0.0002475225225225225, "loss": 0.4571, "step": 1175 }, { "epoch": 0.352557616638561, "grad_norm": 0.2344047725200653, "learning_rate": 0.0002474774774774775, "loss": 0.4389, "step": 1176 }, { "epoch": 0.35285741053026043, "grad_norm": 0.23310165107250214, "learning_rate": 0.00024743243243243243, "loss": 0.4583, "step": 1177 }, { "epoch": 0.3531572044219599, "grad_norm": 0.21277011930942535, "learning_rate": 0.00024738738738738737, "loss": 0.4306, "step": 1178 }, { "epoch": 0.35345699831365934, "grad_norm": 0.23581352829933167, "learning_rate": 0.00024734234234234236, "loss": 0.475, "step": 1179 }, { "epoch": 0.3537567922053588, "grad_norm": 0.23194879293441772, "learning_rate": 0.0002472972972972973, "loss": 0.4518, "step": 1180 }, { "epoch": 0.35405658609705826, "grad_norm": 0.22603453695774078, "learning_rate": 0.00024725225225225223, "loss": 0.4464, "step": 1181 }, { "epoch": 0.35435637998875774, "grad_norm": 0.23987054824829102, "learning_rate": 0.00024720720720720717, "loss": 0.4586, "step": 1182 }, { "epoch": 0.35465617388045717, "grad_norm": 0.22986359894275665, "learning_rate": 0.00024716216216216216, "loss": 0.4664, "step": 1183 }, { "epoch": 0.35495596777215666, "grad_norm": 0.22636739909648895, "learning_rate": 0.0002471171171171171, "loss": 0.4236, "step": 1184 }, { "epoch": 0.3552557616638561, "grad_norm": 0.2346397340297699, "learning_rate": 0.00024707207207207203, "loss": 0.4703, "step": 1185 }, { "epoch": 0.35555555555555557, "grad_norm": 0.2564719617366791, "learning_rate": 0.000247027027027027, "loss": 0.4775, "step": 1186 }, { "epoch": 0.355855349447255, "grad_norm": 0.22305525839328766, "learning_rate": 0.00024698198198198196, "loss": 0.4694, "step": 1187 }, { "epoch": 0.3561551433389545, "grad_norm": 0.2369467169046402, "learning_rate": 0.0002469369369369369, "loss": 0.4498, "step": 1188 }, { "epoch": 0.3564549372306539, "grad_norm": 0.25123798847198486, "learning_rate": 0.0002468918918918919, "loss": 0.4619, "step": 1189 }, { "epoch": 0.3567547311223534, "grad_norm": 0.21925069391727448, "learning_rate": 0.0002468468468468468, "loss": 0.4498, "step": 1190 }, { "epoch": 0.3570545250140528, "grad_norm": 0.2385261207818985, "learning_rate": 0.00024680180180180176, "loss": 0.4537, "step": 1191 }, { "epoch": 0.3573543189057523, "grad_norm": 0.23894301056861877, "learning_rate": 0.00024675675675675675, "loss": 0.4665, "step": 1192 }, { "epoch": 0.35765411279745174, "grad_norm": 0.23315206170082092, "learning_rate": 0.0002467117117117117, "loss": 0.4421, "step": 1193 }, { "epoch": 0.3579539066891512, "grad_norm": 0.23406696319580078, "learning_rate": 0.0002466666666666666, "loss": 0.4377, "step": 1194 }, { "epoch": 0.35825370058085065, "grad_norm": 0.25852885842323303, "learning_rate": 0.0002466216216216216, "loss": 0.4838, "step": 1195 }, { "epoch": 0.35855349447255014, "grad_norm": 0.24008771777153015, "learning_rate": 0.00024657657657657655, "loss": 0.4733, "step": 1196 }, { "epoch": 0.35885328836424957, "grad_norm": 0.228665292263031, "learning_rate": 0.0002465315315315315, "loss": 0.4753, "step": 1197 }, { "epoch": 0.35915308225594905, "grad_norm": 0.2344791293144226, "learning_rate": 0.0002464864864864865, "loss": 0.4484, "step": 1198 }, { "epoch": 0.3594528761476485, "grad_norm": 0.22843588888645172, "learning_rate": 0.0002464414414414414, "loss": 0.4349, "step": 1199 }, { "epoch": 0.35975267003934797, "grad_norm": 0.23127557337284088, "learning_rate": 0.00024639639639639636, "loss": 0.4466, "step": 1200 }, { "epoch": 0.3600524639310474, "grad_norm": 0.2092585414648056, "learning_rate": 0.00024635135135135135, "loss": 0.4101, "step": 1201 }, { "epoch": 0.3603522578227469, "grad_norm": 0.24416851997375488, "learning_rate": 0.0002463063063063063, "loss": 0.4507, "step": 1202 }, { "epoch": 0.3606520517144463, "grad_norm": 0.2409181445837021, "learning_rate": 0.0002462612612612612, "loss": 0.4641, "step": 1203 }, { "epoch": 0.3609518456061458, "grad_norm": 0.2390405684709549, "learning_rate": 0.0002462162162162162, "loss": 0.4398, "step": 1204 }, { "epoch": 0.3612516394978452, "grad_norm": 0.25821688771247864, "learning_rate": 0.00024617117117117115, "loss": 0.4441, "step": 1205 }, { "epoch": 0.3615514333895447, "grad_norm": 0.24180777370929718, "learning_rate": 0.0002461261261261261, "loss": 0.4852, "step": 1206 }, { "epoch": 0.36185122728124414, "grad_norm": 0.2260608822107315, "learning_rate": 0.000246081081081081, "loss": 0.4214, "step": 1207 }, { "epoch": 0.3621510211729436, "grad_norm": 0.2266250103712082, "learning_rate": 0.000246036036036036, "loss": 0.4104, "step": 1208 }, { "epoch": 0.36245081506464305, "grad_norm": 0.247540682554245, "learning_rate": 0.00024599099099099095, "loss": 0.4563, "step": 1209 }, { "epoch": 0.36275060895634254, "grad_norm": 0.22714072465896606, "learning_rate": 0.0002459459459459459, "loss": 0.4528, "step": 1210 }, { "epoch": 0.36305040284804196, "grad_norm": 0.22302433848381042, "learning_rate": 0.0002459009009009009, "loss": 0.3982, "step": 1211 }, { "epoch": 0.36335019673974145, "grad_norm": 0.2646171748638153, "learning_rate": 0.0002458558558558558, "loss": 0.4837, "step": 1212 }, { "epoch": 0.3636499906314409, "grad_norm": 0.24546460807323456, "learning_rate": 0.00024581081081081075, "loss": 0.4716, "step": 1213 }, { "epoch": 0.36394978452314036, "grad_norm": 0.2416929006576538, "learning_rate": 0.00024576576576576574, "loss": 0.4634, "step": 1214 }, { "epoch": 0.3642495784148398, "grad_norm": 0.2360236495733261, "learning_rate": 0.0002457207207207207, "loss": 0.4409, "step": 1215 }, { "epoch": 0.3645493723065393, "grad_norm": 0.24383249878883362, "learning_rate": 0.00024567567567567567, "loss": 0.4553, "step": 1216 }, { "epoch": 0.3648491661982387, "grad_norm": 0.2516370117664337, "learning_rate": 0.0002456306306306306, "loss": 0.4553, "step": 1217 }, { "epoch": 0.3651489600899382, "grad_norm": 0.2524015009403229, "learning_rate": 0.00024558558558558554, "loss": 0.4766, "step": 1218 }, { "epoch": 0.3654487539816376, "grad_norm": 0.23386207222938538, "learning_rate": 0.00024554054054054053, "loss": 0.439, "step": 1219 }, { "epoch": 0.3657485478733371, "grad_norm": 0.23544320464134216, "learning_rate": 0.00024549549549549547, "loss": 0.4414, "step": 1220 }, { "epoch": 0.36604834176503653, "grad_norm": 0.24809177219867706, "learning_rate": 0.0002454504504504504, "loss": 0.4577, "step": 1221 }, { "epoch": 0.366348135656736, "grad_norm": 0.24466872215270996, "learning_rate": 0.0002454054054054054, "loss": 0.4609, "step": 1222 }, { "epoch": 0.36664792954843545, "grad_norm": 0.24159879982471466, "learning_rate": 0.00024536036036036034, "loss": 0.448, "step": 1223 }, { "epoch": 0.36694772344013493, "grad_norm": 0.2456122189760208, "learning_rate": 0.00024531531531531527, "loss": 0.4563, "step": 1224 }, { "epoch": 0.36724751733183436, "grad_norm": 0.23266494274139404, "learning_rate": 0.00024527027027027026, "loss": 0.457, "step": 1225 }, { "epoch": 0.36754731122353385, "grad_norm": 0.24822424352169037, "learning_rate": 0.0002452252252252252, "loss": 0.4577, "step": 1226 }, { "epoch": 0.3678471051152333, "grad_norm": 0.2528662383556366, "learning_rate": 0.00024518018018018014, "loss": 0.4717, "step": 1227 }, { "epoch": 0.36814689900693276, "grad_norm": 0.22255463898181915, "learning_rate": 0.00024513513513513513, "loss": 0.4287, "step": 1228 }, { "epoch": 0.3684466928986322, "grad_norm": 0.23769044876098633, "learning_rate": 0.00024509009009009006, "loss": 0.4348, "step": 1229 }, { "epoch": 0.3687464867903317, "grad_norm": 0.23163922131061554, "learning_rate": 0.00024504504504504506, "loss": 0.4631, "step": 1230 }, { "epoch": 0.3690462806820311, "grad_norm": 0.22742880880832672, "learning_rate": 0.000245, "loss": 0.4344, "step": 1231 }, { "epoch": 0.3693460745737306, "grad_norm": 0.2314460575580597, "learning_rate": 0.00024495495495495493, "loss": 0.4507, "step": 1232 }, { "epoch": 0.36964586846543, "grad_norm": 0.21651825308799744, "learning_rate": 0.0002449099099099099, "loss": 0.422, "step": 1233 }, { "epoch": 0.3699456623571295, "grad_norm": 0.24304571747779846, "learning_rate": 0.00024486486486486486, "loss": 0.4322, "step": 1234 }, { "epoch": 0.37024545624882893, "grad_norm": 0.24105508625507355, "learning_rate": 0.0002448198198198198, "loss": 0.4367, "step": 1235 }, { "epoch": 0.37054525014052836, "grad_norm": 0.2495047152042389, "learning_rate": 0.0002447747747747748, "loss": 0.456, "step": 1236 }, { "epoch": 0.37084504403222784, "grad_norm": 0.2545395791530609, "learning_rate": 0.0002447297297297297, "loss": 0.4608, "step": 1237 }, { "epoch": 0.3711448379239273, "grad_norm": 0.2552499771118164, "learning_rate": 0.00024468468468468466, "loss": 0.4898, "step": 1238 }, { "epoch": 0.37144463181562676, "grad_norm": 0.2469811737537384, "learning_rate": 0.00024463963963963965, "loss": 0.4427, "step": 1239 }, { "epoch": 0.3717444257073262, "grad_norm": 0.23857302963733673, "learning_rate": 0.0002445945945945946, "loss": 0.4509, "step": 1240 }, { "epoch": 0.37204421959902567, "grad_norm": 0.2521422803401947, "learning_rate": 0.0002445495495495495, "loss": 0.4425, "step": 1241 }, { "epoch": 0.3723440134907251, "grad_norm": 0.24907280504703522, "learning_rate": 0.0002445045045045045, "loss": 0.4542, "step": 1242 }, { "epoch": 0.3726438073824246, "grad_norm": 0.23783591389656067, "learning_rate": 0.00024445945945945945, "loss": 0.4831, "step": 1243 }, { "epoch": 0.372943601274124, "grad_norm": 0.2376372069120407, "learning_rate": 0.0002444144144144144, "loss": 0.4514, "step": 1244 }, { "epoch": 0.3732433951658235, "grad_norm": 0.2387792468070984, "learning_rate": 0.0002443693693693694, "loss": 0.4593, "step": 1245 }, { "epoch": 0.37354318905752293, "grad_norm": 0.22432541847229004, "learning_rate": 0.0002443243243243243, "loss": 0.4462, "step": 1246 }, { "epoch": 0.3738429829492224, "grad_norm": 0.24190527200698853, "learning_rate": 0.00024427927927927925, "loss": 0.4645, "step": 1247 }, { "epoch": 0.37414277684092184, "grad_norm": 0.23738646507263184, "learning_rate": 0.00024423423423423424, "loss": 0.4594, "step": 1248 }, { "epoch": 0.3744425707326213, "grad_norm": 0.24582220613956451, "learning_rate": 0.0002441891891891892, "loss": 0.4632, "step": 1249 }, { "epoch": 0.37474236462432076, "grad_norm": 0.22717328369617462, "learning_rate": 0.0002441441441441441, "loss": 0.4372, "step": 1250 }, { "epoch": 0.37504215851602024, "grad_norm": 0.24414947628974915, "learning_rate": 0.0002440990990990991, "loss": 0.4458, "step": 1251 }, { "epoch": 0.37534195240771967, "grad_norm": 0.23710165917873383, "learning_rate": 0.00024405405405405404, "loss": 0.4468, "step": 1252 }, { "epoch": 0.37564174629941915, "grad_norm": 0.25462841987609863, "learning_rate": 0.00024400900900900898, "loss": 0.4616, "step": 1253 }, { "epoch": 0.3759415401911186, "grad_norm": 0.22636806964874268, "learning_rate": 0.00024396396396396392, "loss": 0.4259, "step": 1254 }, { "epoch": 0.37624133408281807, "grad_norm": 0.24978788197040558, "learning_rate": 0.0002439189189189189, "loss": 0.464, "step": 1255 }, { "epoch": 0.3765411279745175, "grad_norm": 0.2312556803226471, "learning_rate": 0.00024387387387387384, "loss": 0.4531, "step": 1256 }, { "epoch": 0.376840921866217, "grad_norm": 0.22814960777759552, "learning_rate": 0.00024382882882882878, "loss": 0.4608, "step": 1257 }, { "epoch": 0.3771407157579164, "grad_norm": 0.25135213136672974, "learning_rate": 0.00024378378378378377, "loss": 0.4551, "step": 1258 }, { "epoch": 0.3774405096496159, "grad_norm": 0.2209721952676773, "learning_rate": 0.0002437387387387387, "loss": 0.4679, "step": 1259 }, { "epoch": 0.3777403035413153, "grad_norm": 0.2256690412759781, "learning_rate": 0.00024369369369369365, "loss": 0.4388, "step": 1260 }, { "epoch": 0.3780400974330148, "grad_norm": 0.23604658246040344, "learning_rate": 0.00024364864864864864, "loss": 0.4393, "step": 1261 }, { "epoch": 0.37833989132471424, "grad_norm": 0.22875599563121796, "learning_rate": 0.00024360360360360357, "loss": 0.4176, "step": 1262 }, { "epoch": 0.3786396852164137, "grad_norm": 0.2428806722164154, "learning_rate": 0.00024355855855855854, "loss": 0.461, "step": 1263 }, { "epoch": 0.37893947910811315, "grad_norm": 0.2470446228981018, "learning_rate": 0.0002435135135135135, "loss": 0.462, "step": 1264 }, { "epoch": 0.37923927299981264, "grad_norm": 0.22954460978507996, "learning_rate": 0.00024346846846846844, "loss": 0.4529, "step": 1265 }, { "epoch": 0.37953906689151207, "grad_norm": 0.23748372495174408, "learning_rate": 0.0002434234234234234, "loss": 0.4598, "step": 1266 }, { "epoch": 0.37983886078321155, "grad_norm": 0.26300883293151855, "learning_rate": 0.00024337837837837837, "loss": 0.482, "step": 1267 }, { "epoch": 0.380138654674911, "grad_norm": 0.2546245753765106, "learning_rate": 0.0002433333333333333, "loss": 0.4607, "step": 1268 }, { "epoch": 0.38043844856661047, "grad_norm": 0.24974289536476135, "learning_rate": 0.00024328828828828827, "loss": 0.4713, "step": 1269 }, { "epoch": 0.3807382424583099, "grad_norm": 0.2457670271396637, "learning_rate": 0.00024324324324324323, "loss": 0.4709, "step": 1270 }, { "epoch": 0.3810380363500094, "grad_norm": 0.2360873520374298, "learning_rate": 0.00024319819819819817, "loss": 0.4653, "step": 1271 }, { "epoch": 0.3813378302417088, "grad_norm": 0.24448256194591522, "learning_rate": 0.00024315315315315313, "loss": 0.4651, "step": 1272 }, { "epoch": 0.3816376241334083, "grad_norm": 0.22578337788581848, "learning_rate": 0.0002431081081081081, "loss": 0.4163, "step": 1273 }, { "epoch": 0.3819374180251077, "grad_norm": 0.24973514676094055, "learning_rate": 0.00024306306306306306, "loss": 0.4638, "step": 1274 }, { "epoch": 0.3822372119168072, "grad_norm": 0.21938931941986084, "learning_rate": 0.000243018018018018, "loss": 0.4325, "step": 1275 }, { "epoch": 0.38253700580850664, "grad_norm": 0.23947425186634064, "learning_rate": 0.00024297297297297296, "loss": 0.4408, "step": 1276 }, { "epoch": 0.3828367997002061, "grad_norm": 0.23008406162261963, "learning_rate": 0.00024292792792792792, "loss": 0.4377, "step": 1277 }, { "epoch": 0.38313659359190555, "grad_norm": 0.24068063497543335, "learning_rate": 0.00024288288288288286, "loss": 0.4606, "step": 1278 }, { "epoch": 0.38343638748360503, "grad_norm": 0.2432139664888382, "learning_rate": 0.0002428378378378378, "loss": 0.4494, "step": 1279 }, { "epoch": 0.38373618137530446, "grad_norm": 0.22731392085552216, "learning_rate": 0.0002427927927927928, "loss": 0.426, "step": 1280 }, { "epoch": 0.38403597526700395, "grad_norm": 0.2352358102798462, "learning_rate": 0.00024274774774774772, "loss": 0.4537, "step": 1281 }, { "epoch": 0.3843357691587034, "grad_norm": 0.23868781328201294, "learning_rate": 0.00024270270270270266, "loss": 0.4641, "step": 1282 }, { "epoch": 0.38463556305040286, "grad_norm": 0.23498302698135376, "learning_rate": 0.00024265765765765765, "loss": 0.4563, "step": 1283 }, { "epoch": 0.3849353569421023, "grad_norm": 0.24769316613674164, "learning_rate": 0.0002426126126126126, "loss": 0.4567, "step": 1284 }, { "epoch": 0.3852351508338018, "grad_norm": 0.21658611297607422, "learning_rate": 0.00024256756756756753, "loss": 0.4437, "step": 1285 }, { "epoch": 0.3855349447255012, "grad_norm": 0.2677985727787018, "learning_rate": 0.00024252252252252252, "loss": 0.45, "step": 1286 }, { "epoch": 0.3858347386172007, "grad_norm": 0.23147153854370117, "learning_rate": 0.00024247747747747745, "loss": 0.4341, "step": 1287 }, { "epoch": 0.3861345325089001, "grad_norm": 0.2465144395828247, "learning_rate": 0.0002424324324324324, "loss": 0.4629, "step": 1288 }, { "epoch": 0.3864343264005996, "grad_norm": 0.23633845150470734, "learning_rate": 0.00024238738738738738, "loss": 0.4484, "step": 1289 }, { "epoch": 0.38673412029229903, "grad_norm": 0.22743773460388184, "learning_rate": 0.00024234234234234232, "loss": 0.4451, "step": 1290 }, { "epoch": 0.3870339141839985, "grad_norm": 0.233259379863739, "learning_rate": 0.00024229729729729726, "loss": 0.4546, "step": 1291 }, { "epoch": 0.38733370807569795, "grad_norm": 0.24213840067386627, "learning_rate": 0.00024225225225225225, "loss": 0.4378, "step": 1292 }, { "epoch": 0.38763350196739743, "grad_norm": 0.23812246322631836, "learning_rate": 0.00024220720720720718, "loss": 0.4613, "step": 1293 }, { "epoch": 0.38793329585909686, "grad_norm": 0.25436896085739136, "learning_rate": 0.00024216216216216212, "loss": 0.4462, "step": 1294 }, { "epoch": 0.38823308975079635, "grad_norm": 0.24321508407592773, "learning_rate": 0.0002421171171171171, "loss": 0.4674, "step": 1295 }, { "epoch": 0.3885328836424958, "grad_norm": 0.2434927523136139, "learning_rate": 0.00024207207207207205, "loss": 0.4317, "step": 1296 }, { "epoch": 0.38883267753419526, "grad_norm": 0.2564734220504761, "learning_rate": 0.000242027027027027, "loss": 0.4556, "step": 1297 }, { "epoch": 0.3891324714258947, "grad_norm": 0.26102596521377563, "learning_rate": 0.00024198198198198198, "loss": 0.4866, "step": 1298 }, { "epoch": 0.3894322653175942, "grad_norm": 0.26838192343711853, "learning_rate": 0.0002419369369369369, "loss": 0.4824, "step": 1299 }, { "epoch": 0.3897320592092936, "grad_norm": 0.24552933871746063, "learning_rate": 0.00024189189189189188, "loss": 0.4581, "step": 1300 }, { "epoch": 0.3900318531009931, "grad_norm": 0.2453169822692871, "learning_rate": 0.0002418468468468468, "loss": 0.4605, "step": 1301 }, { "epoch": 0.3903316469926925, "grad_norm": 0.24922487139701843, "learning_rate": 0.00024180180180180178, "loss": 0.4555, "step": 1302 }, { "epoch": 0.390631440884392, "grad_norm": 0.2683355212211609, "learning_rate": 0.00024175675675675674, "loss": 0.4674, "step": 1303 }, { "epoch": 0.39093123477609143, "grad_norm": 0.23587100207805634, "learning_rate": 0.00024171171171171168, "loss": 0.4564, "step": 1304 }, { "epoch": 0.3912310286677909, "grad_norm": 0.2331109195947647, "learning_rate": 0.00024166666666666664, "loss": 0.4499, "step": 1305 }, { "epoch": 0.39153082255949034, "grad_norm": 0.24744129180908203, "learning_rate": 0.0002416216216216216, "loss": 0.453, "step": 1306 }, { "epoch": 0.39183061645118983, "grad_norm": 0.228163942694664, "learning_rate": 0.00024157657657657654, "loss": 0.4219, "step": 1307 }, { "epoch": 0.39213041034288926, "grad_norm": 0.2409953773021698, "learning_rate": 0.00024153153153153153, "loss": 0.4322, "step": 1308 }, { "epoch": 0.39243020423458874, "grad_norm": 0.22519168257713318, "learning_rate": 0.00024148648648648647, "loss": 0.4177, "step": 1309 }, { "epoch": 0.39272999812628817, "grad_norm": 0.2466001957654953, "learning_rate": 0.0002414414414414414, "loss": 0.4747, "step": 1310 }, { "epoch": 0.39302979201798766, "grad_norm": 0.247292622923851, "learning_rate": 0.0002413963963963964, "loss": 0.4663, "step": 1311 }, { "epoch": 0.3933295859096871, "grad_norm": 0.23905499279499054, "learning_rate": 0.00024135135135135133, "loss": 0.4631, "step": 1312 }, { "epoch": 0.39362937980138657, "grad_norm": 0.27826932072639465, "learning_rate": 0.00024130630630630627, "loss": 0.4863, "step": 1313 }, { "epoch": 0.393929173693086, "grad_norm": 0.3269807994365692, "learning_rate": 0.00024126126126126126, "loss": 0.4751, "step": 1314 }, { "epoch": 0.3942289675847855, "grad_norm": 0.2593337595462799, "learning_rate": 0.0002412162162162162, "loss": 0.4754, "step": 1315 }, { "epoch": 0.3945287614764849, "grad_norm": 0.24511882662773132, "learning_rate": 0.00024117117117117114, "loss": 0.4704, "step": 1316 }, { "epoch": 0.3948285553681844, "grad_norm": 0.2439701408147812, "learning_rate": 0.00024112612612612613, "loss": 0.4367, "step": 1317 }, { "epoch": 0.3951283492598838, "grad_norm": 0.24651208519935608, "learning_rate": 0.00024108108108108106, "loss": 0.4635, "step": 1318 }, { "epoch": 0.3954281431515833, "grad_norm": 0.23849599063396454, "learning_rate": 0.000241036036036036, "loss": 0.4377, "step": 1319 }, { "epoch": 0.39572793704328274, "grad_norm": 0.23733258247375488, "learning_rate": 0.000240990990990991, "loss": 0.4401, "step": 1320 }, { "epoch": 0.3960277309349822, "grad_norm": 0.2643173336982727, "learning_rate": 0.00024094594594594593, "loss": 0.4572, "step": 1321 }, { "epoch": 0.39632752482668165, "grad_norm": 0.23576226830482483, "learning_rate": 0.00024090090090090086, "loss": 0.4693, "step": 1322 }, { "epoch": 0.39662731871838114, "grad_norm": 0.2418884038925171, "learning_rate": 0.00024085585585585586, "loss": 0.4453, "step": 1323 }, { "epoch": 0.39692711261008057, "grad_norm": 0.23336432874202728, "learning_rate": 0.0002408108108108108, "loss": 0.4663, "step": 1324 }, { "epoch": 0.39722690650178005, "grad_norm": 0.2603462040424347, "learning_rate": 0.00024076576576576573, "loss": 0.4643, "step": 1325 }, { "epoch": 0.3975267003934795, "grad_norm": 0.24288874864578247, "learning_rate": 0.0002407207207207207, "loss": 0.4442, "step": 1326 }, { "epoch": 0.39782649428517897, "grad_norm": 0.24161702394485474, "learning_rate": 0.00024067567567567566, "loss": 0.4773, "step": 1327 }, { "epoch": 0.3981262881768784, "grad_norm": 0.23539233207702637, "learning_rate": 0.0002406306306306306, "loss": 0.4416, "step": 1328 }, { "epoch": 0.3984260820685779, "grad_norm": 0.26161664724349976, "learning_rate": 0.00024058558558558556, "loss": 0.4672, "step": 1329 }, { "epoch": 0.3987258759602773, "grad_norm": 0.24159055948257446, "learning_rate": 0.00024054054054054052, "loss": 0.4563, "step": 1330 }, { "epoch": 0.39902566985197674, "grad_norm": 0.24944022297859192, "learning_rate": 0.00024049549549549548, "loss": 0.4554, "step": 1331 }, { "epoch": 0.3993254637436762, "grad_norm": 0.24242587387561798, "learning_rate": 0.00024045045045045042, "loss": 0.4343, "step": 1332 }, { "epoch": 0.39962525763537565, "grad_norm": 0.2406960427761078, "learning_rate": 0.00024040540540540539, "loss": 0.4469, "step": 1333 }, { "epoch": 0.39992505152707514, "grad_norm": 0.24025918543338776, "learning_rate": 0.00024036036036036035, "loss": 0.4056, "step": 1334 }, { "epoch": 0.40022484541877457, "grad_norm": 0.267782062292099, "learning_rate": 0.00024031531531531529, "loss": 0.4319, "step": 1335 }, { "epoch": 0.40052463931047405, "grad_norm": 0.2527433931827545, "learning_rate": 0.00024027027027027025, "loss": 0.4679, "step": 1336 }, { "epoch": 0.4008244332021735, "grad_norm": 0.2520921528339386, "learning_rate": 0.00024022522522522521, "loss": 0.4802, "step": 1337 }, { "epoch": 0.40112422709387296, "grad_norm": 0.24161456525325775, "learning_rate": 0.00024018018018018015, "loss": 0.4415, "step": 1338 }, { "epoch": 0.4014240209855724, "grad_norm": 0.24513588845729828, "learning_rate": 0.00024013513513513511, "loss": 0.445, "step": 1339 }, { "epoch": 0.4017238148772719, "grad_norm": 0.24400116503238678, "learning_rate": 0.00024009009009009008, "loss": 0.4283, "step": 1340 }, { "epoch": 0.4020236087689713, "grad_norm": 0.24796655774116516, "learning_rate": 0.00024004504504504502, "loss": 0.4434, "step": 1341 }, { "epoch": 0.4023234026606708, "grad_norm": 0.2471655309200287, "learning_rate": 0.00023999999999999998, "loss": 0.4378, "step": 1342 }, { "epoch": 0.4026231965523702, "grad_norm": 0.2507822811603546, "learning_rate": 0.00023995495495495494, "loss": 0.4623, "step": 1343 }, { "epoch": 0.4029229904440697, "grad_norm": 0.24304543435573578, "learning_rate": 0.00023990990990990988, "loss": 0.4355, "step": 1344 }, { "epoch": 0.40322278433576914, "grad_norm": 0.240738183259964, "learning_rate": 0.00023986486486486487, "loss": 0.4454, "step": 1345 }, { "epoch": 0.4035225782274686, "grad_norm": 0.2353314906358719, "learning_rate": 0.0002398198198198198, "loss": 0.4493, "step": 1346 }, { "epoch": 0.40382237211916805, "grad_norm": 0.24467633664608002, "learning_rate": 0.00023977477477477474, "loss": 0.4771, "step": 1347 }, { "epoch": 0.40412216601086753, "grad_norm": 0.22876618802547455, "learning_rate": 0.00023972972972972974, "loss": 0.4311, "step": 1348 }, { "epoch": 0.40442195990256696, "grad_norm": 0.24602358043193817, "learning_rate": 0.00023968468468468467, "loss": 0.4676, "step": 1349 }, { "epoch": 0.40472175379426645, "grad_norm": 0.2410927563905716, "learning_rate": 0.0002396396396396396, "loss": 0.4564, "step": 1350 }, { "epoch": 0.4050215476859659, "grad_norm": 0.22765080630779266, "learning_rate": 0.00023959459459459455, "loss": 0.447, "step": 1351 }, { "epoch": 0.40532134157766536, "grad_norm": 0.24429920315742493, "learning_rate": 0.00023954954954954954, "loss": 0.4734, "step": 1352 }, { "epoch": 0.4056211354693648, "grad_norm": 0.2417088747024536, "learning_rate": 0.00023950450450450447, "loss": 0.4465, "step": 1353 }, { "epoch": 0.4059209293610643, "grad_norm": 0.25090181827545166, "learning_rate": 0.00023945945945945944, "loss": 0.4754, "step": 1354 }, { "epoch": 0.4062207232527637, "grad_norm": 0.255610853433609, "learning_rate": 0.0002394144144144144, "loss": 0.4891, "step": 1355 }, { "epoch": 0.4065205171444632, "grad_norm": 0.22734206914901733, "learning_rate": 0.00023936936936936934, "loss": 0.4279, "step": 1356 }, { "epoch": 0.4068203110361626, "grad_norm": 0.24400131404399872, "learning_rate": 0.0002393243243243243, "loss": 0.4493, "step": 1357 }, { "epoch": 0.4071201049278621, "grad_norm": 0.2442186176776886, "learning_rate": 0.00023927927927927927, "loss": 0.4347, "step": 1358 }, { "epoch": 0.40741989881956153, "grad_norm": 0.2278696894645691, "learning_rate": 0.0002392342342342342, "loss": 0.4346, "step": 1359 }, { "epoch": 0.407719692711261, "grad_norm": 0.23576432466506958, "learning_rate": 0.00023918918918918917, "loss": 0.4279, "step": 1360 }, { "epoch": 0.40801948660296045, "grad_norm": 0.25753775238990784, "learning_rate": 0.00023914414414414413, "loss": 0.465, "step": 1361 }, { "epoch": 0.40831928049465993, "grad_norm": 0.232134148478508, "learning_rate": 0.00023909909909909907, "loss": 0.4467, "step": 1362 }, { "epoch": 0.40861907438635936, "grad_norm": 0.25058963894844055, "learning_rate": 0.00023905405405405403, "loss": 0.4496, "step": 1363 }, { "epoch": 0.40891886827805884, "grad_norm": 0.24595490097999573, "learning_rate": 0.000239009009009009, "loss": 0.4625, "step": 1364 }, { "epoch": 0.4092186621697583, "grad_norm": 0.2425229847431183, "learning_rate": 0.00023896396396396393, "loss": 0.4757, "step": 1365 }, { "epoch": 0.40951845606145776, "grad_norm": 0.26115790009498596, "learning_rate": 0.0002389189189189189, "loss": 0.4619, "step": 1366 }, { "epoch": 0.4098182499531572, "grad_norm": 0.22355914115905762, "learning_rate": 0.00023887387387387386, "loss": 0.4234, "step": 1367 }, { "epoch": 0.4101180438448567, "grad_norm": 0.22979210317134857, "learning_rate": 0.00023882882882882882, "loss": 0.4472, "step": 1368 }, { "epoch": 0.4104178377365561, "grad_norm": 0.2713291645050049, "learning_rate": 0.00023878378378378376, "loss": 0.4819, "step": 1369 }, { "epoch": 0.4107176316282556, "grad_norm": 0.22952406108379364, "learning_rate": 0.00023873873873873872, "loss": 0.4428, "step": 1370 }, { "epoch": 0.411017425519955, "grad_norm": 0.24338556826114655, "learning_rate": 0.0002386936936936937, "loss": 0.4703, "step": 1371 }, { "epoch": 0.4113172194116545, "grad_norm": 0.24610304832458496, "learning_rate": 0.00023864864864864862, "loss": 0.4583, "step": 1372 }, { "epoch": 0.41161701330335393, "grad_norm": 0.23116329312324524, "learning_rate": 0.0002386036036036036, "loss": 0.4234, "step": 1373 }, { "epoch": 0.4119168071950534, "grad_norm": 0.24718856811523438, "learning_rate": 0.00023855855855855855, "loss": 0.4507, "step": 1374 }, { "epoch": 0.41221660108675284, "grad_norm": 0.25558093190193176, "learning_rate": 0.0002385135135135135, "loss": 0.4609, "step": 1375 }, { "epoch": 0.4125163949784523, "grad_norm": 0.2391168475151062, "learning_rate": 0.00023846846846846843, "loss": 0.4334, "step": 1376 }, { "epoch": 0.41281618887015176, "grad_norm": 0.242578387260437, "learning_rate": 0.00023842342342342342, "loss": 0.472, "step": 1377 }, { "epoch": 0.41311598276185124, "grad_norm": 0.24465034902095795, "learning_rate": 0.00023837837837837835, "loss": 0.4584, "step": 1378 }, { "epoch": 0.41341577665355067, "grad_norm": 0.2500922679901123, "learning_rate": 0.0002383333333333333, "loss": 0.468, "step": 1379 }, { "epoch": 0.41371557054525016, "grad_norm": 0.23939989507198334, "learning_rate": 0.00023828828828828828, "loss": 0.443, "step": 1380 }, { "epoch": 0.4140153644369496, "grad_norm": 0.272876113653183, "learning_rate": 0.00023824324324324322, "loss": 0.4665, "step": 1381 }, { "epoch": 0.41431515832864907, "grad_norm": 0.2664034068584442, "learning_rate": 0.00023819819819819815, "loss": 0.4855, "step": 1382 }, { "epoch": 0.4146149522203485, "grad_norm": 0.23809301853179932, "learning_rate": 0.00023815315315315315, "loss": 0.4483, "step": 1383 }, { "epoch": 0.414914746112048, "grad_norm": 0.23019112646579742, "learning_rate": 0.00023810810810810808, "loss": 0.4324, "step": 1384 }, { "epoch": 0.4152145400037474, "grad_norm": 0.25093144178390503, "learning_rate": 0.00023806306306306302, "loss": 0.4732, "step": 1385 }, { "epoch": 0.4155143338954469, "grad_norm": 0.23062798380851746, "learning_rate": 0.000238018018018018, "loss": 0.4489, "step": 1386 }, { "epoch": 0.4158141277871463, "grad_norm": 0.23424816131591797, "learning_rate": 0.00023797297297297295, "loss": 0.4586, "step": 1387 }, { "epoch": 0.4161139216788458, "grad_norm": 0.23515811562538147, "learning_rate": 0.0002379279279279279, "loss": 0.4478, "step": 1388 }, { "epoch": 0.41641371557054524, "grad_norm": 0.23988017439842224, "learning_rate": 0.00023788288288288287, "loss": 0.4382, "step": 1389 }, { "epoch": 0.4167135094622447, "grad_norm": 0.23148085176944733, "learning_rate": 0.0002378378378378378, "loss": 0.4476, "step": 1390 }, { "epoch": 0.41701330335394415, "grad_norm": 0.22274036705493927, "learning_rate": 0.00023779279279279278, "loss": 0.4356, "step": 1391 }, { "epoch": 0.41731309724564364, "grad_norm": 0.22446554899215698, "learning_rate": 0.00023774774774774774, "loss": 0.438, "step": 1392 }, { "epoch": 0.41761289113734307, "grad_norm": 0.2218816876411438, "learning_rate": 0.00023770270270270268, "loss": 0.4137, "step": 1393 }, { "epoch": 0.41791268502904255, "grad_norm": 0.23347264528274536, "learning_rate": 0.00023765765765765764, "loss": 0.4361, "step": 1394 }, { "epoch": 0.418212478920742, "grad_norm": 0.26775455474853516, "learning_rate": 0.0002376126126126126, "loss": 0.5153, "step": 1395 }, { "epoch": 0.41851227281244147, "grad_norm": 0.2503002882003784, "learning_rate": 0.00023756756756756754, "loss": 0.4705, "step": 1396 }, { "epoch": 0.4188120667041409, "grad_norm": 0.2259148210287094, "learning_rate": 0.0002375225225225225, "loss": 0.4251, "step": 1397 }, { "epoch": 0.4191118605958404, "grad_norm": 0.24358846247196198, "learning_rate": 0.00023747747747747744, "loss": 0.4483, "step": 1398 }, { "epoch": 0.4194116544875398, "grad_norm": 0.24675339460372925, "learning_rate": 0.0002374324324324324, "loss": 0.4379, "step": 1399 }, { "epoch": 0.4197114483792393, "grad_norm": 0.24181531369686127, "learning_rate": 0.00023738738738738737, "loss": 0.4498, "step": 1400 }, { "epoch": 0.4200112422709387, "grad_norm": 0.24917852878570557, "learning_rate": 0.0002373423423423423, "loss": 0.4436, "step": 1401 }, { "epoch": 0.4203110361626382, "grad_norm": 0.24392594397068024, "learning_rate": 0.0002372972972972973, "loss": 0.4791, "step": 1402 }, { "epoch": 0.42061083005433764, "grad_norm": 0.2411157488822937, "learning_rate": 0.00023725225225225223, "loss": 0.472, "step": 1403 }, { "epoch": 0.4209106239460371, "grad_norm": 0.2558772563934326, "learning_rate": 0.00023720720720720717, "loss": 0.4914, "step": 1404 }, { "epoch": 0.42121041783773655, "grad_norm": 0.2580840587615967, "learning_rate": 0.00023716216216216216, "loss": 0.4502, "step": 1405 }, { "epoch": 0.42151021172943604, "grad_norm": 0.23793622851371765, "learning_rate": 0.0002371171171171171, "loss": 0.4599, "step": 1406 }, { "epoch": 0.42181000562113546, "grad_norm": 0.23233507573604584, "learning_rate": 0.00023707207207207203, "loss": 0.4268, "step": 1407 }, { "epoch": 0.42210979951283495, "grad_norm": 0.24053210020065308, "learning_rate": 0.00023702702702702703, "loss": 0.4233, "step": 1408 }, { "epoch": 0.4224095934045344, "grad_norm": 0.2314370572566986, "learning_rate": 0.00023698198198198196, "loss": 0.427, "step": 1409 }, { "epoch": 0.42270938729623386, "grad_norm": 0.2242741882801056, "learning_rate": 0.0002369369369369369, "loss": 0.4097, "step": 1410 }, { "epoch": 0.4230091811879333, "grad_norm": 0.24178822338581085, "learning_rate": 0.0002368918918918919, "loss": 0.4259, "step": 1411 }, { "epoch": 0.4233089750796328, "grad_norm": 0.23346510529518127, "learning_rate": 0.00023684684684684683, "loss": 0.4442, "step": 1412 }, { "epoch": 0.4236087689713322, "grad_norm": 0.23908735811710358, "learning_rate": 0.00023680180180180176, "loss": 0.432, "step": 1413 }, { "epoch": 0.4239085628630317, "grad_norm": 0.24085824191570282, "learning_rate": 0.00023675675675675675, "loss": 0.4648, "step": 1414 }, { "epoch": 0.4242083567547311, "grad_norm": 0.2325652688741684, "learning_rate": 0.0002367117117117117, "loss": 0.4499, "step": 1415 }, { "epoch": 0.4245081506464306, "grad_norm": 0.2413867712020874, "learning_rate": 0.00023666666666666663, "loss": 0.4407, "step": 1416 }, { "epoch": 0.42480794453813003, "grad_norm": 0.2494458556175232, "learning_rate": 0.00023662162162162162, "loss": 0.4828, "step": 1417 }, { "epoch": 0.4251077384298295, "grad_norm": 0.24031957983970642, "learning_rate": 0.00023657657657657656, "loss": 0.4453, "step": 1418 }, { "epoch": 0.42540753232152895, "grad_norm": 0.2490081638097763, "learning_rate": 0.0002365315315315315, "loss": 0.4595, "step": 1419 }, { "epoch": 0.42570732621322843, "grad_norm": 0.2513922452926636, "learning_rate": 0.00023648648648648648, "loss": 0.4507, "step": 1420 }, { "epoch": 0.42600712010492786, "grad_norm": 0.23692888021469116, "learning_rate": 0.00023644144144144142, "loss": 0.4339, "step": 1421 }, { "epoch": 0.42630691399662735, "grad_norm": 0.22867028415203094, "learning_rate": 0.00023639639639639636, "loss": 0.437, "step": 1422 }, { "epoch": 0.4266067078883268, "grad_norm": 0.23194801807403564, "learning_rate": 0.00023635135135135132, "loss": 0.4558, "step": 1423 }, { "epoch": 0.42690650178002626, "grad_norm": 0.23193570971488953, "learning_rate": 0.00023630630630630628, "loss": 0.4446, "step": 1424 }, { "epoch": 0.4272062956717257, "grad_norm": 0.2337258905172348, "learning_rate": 0.00023626126126126125, "loss": 0.4337, "step": 1425 }, { "epoch": 0.4275060895634251, "grad_norm": 0.23658838868141174, "learning_rate": 0.00023621621621621619, "loss": 0.4288, "step": 1426 }, { "epoch": 0.4278058834551246, "grad_norm": 0.26249489188194275, "learning_rate": 0.00023617117117117115, "loss": 0.4673, "step": 1427 }, { "epoch": 0.42810567734682403, "grad_norm": 0.2465837299823761, "learning_rate": 0.0002361261261261261, "loss": 0.4357, "step": 1428 }, { "epoch": 0.4284054712385235, "grad_norm": 0.26788344979286194, "learning_rate": 0.00023608108108108105, "loss": 0.4465, "step": 1429 }, { "epoch": 0.42870526513022295, "grad_norm": 0.2597041726112366, "learning_rate": 0.00023603603603603601, "loss": 0.4433, "step": 1430 }, { "epoch": 0.42900505902192243, "grad_norm": 0.235942080616951, "learning_rate": 0.00023599099099099098, "loss": 0.4322, "step": 1431 }, { "epoch": 0.42930485291362186, "grad_norm": 0.25687772035598755, "learning_rate": 0.00023594594594594591, "loss": 0.478, "step": 1432 }, { "epoch": 0.42960464680532134, "grad_norm": 0.2209557294845581, "learning_rate": 0.00023590090090090088, "loss": 0.4254, "step": 1433 }, { "epoch": 0.4299044406970208, "grad_norm": 0.2533595860004425, "learning_rate": 0.00023585585585585584, "loss": 0.4545, "step": 1434 }, { "epoch": 0.43020423458872026, "grad_norm": 0.2461264431476593, "learning_rate": 0.00023581081081081078, "loss": 0.4642, "step": 1435 }, { "epoch": 0.4305040284804197, "grad_norm": 0.23638790845870972, "learning_rate": 0.00023576576576576577, "loss": 0.4403, "step": 1436 }, { "epoch": 0.43080382237211917, "grad_norm": 0.23931315541267395, "learning_rate": 0.0002357207207207207, "loss": 0.4356, "step": 1437 }, { "epoch": 0.4311036162638186, "grad_norm": 0.26020053029060364, "learning_rate": 0.00023567567567567564, "loss": 0.4562, "step": 1438 }, { "epoch": 0.4314034101555181, "grad_norm": 0.25653496384620667, "learning_rate": 0.00023563063063063063, "loss": 0.473, "step": 1439 }, { "epoch": 0.4317032040472175, "grad_norm": 0.23115600645542145, "learning_rate": 0.00023558558558558557, "loss": 0.4259, "step": 1440 }, { "epoch": 0.432002997938917, "grad_norm": 0.240982785820961, "learning_rate": 0.0002355405405405405, "loss": 0.4507, "step": 1441 }, { "epoch": 0.43230279183061643, "grad_norm": 0.2819494307041168, "learning_rate": 0.0002354954954954955, "loss": 0.4539, "step": 1442 }, { "epoch": 0.4326025857223159, "grad_norm": 0.24356286227703094, "learning_rate": 0.00023545045045045044, "loss": 0.4377, "step": 1443 }, { "epoch": 0.43290237961401534, "grad_norm": 0.23919035494327545, "learning_rate": 0.00023540540540540537, "loss": 0.4141, "step": 1444 }, { "epoch": 0.4332021735057148, "grad_norm": 0.27333101630210876, "learning_rate": 0.00023536036036036036, "loss": 0.4655, "step": 1445 }, { "epoch": 0.43350196739741426, "grad_norm": 0.24128217995166779, "learning_rate": 0.0002353153153153153, "loss": 0.4483, "step": 1446 }, { "epoch": 0.43380176128911374, "grad_norm": 0.2448810636997223, "learning_rate": 0.00023527027027027024, "loss": 0.4376, "step": 1447 }, { "epoch": 0.43410155518081317, "grad_norm": 0.24084526300430298, "learning_rate": 0.0002352252252252252, "loss": 0.4228, "step": 1448 }, { "epoch": 0.43440134907251265, "grad_norm": 0.2719487249851227, "learning_rate": 0.00023518018018018016, "loss": 0.4564, "step": 1449 }, { "epoch": 0.4347011429642121, "grad_norm": 0.23745928704738617, "learning_rate": 0.0002351351351351351, "loss": 0.4283, "step": 1450 }, { "epoch": 0.43500093685591157, "grad_norm": 0.22788289189338684, "learning_rate": 0.00023509009009009007, "loss": 0.4215, "step": 1451 }, { "epoch": 0.435300730747611, "grad_norm": 0.2702344059944153, "learning_rate": 0.00023504504504504503, "loss": 0.4425, "step": 1452 }, { "epoch": 0.4356005246393105, "grad_norm": 0.2503633499145508, "learning_rate": 0.00023499999999999997, "loss": 0.449, "step": 1453 }, { "epoch": 0.4359003185310099, "grad_norm": 0.2610470950603485, "learning_rate": 0.00023495495495495493, "loss": 0.4708, "step": 1454 }, { "epoch": 0.4362001124227094, "grad_norm": 0.2375205010175705, "learning_rate": 0.0002349099099099099, "loss": 0.4426, "step": 1455 }, { "epoch": 0.4364999063144088, "grad_norm": 0.2531331777572632, "learning_rate": 0.00023486486486486483, "loss": 0.4516, "step": 1456 }, { "epoch": 0.4367997002061083, "grad_norm": 0.23722784221172333, "learning_rate": 0.0002348198198198198, "loss": 0.4345, "step": 1457 }, { "epoch": 0.43709949409780774, "grad_norm": 0.2411564141511917, "learning_rate": 0.00023477477477477476, "loss": 0.4227, "step": 1458 }, { "epoch": 0.4373992879895072, "grad_norm": 0.22307904064655304, "learning_rate": 0.00023472972972972972, "loss": 0.4254, "step": 1459 }, { "epoch": 0.43769908188120665, "grad_norm": 0.273569256067276, "learning_rate": 0.00023468468468468466, "loss": 0.4384, "step": 1460 }, { "epoch": 0.43799887577290614, "grad_norm": 0.24101589620113373, "learning_rate": 0.00023463963963963962, "loss": 0.4694, "step": 1461 }, { "epoch": 0.43829866966460557, "grad_norm": 0.2409631907939911, "learning_rate": 0.0002345945945945946, "loss": 0.4515, "step": 1462 }, { "epoch": 0.43859846355630505, "grad_norm": 0.24057404696941376, "learning_rate": 0.00023454954954954952, "loss": 0.4516, "step": 1463 }, { "epoch": 0.4388982574480045, "grad_norm": 0.24539843201637268, "learning_rate": 0.0002345045045045045, "loss": 0.4078, "step": 1464 }, { "epoch": 0.43919805133970397, "grad_norm": 0.23763391375541687, "learning_rate": 0.00023445945945945945, "loss": 0.4538, "step": 1465 }, { "epoch": 0.4394978452314034, "grad_norm": 0.25087833404541016, "learning_rate": 0.0002344144144144144, "loss": 0.4474, "step": 1466 }, { "epoch": 0.4397976391231029, "grad_norm": 0.24220441281795502, "learning_rate": 0.00023436936936936935, "loss": 0.4527, "step": 1467 }, { "epoch": 0.4400974330148023, "grad_norm": 0.23056988418102264, "learning_rate": 0.00023432432432432432, "loss": 0.4375, "step": 1468 }, { "epoch": 0.4403972269065018, "grad_norm": 0.23940956592559814, "learning_rate": 0.00023427927927927925, "loss": 0.4561, "step": 1469 }, { "epoch": 0.4406970207982012, "grad_norm": 0.23279373347759247, "learning_rate": 0.0002342342342342342, "loss": 0.4342, "step": 1470 }, { "epoch": 0.4409968146899007, "grad_norm": 0.23729127645492554, "learning_rate": 0.00023418918918918918, "loss": 0.4196, "step": 1471 }, { "epoch": 0.44129660858160014, "grad_norm": 0.2296978086233139, "learning_rate": 0.00023414414414414412, "loss": 0.4308, "step": 1472 }, { "epoch": 0.4415964024732996, "grad_norm": 0.24595269560813904, "learning_rate": 0.00023409909909909905, "loss": 0.4607, "step": 1473 }, { "epoch": 0.44189619636499905, "grad_norm": 0.2266230583190918, "learning_rate": 0.00023405405405405404, "loss": 0.4233, "step": 1474 }, { "epoch": 0.44219599025669853, "grad_norm": 0.23812752962112427, "learning_rate": 0.00023400900900900898, "loss": 0.4341, "step": 1475 }, { "epoch": 0.44249578414839796, "grad_norm": 0.2498784214258194, "learning_rate": 0.00023396396396396392, "loss": 0.4406, "step": 1476 }, { "epoch": 0.44279557804009745, "grad_norm": 0.26205873489379883, "learning_rate": 0.0002339189189189189, "loss": 0.463, "step": 1477 }, { "epoch": 0.4430953719317969, "grad_norm": 0.22054405510425568, "learning_rate": 0.00023387387387387385, "loss": 0.4142, "step": 1478 }, { "epoch": 0.44339516582349636, "grad_norm": 0.2611534893512726, "learning_rate": 0.00023382882882882878, "loss": 0.4985, "step": 1479 }, { "epoch": 0.4436949597151958, "grad_norm": 0.2515600323677063, "learning_rate": 0.00023378378378378377, "loss": 0.4567, "step": 1480 }, { "epoch": 0.4439947536068953, "grad_norm": 0.23842091858386993, "learning_rate": 0.0002337387387387387, "loss": 0.4301, "step": 1481 }, { "epoch": 0.4442945474985947, "grad_norm": 0.21718434989452362, "learning_rate": 0.00023369369369369367, "loss": 0.4444, "step": 1482 }, { "epoch": 0.4445943413902942, "grad_norm": 0.2340681105852127, "learning_rate": 0.00023364864864864864, "loss": 0.4555, "step": 1483 }, { "epoch": 0.4448941352819936, "grad_norm": 0.24483737349510193, "learning_rate": 0.00023360360360360357, "loss": 0.4419, "step": 1484 }, { "epoch": 0.4451939291736931, "grad_norm": 0.24512210488319397, "learning_rate": 0.00023355855855855854, "loss": 0.427, "step": 1485 }, { "epoch": 0.44549372306539253, "grad_norm": 0.21723505854606628, "learning_rate": 0.0002335135135135135, "loss": 0.4075, "step": 1486 }, { "epoch": 0.445793516957092, "grad_norm": 0.23696058988571167, "learning_rate": 0.00023346846846846844, "loss": 0.4332, "step": 1487 }, { "epoch": 0.44609331084879145, "grad_norm": 0.23997806012630463, "learning_rate": 0.0002334234234234234, "loss": 0.4438, "step": 1488 }, { "epoch": 0.44639310474049093, "grad_norm": 0.2866729497909546, "learning_rate": 0.00023337837837837837, "loss": 0.4851, "step": 1489 }, { "epoch": 0.44669289863219036, "grad_norm": 0.22988809645175934, "learning_rate": 0.0002333333333333333, "loss": 0.4185, "step": 1490 }, { "epoch": 0.44699269252388985, "grad_norm": 0.24402973055839539, "learning_rate": 0.00023328828828828827, "loss": 0.4277, "step": 1491 }, { "epoch": 0.4472924864155893, "grad_norm": 0.25479069352149963, "learning_rate": 0.00023324324324324323, "loss": 0.4593, "step": 1492 }, { "epoch": 0.44759228030728876, "grad_norm": 0.27010777592658997, "learning_rate": 0.0002331981981981982, "loss": 0.4391, "step": 1493 }, { "epoch": 0.4478920741989882, "grad_norm": 0.2443162202835083, "learning_rate": 0.00023315315315315313, "loss": 0.4485, "step": 1494 }, { "epoch": 0.4481918680906877, "grad_norm": 0.23816895484924316, "learning_rate": 0.00023310810810810807, "loss": 0.4606, "step": 1495 }, { "epoch": 0.4484916619823871, "grad_norm": 0.25230711698532104, "learning_rate": 0.00023306306306306306, "loss": 0.4479, "step": 1496 }, { "epoch": 0.4487914558740866, "grad_norm": 0.23312939703464508, "learning_rate": 0.000233018018018018, "loss": 0.4157, "step": 1497 }, { "epoch": 0.449091249765786, "grad_norm": 0.2355630099773407, "learning_rate": 0.00023297297297297293, "loss": 0.4428, "step": 1498 }, { "epoch": 0.4493910436574855, "grad_norm": 0.21646787226200104, "learning_rate": 0.00023292792792792792, "loss": 0.4166, "step": 1499 }, { "epoch": 0.44969083754918493, "grad_norm": 0.2547577917575836, "learning_rate": 0.00023288288288288286, "loss": 0.4517, "step": 1500 }, { "epoch": 0.44969083754918493, "eval_loss": 0.4507916569709778, "eval_runtime": 565.1356, "eval_samples_per_second": 3.82, "eval_steps_per_second": 0.478, "step": 1500 }, { "epoch": 0.4499906314408844, "grad_norm": 0.22076524794101715, "learning_rate": 0.0002328378378378378, "loss": 0.4395, "step": 1501 }, { "epoch": 0.45029042533258384, "grad_norm": 0.23838326334953308, "learning_rate": 0.0002327927927927928, "loss": 0.456, "step": 1502 }, { "epoch": 0.45059021922428333, "grad_norm": 0.23766019940376282, "learning_rate": 0.00023274774774774773, "loss": 0.4506, "step": 1503 }, { "epoch": 0.45089001311598276, "grad_norm": 0.2391175776720047, "learning_rate": 0.00023270270270270266, "loss": 0.4484, "step": 1504 }, { "epoch": 0.45118980700768224, "grad_norm": 0.24999289214611053, "learning_rate": 0.00023265765765765765, "loss": 0.4524, "step": 1505 }, { "epoch": 0.45148960089938167, "grad_norm": 0.24920374155044556, "learning_rate": 0.0002326126126126126, "loss": 0.4626, "step": 1506 }, { "epoch": 0.45178939479108116, "grad_norm": 0.24506935477256775, "learning_rate": 0.00023256756756756753, "loss": 0.438, "step": 1507 }, { "epoch": 0.4520891886827806, "grad_norm": 0.2410869002342224, "learning_rate": 0.00023252252252252252, "loss": 0.4579, "step": 1508 }, { "epoch": 0.45238898257448007, "grad_norm": 0.2394392192363739, "learning_rate": 0.00023247747747747745, "loss": 0.4701, "step": 1509 }, { "epoch": 0.4526887764661795, "grad_norm": 0.24809177219867706, "learning_rate": 0.0002324324324324324, "loss": 0.4521, "step": 1510 }, { "epoch": 0.452988570357879, "grad_norm": 0.24093541502952576, "learning_rate": 0.00023238738738738738, "loss": 0.4364, "step": 1511 }, { "epoch": 0.4532883642495784, "grad_norm": 0.24750453233718872, "learning_rate": 0.00023234234234234232, "loss": 0.454, "step": 1512 }, { "epoch": 0.4535881581412779, "grad_norm": 0.24669384956359863, "learning_rate": 0.00023229729729729726, "loss": 0.4687, "step": 1513 }, { "epoch": 0.4538879520329773, "grad_norm": 0.258184015750885, "learning_rate": 0.00023225225225225225, "loss": 0.4741, "step": 1514 }, { "epoch": 0.4541877459246768, "grad_norm": 0.23264341056346893, "learning_rate": 0.00023220720720720718, "loss": 0.4255, "step": 1515 }, { "epoch": 0.45448753981637624, "grad_norm": 0.24050508439540863, "learning_rate": 0.00023216216216216215, "loss": 0.4465, "step": 1516 }, { "epoch": 0.4547873337080757, "grad_norm": 0.23079554736614227, "learning_rate": 0.0002321171171171171, "loss": 0.4232, "step": 1517 }, { "epoch": 0.45508712759977515, "grad_norm": 0.22280898690223694, "learning_rate": 0.00023207207207207205, "loss": 0.4233, "step": 1518 }, { "epoch": 0.45538692149147464, "grad_norm": 0.24419550597667694, "learning_rate": 0.000232027027027027, "loss": 0.453, "step": 1519 }, { "epoch": 0.45568671538317407, "grad_norm": 0.2578713595867157, "learning_rate": 0.00023198198198198195, "loss": 0.4701, "step": 1520 }, { "epoch": 0.4559865092748735, "grad_norm": 0.24617789685726166, "learning_rate": 0.0002319369369369369, "loss": 0.4744, "step": 1521 }, { "epoch": 0.456286303166573, "grad_norm": 0.2564181387424469, "learning_rate": 0.00023189189189189188, "loss": 0.4672, "step": 1522 }, { "epoch": 0.4565860970582724, "grad_norm": 0.23741687834262848, "learning_rate": 0.0002318468468468468, "loss": 0.4337, "step": 1523 }, { "epoch": 0.4568858909499719, "grad_norm": 0.2671225070953369, "learning_rate": 0.00023180180180180178, "loss": 0.4723, "step": 1524 }, { "epoch": 0.4571856848416713, "grad_norm": 0.2585636377334595, "learning_rate": 0.00023175675675675674, "loss": 0.4585, "step": 1525 }, { "epoch": 0.4574854787333708, "grad_norm": 0.25808751583099365, "learning_rate": 0.00023171171171171168, "loss": 0.4667, "step": 1526 }, { "epoch": 0.45778527262507024, "grad_norm": 0.25702106952667236, "learning_rate": 0.00023166666666666667, "loss": 0.4218, "step": 1527 }, { "epoch": 0.4580850665167697, "grad_norm": 0.2685486972332001, "learning_rate": 0.0002316216216216216, "loss": 0.4765, "step": 1528 }, { "epoch": 0.45838486040846915, "grad_norm": 0.25075605511665344, "learning_rate": 0.00023157657657657654, "loss": 0.4795, "step": 1529 }, { "epoch": 0.45868465430016864, "grad_norm": 0.25849252939224243, "learning_rate": 0.00023153153153153153, "loss": 0.4362, "step": 1530 }, { "epoch": 0.45898444819186807, "grad_norm": 0.25761592388153076, "learning_rate": 0.00023148648648648647, "loss": 0.4791, "step": 1531 }, { "epoch": 0.45928424208356755, "grad_norm": 0.228532075881958, "learning_rate": 0.0002314414414414414, "loss": 0.4258, "step": 1532 }, { "epoch": 0.459584035975267, "grad_norm": 0.24463020265102386, "learning_rate": 0.0002313963963963964, "loss": 0.4199, "step": 1533 }, { "epoch": 0.45988382986696646, "grad_norm": 0.26668593287467957, "learning_rate": 0.00023135135135135133, "loss": 0.4686, "step": 1534 }, { "epoch": 0.4601836237586659, "grad_norm": 0.24953673779964447, "learning_rate": 0.00023130630630630627, "loss": 0.4433, "step": 1535 }, { "epoch": 0.4604834176503654, "grad_norm": 0.2565534710884094, "learning_rate": 0.00023126126126126126, "loss": 0.4638, "step": 1536 }, { "epoch": 0.4607832115420648, "grad_norm": 0.241172194480896, "learning_rate": 0.0002312162162162162, "loss": 0.4065, "step": 1537 }, { "epoch": 0.4610830054337643, "grad_norm": 0.2695203125476837, "learning_rate": 0.00023117117117117114, "loss": 0.4782, "step": 1538 }, { "epoch": 0.4613827993254637, "grad_norm": 0.25559231638908386, "learning_rate": 0.00023112612612612613, "loss": 0.425, "step": 1539 }, { "epoch": 0.4616825932171632, "grad_norm": 0.2544387876987457, "learning_rate": 0.00023108108108108106, "loss": 0.451, "step": 1540 }, { "epoch": 0.46198238710886264, "grad_norm": 0.27124300599098206, "learning_rate": 0.000231036036036036, "loss": 0.4492, "step": 1541 }, { "epoch": 0.4622821810005621, "grad_norm": 0.2581422030925751, "learning_rate": 0.000230990990990991, "loss": 0.4317, "step": 1542 }, { "epoch": 0.46258197489226155, "grad_norm": 0.2657614052295685, "learning_rate": 0.00023094594594594593, "loss": 0.4308, "step": 1543 }, { "epoch": 0.46288176878396103, "grad_norm": 0.25568437576293945, "learning_rate": 0.00023090090090090087, "loss": 0.4808, "step": 1544 }, { "epoch": 0.46318156267566046, "grad_norm": 0.26639649271965027, "learning_rate": 0.00023085585585585583, "loss": 0.4808, "step": 1545 }, { "epoch": 0.46348135656735995, "grad_norm": 0.23767255246639252, "learning_rate": 0.0002308108108108108, "loss": 0.4382, "step": 1546 }, { "epoch": 0.4637811504590594, "grad_norm": 0.22875267267227173, "learning_rate": 0.00023076576576576573, "loss": 0.4132, "step": 1547 }, { "epoch": 0.46408094435075886, "grad_norm": 0.27917149662971497, "learning_rate": 0.0002307207207207207, "loss": 0.4899, "step": 1548 }, { "epoch": 0.4643807382424583, "grad_norm": 0.24044126272201538, "learning_rate": 0.00023067567567567566, "loss": 0.4481, "step": 1549 }, { "epoch": 0.4646805321341578, "grad_norm": 0.2577783465385437, "learning_rate": 0.00023063063063063062, "loss": 0.4403, "step": 1550 }, { "epoch": 0.4649803260258572, "grad_norm": 0.2589547038078308, "learning_rate": 0.00023058558558558556, "loss": 0.438, "step": 1551 }, { "epoch": 0.4652801199175567, "grad_norm": 0.22958579659461975, "learning_rate": 0.00023054054054054052, "loss": 0.4374, "step": 1552 }, { "epoch": 0.4655799138092561, "grad_norm": 0.28952687978744507, "learning_rate": 0.00023049549549549549, "loss": 0.4463, "step": 1553 }, { "epoch": 0.4658797077009556, "grad_norm": 0.2680447995662689, "learning_rate": 0.00023045045045045042, "loss": 0.4758, "step": 1554 }, { "epoch": 0.46617950159265503, "grad_norm": 0.23771092295646667, "learning_rate": 0.00023040540540540539, "loss": 0.4322, "step": 1555 }, { "epoch": 0.4664792954843545, "grad_norm": 0.2691210210323334, "learning_rate": 0.00023036036036036035, "loss": 0.4739, "step": 1556 }, { "epoch": 0.46677908937605395, "grad_norm": 0.24547810852527618, "learning_rate": 0.0002303153153153153, "loss": 0.4507, "step": 1557 }, { "epoch": 0.46707888326775343, "grad_norm": 0.2669890224933624, "learning_rate": 0.00023027027027027025, "loss": 0.4809, "step": 1558 }, { "epoch": 0.46737867715945286, "grad_norm": 0.25527825951576233, "learning_rate": 0.00023022522522522521, "loss": 0.4558, "step": 1559 }, { "epoch": 0.46767847105115234, "grad_norm": 0.23491926491260529, "learning_rate": 0.00023018018018018015, "loss": 0.4594, "step": 1560 }, { "epoch": 0.4679782649428518, "grad_norm": 0.27634891867637634, "learning_rate": 0.00023013513513513514, "loss": 0.4503, "step": 1561 }, { "epoch": 0.46827805883455126, "grad_norm": 0.2656886577606201, "learning_rate": 0.00023009009009009008, "loss": 0.4674, "step": 1562 }, { "epoch": 0.4685778527262507, "grad_norm": 0.23933476209640503, "learning_rate": 0.00023004504504504502, "loss": 0.4124, "step": 1563 }, { "epoch": 0.4688776466179502, "grad_norm": 0.2596864700317383, "learning_rate": 0.00023, "loss": 0.4288, "step": 1564 }, { "epoch": 0.4691774405096496, "grad_norm": 0.25186148285865784, "learning_rate": 0.00022995495495495494, "loss": 0.4571, "step": 1565 }, { "epoch": 0.4694772344013491, "grad_norm": 0.28007790446281433, "learning_rate": 0.00022990990990990988, "loss": 0.4785, "step": 1566 }, { "epoch": 0.4697770282930485, "grad_norm": 0.26225724816322327, "learning_rate": 0.00022986486486486482, "loss": 0.4378, "step": 1567 }, { "epoch": 0.470076822184748, "grad_norm": 0.24554051458835602, "learning_rate": 0.0002298198198198198, "loss": 0.4655, "step": 1568 }, { "epoch": 0.47037661607644743, "grad_norm": 0.24976593255996704, "learning_rate": 0.00022977477477477475, "loss": 0.4356, "step": 1569 }, { "epoch": 0.4706764099681469, "grad_norm": 0.23914846777915955, "learning_rate": 0.00022972972972972968, "loss": 0.4239, "step": 1570 }, { "epoch": 0.47097620385984634, "grad_norm": 0.24698884785175323, "learning_rate": 0.00022968468468468467, "loss": 0.4607, "step": 1571 }, { "epoch": 0.4712759977515458, "grad_norm": 0.24240310490131378, "learning_rate": 0.0002296396396396396, "loss": 0.4218, "step": 1572 }, { "epoch": 0.47157579164324526, "grad_norm": 0.24838680028915405, "learning_rate": 0.00022959459459459457, "loss": 0.4387, "step": 1573 }, { "epoch": 0.47187558553494474, "grad_norm": 0.25536447763442993, "learning_rate": 0.00022954954954954954, "loss": 0.4529, "step": 1574 }, { "epoch": 0.47217537942664417, "grad_norm": 0.24535490572452545, "learning_rate": 0.00022950450450450447, "loss": 0.4505, "step": 1575 }, { "epoch": 0.47247517331834366, "grad_norm": 0.258878618478775, "learning_rate": 0.00022945945945945944, "loss": 0.4794, "step": 1576 }, { "epoch": 0.4727749672100431, "grad_norm": 0.23862193524837494, "learning_rate": 0.0002294144144144144, "loss": 0.4555, "step": 1577 }, { "epoch": 0.47307476110174257, "grad_norm": 0.2369290292263031, "learning_rate": 0.00022936936936936934, "loss": 0.4111, "step": 1578 }, { "epoch": 0.473374554993442, "grad_norm": 0.2591108977794647, "learning_rate": 0.0002293243243243243, "loss": 0.4738, "step": 1579 }, { "epoch": 0.4736743488851415, "grad_norm": 0.2639445662498474, "learning_rate": 0.00022927927927927927, "loss": 0.489, "step": 1580 }, { "epoch": 0.4739741427768409, "grad_norm": 0.2452382892370224, "learning_rate": 0.0002292342342342342, "loss": 0.4499, "step": 1581 }, { "epoch": 0.4742739366685404, "grad_norm": 0.24414241313934326, "learning_rate": 0.00022918918918918917, "loss": 0.4246, "step": 1582 }, { "epoch": 0.4745737305602398, "grad_norm": 0.24609197676181793, "learning_rate": 0.00022914414414414413, "loss": 0.4615, "step": 1583 }, { "epoch": 0.4748735244519393, "grad_norm": 0.2610466480255127, "learning_rate": 0.0002290990990990991, "loss": 0.4597, "step": 1584 }, { "epoch": 0.47517331834363874, "grad_norm": 0.24946355819702148, "learning_rate": 0.00022905405405405403, "loss": 0.4382, "step": 1585 }, { "epoch": 0.4754731122353382, "grad_norm": 0.24156548082828522, "learning_rate": 0.000229009009009009, "loss": 0.4421, "step": 1586 }, { "epoch": 0.47577290612703765, "grad_norm": 0.2650264799594879, "learning_rate": 0.00022896396396396396, "loss": 0.4735, "step": 1587 }, { "epoch": 0.47607270001873714, "grad_norm": 0.2677678167819977, "learning_rate": 0.0002289189189189189, "loss": 0.4581, "step": 1588 }, { "epoch": 0.47637249391043657, "grad_norm": 0.2421169877052307, "learning_rate": 0.00022887387387387386, "loss": 0.4342, "step": 1589 }, { "epoch": 0.47667228780213605, "grad_norm": 0.2284284085035324, "learning_rate": 0.00022882882882882882, "loss": 0.434, "step": 1590 }, { "epoch": 0.4769720816938355, "grad_norm": 0.235052689909935, "learning_rate": 0.00022878378378378376, "loss": 0.4439, "step": 1591 }, { "epoch": 0.47727187558553497, "grad_norm": 0.24947918951511383, "learning_rate": 0.0002287387387387387, "loss": 0.4704, "step": 1592 }, { "epoch": 0.4775716694772344, "grad_norm": 0.24523784220218658, "learning_rate": 0.0002286936936936937, "loss": 0.4688, "step": 1593 }, { "epoch": 0.4778714633689339, "grad_norm": 0.2427687793970108, "learning_rate": 0.00022864864864864862, "loss": 0.4145, "step": 1594 }, { "epoch": 0.4781712572606333, "grad_norm": 0.2589262127876282, "learning_rate": 0.00022860360360360356, "loss": 0.4602, "step": 1595 }, { "epoch": 0.4784710511523328, "grad_norm": 0.22775280475616455, "learning_rate": 0.00022855855855855855, "loss": 0.4118, "step": 1596 }, { "epoch": 0.4787708450440322, "grad_norm": 0.26483890414237976, "learning_rate": 0.0002285135135135135, "loss": 0.4342, "step": 1597 }, { "epoch": 0.4790706389357317, "grad_norm": 0.2529483735561371, "learning_rate": 0.00022846846846846843, "loss": 0.4474, "step": 1598 }, { "epoch": 0.47937043282743114, "grad_norm": 0.24874089658260345, "learning_rate": 0.00022842342342342342, "loss": 0.4383, "step": 1599 }, { "epoch": 0.4796702267191306, "grad_norm": 0.2583334445953369, "learning_rate": 0.00022837837837837835, "loss": 0.4406, "step": 1600 }, { "epoch": 0.47997002061083005, "grad_norm": 0.2603740990161896, "learning_rate": 0.0002283333333333333, "loss": 0.4418, "step": 1601 }, { "epoch": 0.48026981450252954, "grad_norm": 0.24915438890457153, "learning_rate": 0.00022828828828828828, "loss": 0.458, "step": 1602 }, { "epoch": 0.48056960839422896, "grad_norm": 0.2531617283821106, "learning_rate": 0.00022824324324324322, "loss": 0.4639, "step": 1603 }, { "epoch": 0.48086940228592845, "grad_norm": 0.23148907721042633, "learning_rate": 0.00022819819819819816, "loss": 0.4345, "step": 1604 }, { "epoch": 0.4811691961776279, "grad_norm": 0.26466086506843567, "learning_rate": 0.00022815315315315315, "loss": 0.4625, "step": 1605 }, { "epoch": 0.48146899006932736, "grad_norm": 0.2558290660381317, "learning_rate": 0.00022810810810810808, "loss": 0.4438, "step": 1606 }, { "epoch": 0.4817687839610268, "grad_norm": 0.2294214814901352, "learning_rate": 0.00022806306306306305, "loss": 0.4343, "step": 1607 }, { "epoch": 0.4820685778527263, "grad_norm": 0.25497883558273315, "learning_rate": 0.000228018018018018, "loss": 0.4498, "step": 1608 }, { "epoch": 0.4823683717444257, "grad_norm": 0.25641798973083496, "learning_rate": 0.00022797297297297295, "loss": 0.4077, "step": 1609 }, { "epoch": 0.4826681656361252, "grad_norm": 0.2556770145893097, "learning_rate": 0.0002279279279279279, "loss": 0.4125, "step": 1610 }, { "epoch": 0.4829679595278246, "grad_norm": 0.2528950273990631, "learning_rate": 0.00022788288288288288, "loss": 0.4759, "step": 1611 }, { "epoch": 0.4832677534195241, "grad_norm": 0.22939835488796234, "learning_rate": 0.0002278378378378378, "loss": 0.4422, "step": 1612 }, { "epoch": 0.48356754731122353, "grad_norm": 0.24314181506633759, "learning_rate": 0.00022779279279279278, "loss": 0.4358, "step": 1613 }, { "epoch": 0.483867341202923, "grad_norm": 0.2460578978061676, "learning_rate": 0.00022774774774774774, "loss": 0.4571, "step": 1614 }, { "epoch": 0.48416713509462245, "grad_norm": 0.2627396583557129, "learning_rate": 0.00022770270270270268, "loss": 0.4616, "step": 1615 }, { "epoch": 0.4844669289863219, "grad_norm": 0.23525434732437134, "learning_rate": 0.00022765765765765764, "loss": 0.4243, "step": 1616 }, { "epoch": 0.48476672287802136, "grad_norm": 0.2397356927394867, "learning_rate": 0.00022761261261261258, "loss": 0.4382, "step": 1617 }, { "epoch": 0.4850665167697208, "grad_norm": 0.2398831993341446, "learning_rate": 0.00022756756756756757, "loss": 0.4412, "step": 1618 }, { "epoch": 0.4853663106614203, "grad_norm": 0.259376585483551, "learning_rate": 0.0002275225225225225, "loss": 0.4259, "step": 1619 }, { "epoch": 0.4856661045531197, "grad_norm": 0.23204876482486725, "learning_rate": 0.00022747747747747744, "loss": 0.4068, "step": 1620 }, { "epoch": 0.4859658984448192, "grad_norm": 0.2531450688838959, "learning_rate": 0.00022743243243243243, "loss": 0.4383, "step": 1621 }, { "epoch": 0.4862656923365186, "grad_norm": 0.24866148829460144, "learning_rate": 0.00022738738738738737, "loss": 0.4856, "step": 1622 }, { "epoch": 0.4865654862282181, "grad_norm": 0.23114702105522156, "learning_rate": 0.0002273423423423423, "loss": 0.4167, "step": 1623 }, { "epoch": 0.48686528011991753, "grad_norm": 0.24857169389724731, "learning_rate": 0.0002272972972972973, "loss": 0.4678, "step": 1624 }, { "epoch": 0.487165074011617, "grad_norm": 0.24516363441944122, "learning_rate": 0.00022725225225225223, "loss": 0.4236, "step": 1625 }, { "epoch": 0.48746486790331645, "grad_norm": 0.2485557198524475, "learning_rate": 0.00022720720720720717, "loss": 0.4562, "step": 1626 }, { "epoch": 0.48776466179501593, "grad_norm": 0.2518232762813568, "learning_rate": 0.00022716216216216216, "loss": 0.4703, "step": 1627 }, { "epoch": 0.48806445568671536, "grad_norm": 0.23469193279743195, "learning_rate": 0.0002271171171171171, "loss": 0.4301, "step": 1628 }, { "epoch": 0.48836424957841484, "grad_norm": 0.25275421142578125, "learning_rate": 0.00022707207207207204, "loss": 0.4576, "step": 1629 }, { "epoch": 0.4886640434701143, "grad_norm": 0.25434768199920654, "learning_rate": 0.00022702702702702703, "loss": 0.4654, "step": 1630 }, { "epoch": 0.48896383736181376, "grad_norm": 0.24483707547187805, "learning_rate": 0.00022698198198198196, "loss": 0.45, "step": 1631 }, { "epoch": 0.4892636312535132, "grad_norm": 0.24300119280815125, "learning_rate": 0.0002269369369369369, "loss": 0.4795, "step": 1632 }, { "epoch": 0.48956342514521267, "grad_norm": 0.23230652511119843, "learning_rate": 0.0002268918918918919, "loss": 0.4318, "step": 1633 }, { "epoch": 0.4898632190369121, "grad_norm": 0.2618871331214905, "learning_rate": 0.00022684684684684683, "loss": 0.4846, "step": 1634 }, { "epoch": 0.4901630129286116, "grad_norm": 0.2477121651172638, "learning_rate": 0.00022680180180180176, "loss": 0.4392, "step": 1635 }, { "epoch": 0.490462806820311, "grad_norm": 0.2603763937950134, "learning_rate": 0.00022675675675675676, "loss": 0.4875, "step": 1636 }, { "epoch": 0.4907626007120105, "grad_norm": 0.2470758855342865, "learning_rate": 0.0002267117117117117, "loss": 0.4575, "step": 1637 }, { "epoch": 0.49106239460370993, "grad_norm": 0.2612755000591278, "learning_rate": 0.00022666666666666663, "loss": 0.4652, "step": 1638 }, { "epoch": 0.4913621884954094, "grad_norm": 0.2424866259098053, "learning_rate": 0.0002266216216216216, "loss": 0.4346, "step": 1639 }, { "epoch": 0.49166198238710884, "grad_norm": 0.2529725730419159, "learning_rate": 0.00022657657657657656, "loss": 0.4427, "step": 1640 }, { "epoch": 0.4919617762788083, "grad_norm": 0.2564398944377899, "learning_rate": 0.00022653153153153152, "loss": 0.4641, "step": 1641 }, { "epoch": 0.49226157017050776, "grad_norm": 0.2599097490310669, "learning_rate": 0.00022648648648648646, "loss": 0.4564, "step": 1642 }, { "epoch": 0.49256136406220724, "grad_norm": 0.2355022430419922, "learning_rate": 0.00022644144144144142, "loss": 0.4313, "step": 1643 }, { "epoch": 0.49286115795390667, "grad_norm": 0.24770522117614746, "learning_rate": 0.00022639639639639638, "loss": 0.4468, "step": 1644 }, { "epoch": 0.49316095184560615, "grad_norm": 0.2558223009109497, "learning_rate": 0.00022635135135135132, "loss": 0.4506, "step": 1645 }, { "epoch": 0.4934607457373056, "grad_norm": 0.24741050601005554, "learning_rate": 0.00022630630630630629, "loss": 0.4375, "step": 1646 }, { "epoch": 0.49376053962900507, "grad_norm": 0.24249225854873657, "learning_rate": 0.00022626126126126125, "loss": 0.4283, "step": 1647 }, { "epoch": 0.4940603335207045, "grad_norm": 0.2792401611804962, "learning_rate": 0.00022621621621621619, "loss": 0.4757, "step": 1648 }, { "epoch": 0.494360127412404, "grad_norm": 0.23249030113220215, "learning_rate": 0.00022617117117117115, "loss": 0.4417, "step": 1649 }, { "epoch": 0.4946599213041034, "grad_norm": 0.2646411955356598, "learning_rate": 0.00022612612612612611, "loss": 0.4338, "step": 1650 }, { "epoch": 0.4949597151958029, "grad_norm": 0.2633950710296631, "learning_rate": 0.00022608108108108105, "loss": 0.476, "step": 1651 }, { "epoch": 0.4952595090875023, "grad_norm": 0.24183906614780426, "learning_rate": 0.00022603603603603601, "loss": 0.4363, "step": 1652 }, { "epoch": 0.4955593029792018, "grad_norm": 0.26413261890411377, "learning_rate": 0.00022599099099099098, "loss": 0.4387, "step": 1653 }, { "epoch": 0.49585909687090124, "grad_norm": 0.2538875937461853, "learning_rate": 0.00022594594594594592, "loss": 0.4284, "step": 1654 }, { "epoch": 0.4961588907626007, "grad_norm": 0.2566417157649994, "learning_rate": 0.0002259009009009009, "loss": 0.4352, "step": 1655 }, { "epoch": 0.49645868465430015, "grad_norm": 0.2526787519454956, "learning_rate": 0.00022585585585585584, "loss": 0.4567, "step": 1656 }, { "epoch": 0.49675847854599964, "grad_norm": 0.2506735324859619, "learning_rate": 0.00022581081081081078, "loss": 0.4346, "step": 1657 }, { "epoch": 0.49705827243769907, "grad_norm": 0.25367823243141174, "learning_rate": 0.00022576576576576577, "loss": 0.4392, "step": 1658 }, { "epoch": 0.49735806632939855, "grad_norm": 0.24888494610786438, "learning_rate": 0.0002257207207207207, "loss": 0.4532, "step": 1659 }, { "epoch": 0.497657860221098, "grad_norm": 0.22846034169197083, "learning_rate": 0.00022567567567567564, "loss": 0.4163, "step": 1660 }, { "epoch": 0.49795765411279747, "grad_norm": 0.23993121087551117, "learning_rate": 0.00022563063063063064, "loss": 0.4385, "step": 1661 }, { "epoch": 0.4982574480044969, "grad_norm": 0.2544318437576294, "learning_rate": 0.00022558558558558557, "loss": 0.4058, "step": 1662 }, { "epoch": 0.4985572418961964, "grad_norm": 0.2419573813676834, "learning_rate": 0.0002255405405405405, "loss": 0.4428, "step": 1663 }, { "epoch": 0.4988570357878958, "grad_norm": 0.23838767409324646, "learning_rate": 0.00022549549549549547, "loss": 0.4293, "step": 1664 }, { "epoch": 0.4991568296795953, "grad_norm": 0.23000121116638184, "learning_rate": 0.00022545045045045044, "loss": 0.436, "step": 1665 }, { "epoch": 0.4994566235712947, "grad_norm": 0.2665446102619171, "learning_rate": 0.00022540540540540537, "loss": 0.4841, "step": 1666 }, { "epoch": 0.4997564174629942, "grad_norm": 0.25025802850723267, "learning_rate": 0.00022536036036036034, "loss": 0.4357, "step": 1667 }, { "epoch": 0.5000562113546937, "grad_norm": 0.2441510409116745, "learning_rate": 0.0002253153153153153, "loss": 0.4345, "step": 1668 }, { "epoch": 0.5003560052463931, "grad_norm": 0.24143864214420319, "learning_rate": 0.00022527027027027024, "loss": 0.4416, "step": 1669 }, { "epoch": 0.5006557991380925, "grad_norm": 0.246190145611763, "learning_rate": 0.0002252252252252252, "loss": 0.4124, "step": 1670 }, { "epoch": 0.500955593029792, "grad_norm": 0.2695963382720947, "learning_rate": 0.00022518018018018017, "loss": 0.4683, "step": 1671 }, { "epoch": 0.5012553869214915, "grad_norm": 0.23124708235263824, "learning_rate": 0.0002251351351351351, "loss": 0.4347, "step": 1672 }, { "epoch": 0.501555180813191, "grad_norm": 0.25153648853302, "learning_rate": 0.00022509009009009007, "loss": 0.4408, "step": 1673 }, { "epoch": 0.5018549747048904, "grad_norm": 0.2653743028640747, "learning_rate": 0.00022504504504504503, "loss": 0.4482, "step": 1674 }, { "epoch": 0.5021547685965898, "grad_norm": 0.24365836381912231, "learning_rate": 0.000225, "loss": 0.4225, "step": 1675 }, { "epoch": 0.5024545624882893, "grad_norm": 0.2595864236354828, "learning_rate": 0.00022495495495495493, "loss": 0.4624, "step": 1676 }, { "epoch": 0.5027543563799888, "grad_norm": 0.24555698037147522, "learning_rate": 0.0002249099099099099, "loss": 0.4495, "step": 1677 }, { "epoch": 0.5030541502716882, "grad_norm": 0.25951844453811646, "learning_rate": 0.00022486486486486486, "loss": 0.4503, "step": 1678 }, { "epoch": 0.5033539441633876, "grad_norm": 0.28872594237327576, "learning_rate": 0.0002248198198198198, "loss": 0.4916, "step": 1679 }, { "epoch": 0.5036537380550872, "grad_norm": 0.24827940762043, "learning_rate": 0.00022477477477477476, "loss": 0.4024, "step": 1680 }, { "epoch": 0.5039535319467866, "grad_norm": 0.2625780999660492, "learning_rate": 0.00022472972972972972, "loss": 0.4624, "step": 1681 }, { "epoch": 0.504253325838486, "grad_norm": 0.26651719212532043, "learning_rate": 0.00022468468468468466, "loss": 0.4526, "step": 1682 }, { "epoch": 0.5045531197301855, "grad_norm": 0.2538798153400421, "learning_rate": 0.00022463963963963962, "loss": 0.4328, "step": 1683 }, { "epoch": 0.504852913621885, "grad_norm": 0.25730088353157043, "learning_rate": 0.0002245945945945946, "loss": 0.4642, "step": 1684 }, { "epoch": 0.5051527075135844, "grad_norm": 0.24593298137187958, "learning_rate": 0.00022454954954954952, "loss": 0.4422, "step": 1685 }, { "epoch": 0.5054525014052839, "grad_norm": 0.25883376598358154, "learning_rate": 0.0002245045045045045, "loss": 0.4439, "step": 1686 }, { "epoch": 0.5057522952969833, "grad_norm": 0.2680940330028534, "learning_rate": 0.00022445945945945945, "loss": 0.4459, "step": 1687 }, { "epoch": 0.5060520891886828, "grad_norm": 0.23688143491744995, "learning_rate": 0.0002244144144144144, "loss": 0.4404, "step": 1688 }, { "epoch": 0.5063518830803823, "grad_norm": 0.24362222850322723, "learning_rate": 0.00022436936936936933, "loss": 0.473, "step": 1689 }, { "epoch": 0.5066516769720817, "grad_norm": 0.24516165256500244, "learning_rate": 0.00022432432432432432, "loss": 0.4288, "step": 1690 }, { "epoch": 0.5069514708637811, "grad_norm": 0.23739798367023468, "learning_rate": 0.00022427927927927925, "loss": 0.4231, "step": 1691 }, { "epoch": 0.5072512647554807, "grad_norm": 0.25751793384552, "learning_rate": 0.0002242342342342342, "loss": 0.4406, "step": 1692 }, { "epoch": 0.5075510586471801, "grad_norm": 0.23062650859355927, "learning_rate": 0.00022418918918918918, "loss": 0.4273, "step": 1693 }, { "epoch": 0.5078508525388795, "grad_norm": 0.2453629970550537, "learning_rate": 0.00022414414414414412, "loss": 0.4526, "step": 1694 }, { "epoch": 0.508150646430579, "grad_norm": 0.26255685091018677, "learning_rate": 0.00022409909909909905, "loss": 0.461, "step": 1695 }, { "epoch": 0.5084504403222785, "grad_norm": 0.2384280562400818, "learning_rate": 0.00022405405405405405, "loss": 0.4333, "step": 1696 }, { "epoch": 0.5087502342139779, "grad_norm": 0.24055825173854828, "learning_rate": 0.00022400900900900898, "loss": 0.4472, "step": 1697 }, { "epoch": 0.5090500281056773, "grad_norm": 0.24356570839881897, "learning_rate": 0.00022396396396396395, "loss": 0.4159, "step": 1698 }, { "epoch": 0.5093498219973768, "grad_norm": 0.3042013645172119, "learning_rate": 0.0002239189189189189, "loss": 0.474, "step": 1699 }, { "epoch": 0.5096496158890763, "grad_norm": 0.2500532567501068, "learning_rate": 0.00022387387387387385, "loss": 0.4223, "step": 1700 }, { "epoch": 0.5099494097807757, "grad_norm": 0.25324761867523193, "learning_rate": 0.0002238288288288288, "loss": 0.452, "step": 1701 }, { "epoch": 0.5102492036724752, "grad_norm": 0.26007258892059326, "learning_rate": 0.00022378378378378377, "loss": 0.4516, "step": 1702 }, { "epoch": 0.5105489975641746, "grad_norm": 0.2657194435596466, "learning_rate": 0.0002237387387387387, "loss": 0.4663, "step": 1703 }, { "epoch": 0.5108487914558741, "grad_norm": 0.28216373920440674, "learning_rate": 0.00022369369369369368, "loss": 0.4341, "step": 1704 }, { "epoch": 0.5111485853475736, "grad_norm": 0.2571386992931366, "learning_rate": 0.00022364864864864864, "loss": 0.4532, "step": 1705 }, { "epoch": 0.511448379239273, "grad_norm": 0.27189430594444275, "learning_rate": 0.00022360360360360358, "loss": 0.4593, "step": 1706 }, { "epoch": 0.5117481731309724, "grad_norm": 0.2536429166793823, "learning_rate": 0.00022355855855855854, "loss": 0.4603, "step": 1707 }, { "epoch": 0.512047967022672, "grad_norm": 0.2552615702152252, "learning_rate": 0.0002235135135135135, "loss": 0.4265, "step": 1708 }, { "epoch": 0.5123477609143714, "grad_norm": 0.24926409125328064, "learning_rate": 0.00022346846846846844, "loss": 0.444, "step": 1709 }, { "epoch": 0.5126475548060708, "grad_norm": 0.27449071407318115, "learning_rate": 0.0002234234234234234, "loss": 0.4605, "step": 1710 }, { "epoch": 0.5129473486977703, "grad_norm": 0.255071222782135, "learning_rate": 0.00022337837837837837, "loss": 0.4409, "step": 1711 }, { "epoch": 0.5132471425894697, "grad_norm": 0.2560432553291321, "learning_rate": 0.00022333333333333333, "loss": 0.467, "step": 1712 }, { "epoch": 0.5135469364811692, "grad_norm": 0.2458151876926422, "learning_rate": 0.00022328828828828827, "loss": 0.4626, "step": 1713 }, { "epoch": 0.5138467303728687, "grad_norm": 0.2361784130334854, "learning_rate": 0.0002232432432432432, "loss": 0.4305, "step": 1714 }, { "epoch": 0.5141465242645681, "grad_norm": 0.25834766030311584, "learning_rate": 0.0002231981981981982, "loss": 0.4569, "step": 1715 }, { "epoch": 0.5144463181562675, "grad_norm": 0.253662109375, "learning_rate": 0.00022315315315315313, "loss": 0.4289, "step": 1716 }, { "epoch": 0.514746112047967, "grad_norm": 0.2442713975906372, "learning_rate": 0.00022310810810810807, "loss": 0.4356, "step": 1717 }, { "epoch": 0.5150459059396665, "grad_norm": 0.25813472270965576, "learning_rate": 0.00022306306306306306, "loss": 0.4721, "step": 1718 }, { "epoch": 0.5153456998313659, "grad_norm": 0.269927054643631, "learning_rate": 0.000223018018018018, "loss": 0.4603, "step": 1719 }, { "epoch": 0.5156454937230653, "grad_norm": 0.25108182430267334, "learning_rate": 0.00022297297297297293, "loss": 0.4611, "step": 1720 }, { "epoch": 0.5159452876147649, "grad_norm": 0.23553571105003357, "learning_rate": 0.00022292792792792793, "loss": 0.431, "step": 1721 }, { "epoch": 0.5162450815064643, "grad_norm": 0.2411264032125473, "learning_rate": 0.00022288288288288286, "loss": 0.4405, "step": 1722 }, { "epoch": 0.5165448753981637, "grad_norm": 0.24999505281448364, "learning_rate": 0.0002228378378378378, "loss": 0.4487, "step": 1723 }, { "epoch": 0.5168446692898632, "grad_norm": 0.23619996011257172, "learning_rate": 0.0002227927927927928, "loss": 0.4399, "step": 1724 }, { "epoch": 0.5171444631815627, "grad_norm": 0.23623579740524292, "learning_rate": 0.00022274774774774773, "loss": 0.4358, "step": 1725 }, { "epoch": 0.5174442570732621, "grad_norm": 0.2538294494152069, "learning_rate": 0.00022270270270270266, "loss": 0.4715, "step": 1726 }, { "epoch": 0.5177440509649616, "grad_norm": 0.23572376370429993, "learning_rate": 0.00022265765765765765, "loss": 0.4529, "step": 1727 }, { "epoch": 0.518043844856661, "grad_norm": 0.25421515107154846, "learning_rate": 0.0002226126126126126, "loss": 0.4427, "step": 1728 }, { "epoch": 0.5183436387483605, "grad_norm": 0.23936650156974792, "learning_rate": 0.00022256756756756753, "loss": 0.4416, "step": 1729 }, { "epoch": 0.51864343264006, "grad_norm": 0.26791784167289734, "learning_rate": 0.00022252252252252252, "loss": 0.4697, "step": 1730 }, { "epoch": 0.5189432265317594, "grad_norm": 0.2541804015636444, "learning_rate": 0.00022247747747747746, "loss": 0.4801, "step": 1731 }, { "epoch": 0.5192430204234588, "grad_norm": 0.24337895214557648, "learning_rate": 0.0002224324324324324, "loss": 0.4316, "step": 1732 }, { "epoch": 0.5195428143151584, "grad_norm": 0.2545047998428345, "learning_rate": 0.00022238738738738738, "loss": 0.4465, "step": 1733 }, { "epoch": 0.5198426082068578, "grad_norm": 0.24640010297298431, "learning_rate": 0.00022234234234234232, "loss": 0.4574, "step": 1734 }, { "epoch": 0.5201424020985572, "grad_norm": 0.2528793215751648, "learning_rate": 0.00022229729729729728, "loss": 0.4423, "step": 1735 }, { "epoch": 0.5204421959902567, "grad_norm": 0.24697841703891754, "learning_rate": 0.00022225225225225222, "loss": 0.4259, "step": 1736 }, { "epoch": 0.5207419898819562, "grad_norm": 0.24195986986160278, "learning_rate": 0.00022220720720720718, "loss": 0.439, "step": 1737 }, { "epoch": 0.5210417837736556, "grad_norm": 0.2523336410522461, "learning_rate": 0.00022216216216216215, "loss": 0.4441, "step": 1738 }, { "epoch": 0.521341577665355, "grad_norm": 0.22917968034744263, "learning_rate": 0.00022211711711711709, "loss": 0.4152, "step": 1739 }, { "epoch": 0.5216413715570545, "grad_norm": 0.2633557915687561, "learning_rate": 0.00022207207207207205, "loss": 0.4593, "step": 1740 }, { "epoch": 0.521941165448754, "grad_norm": 0.23756282031536102, "learning_rate": 0.000222027027027027, "loss": 0.4244, "step": 1741 }, { "epoch": 0.5222409593404534, "grad_norm": 0.24200837314128876, "learning_rate": 0.00022198198198198195, "loss": 0.4056, "step": 1742 }, { "epoch": 0.5225407532321529, "grad_norm": 0.2658364474773407, "learning_rate": 0.00022193693693693691, "loss": 0.4432, "step": 1743 }, { "epoch": 0.5228405471238523, "grad_norm": 0.2403399795293808, "learning_rate": 0.00022189189189189188, "loss": 0.4395, "step": 1744 }, { "epoch": 0.5231403410155518, "grad_norm": 0.23752082884311676, "learning_rate": 0.00022184684684684681, "loss": 0.4284, "step": 1745 }, { "epoch": 0.5234401349072513, "grad_norm": 0.2684466540813446, "learning_rate": 0.0002218018018018018, "loss": 0.4266, "step": 1746 }, { "epoch": 0.5237399287989507, "grad_norm": 0.24474304914474487, "learning_rate": 0.00022175675675675674, "loss": 0.4472, "step": 1747 }, { "epoch": 0.5240397226906501, "grad_norm": 0.24958543479442596, "learning_rate": 0.00022171171171171168, "loss": 0.4253, "step": 1748 }, { "epoch": 0.5243395165823497, "grad_norm": 0.25974011421203613, "learning_rate": 0.00022166666666666667, "loss": 0.4477, "step": 1749 }, { "epoch": 0.5246393104740491, "grad_norm": 0.23762384057044983, "learning_rate": 0.0002216216216216216, "loss": 0.425, "step": 1750 }, { "epoch": 0.5249391043657485, "grad_norm": 0.2543089687824249, "learning_rate": 0.00022157657657657654, "loss": 0.3947, "step": 1751 }, { "epoch": 0.525238898257448, "grad_norm": 0.26260843873023987, "learning_rate": 0.00022153153153153153, "loss": 0.4642, "step": 1752 }, { "epoch": 0.5255386921491475, "grad_norm": 0.2474740445613861, "learning_rate": 0.00022148648648648647, "loss": 0.4468, "step": 1753 }, { "epoch": 0.5258384860408469, "grad_norm": 0.24307985603809357, "learning_rate": 0.0002214414414414414, "loss": 0.4537, "step": 1754 }, { "epoch": 0.5261382799325464, "grad_norm": 0.27346494793891907, "learning_rate": 0.0002213963963963964, "loss": 0.4789, "step": 1755 }, { "epoch": 0.5264380738242458, "grad_norm": 0.2407042384147644, "learning_rate": 0.00022135135135135134, "loss": 0.4334, "step": 1756 }, { "epoch": 0.5267378677159453, "grad_norm": 0.2311742603778839, "learning_rate": 0.00022130630630630627, "loss": 0.4117, "step": 1757 }, { "epoch": 0.5270376616076448, "grad_norm": 0.23060280084609985, "learning_rate": 0.00022126126126126126, "loss": 0.4413, "step": 1758 }, { "epoch": 0.5273374554993442, "grad_norm": 0.286851167678833, "learning_rate": 0.0002212162162162162, "loss": 0.461, "step": 1759 }, { "epoch": 0.5276372493910436, "grad_norm": 0.23764149844646454, "learning_rate": 0.00022117117117117114, "loss": 0.4428, "step": 1760 }, { "epoch": 0.5279370432827432, "grad_norm": 0.24021115899085999, "learning_rate": 0.0002211261261261261, "loss": 0.4416, "step": 1761 }, { "epoch": 0.5282368371744426, "grad_norm": 0.25310957431793213, "learning_rate": 0.00022108108108108106, "loss": 0.4474, "step": 1762 }, { "epoch": 0.528536631066142, "grad_norm": 0.2636144161224365, "learning_rate": 0.000221036036036036, "loss": 0.4629, "step": 1763 }, { "epoch": 0.5288364249578414, "grad_norm": 0.254807710647583, "learning_rate": 0.00022099099099099097, "loss": 0.4558, "step": 1764 }, { "epoch": 0.529136218849541, "grad_norm": 0.2389029860496521, "learning_rate": 0.00022094594594594593, "loss": 0.4519, "step": 1765 }, { "epoch": 0.5294360127412404, "grad_norm": 0.24269163608551025, "learning_rate": 0.00022090090090090087, "loss": 0.4371, "step": 1766 }, { "epoch": 0.5297358066329398, "grad_norm": 0.2602348029613495, "learning_rate": 0.00022085585585585583, "loss": 0.4474, "step": 1767 }, { "epoch": 0.5300356005246393, "grad_norm": 0.2557549774646759, "learning_rate": 0.0002208108108108108, "loss": 0.4709, "step": 1768 }, { "epoch": 0.5303353944163388, "grad_norm": 0.27289271354675293, "learning_rate": 0.00022076576576576576, "loss": 0.4248, "step": 1769 }, { "epoch": 0.5306351883080382, "grad_norm": 0.27305862307548523, "learning_rate": 0.0002207207207207207, "loss": 0.4566, "step": 1770 }, { "epoch": 0.5309349821997377, "grad_norm": 0.2508034408092499, "learning_rate": 0.00022067567567567566, "loss": 0.4334, "step": 1771 }, { "epoch": 0.5312347760914371, "grad_norm": 0.24729134142398834, "learning_rate": 0.00022063063063063062, "loss": 0.4251, "step": 1772 }, { "epoch": 0.5315345699831366, "grad_norm": 0.23145246505737305, "learning_rate": 0.00022058558558558556, "loss": 0.3992, "step": 1773 }, { "epoch": 0.5318343638748361, "grad_norm": 0.24190931022167206, "learning_rate": 0.00022054054054054052, "loss": 0.4183, "step": 1774 }, { "epoch": 0.5321341577665355, "grad_norm": 0.2671104073524475, "learning_rate": 0.0002204954954954955, "loss": 0.4548, "step": 1775 }, { "epoch": 0.5324339516582349, "grad_norm": 0.2361837476491928, "learning_rate": 0.00022045045045045042, "loss": 0.414, "step": 1776 }, { "epoch": 0.5327337455499345, "grad_norm": 0.25529375672340393, "learning_rate": 0.0002204054054054054, "loss": 0.4345, "step": 1777 }, { "epoch": 0.5330335394416339, "grad_norm": 0.26258841156959534, "learning_rate": 0.00022036036036036035, "loss": 0.4522, "step": 1778 }, { "epoch": 0.5333333333333333, "grad_norm": 0.24532164633274078, "learning_rate": 0.0002203153153153153, "loss": 0.4303, "step": 1779 }, { "epoch": 0.5336331272250328, "grad_norm": 0.24562229216098785, "learning_rate": 0.00022027027027027028, "loss": 0.427, "step": 1780 }, { "epoch": 0.5339329211167323, "grad_norm": 0.25493496656417847, "learning_rate": 0.00022022522522522522, "loss": 0.45, "step": 1781 }, { "epoch": 0.5342327150084317, "grad_norm": 0.25944793224334717, "learning_rate": 0.00022018018018018015, "loss": 0.449, "step": 1782 }, { "epoch": 0.5345325089001312, "grad_norm": 0.24805709719657898, "learning_rate": 0.00022013513513513514, "loss": 0.4431, "step": 1783 }, { "epoch": 0.5348323027918306, "grad_norm": 0.27083903551101685, "learning_rate": 0.00022009009009009008, "loss": 0.465, "step": 1784 }, { "epoch": 0.5351320966835301, "grad_norm": 0.2876584827899933, "learning_rate": 0.00022004504504504502, "loss": 0.4859, "step": 1785 }, { "epoch": 0.5354318905752296, "grad_norm": 0.2499544471502304, "learning_rate": 0.00021999999999999995, "loss": 0.4551, "step": 1786 }, { "epoch": 0.535731684466929, "grad_norm": 0.2812490165233612, "learning_rate": 0.00021995495495495494, "loss": 0.4554, "step": 1787 }, { "epoch": 0.5360314783586284, "grad_norm": 0.2530219256877899, "learning_rate": 0.00021990990990990988, "loss": 0.4335, "step": 1788 }, { "epoch": 0.536331272250328, "grad_norm": 0.24573327600955963, "learning_rate": 0.00021986486486486482, "loss": 0.4574, "step": 1789 }, { "epoch": 0.5366310661420274, "grad_norm": 0.3053630292415619, "learning_rate": 0.0002198198198198198, "loss": 0.4745, "step": 1790 }, { "epoch": 0.5369308600337268, "grad_norm": 0.24176299571990967, "learning_rate": 0.00021977477477477475, "loss": 0.4244, "step": 1791 }, { "epoch": 0.5372306539254262, "grad_norm": 0.27554550766944885, "learning_rate": 0.0002197297297297297, "loss": 0.469, "step": 1792 }, { "epoch": 0.5375304478171258, "grad_norm": 0.28003454208374023, "learning_rate": 0.00021968468468468467, "loss": 0.4477, "step": 1793 }, { "epoch": 0.5378302417088252, "grad_norm": 0.27388301491737366, "learning_rate": 0.0002196396396396396, "loss": 0.4602, "step": 1794 }, { "epoch": 0.5381300356005246, "grad_norm": 0.2521568238735199, "learning_rate": 0.00021959459459459457, "loss": 0.4507, "step": 1795 }, { "epoch": 0.5384298294922241, "grad_norm": 0.24554435908794403, "learning_rate": 0.00021954954954954954, "loss": 0.4387, "step": 1796 }, { "epoch": 0.5387296233839236, "grad_norm": 0.24148909747600555, "learning_rate": 0.00021950450450450447, "loss": 0.4316, "step": 1797 }, { "epoch": 0.539029417275623, "grad_norm": 0.24902650713920593, "learning_rate": 0.00021945945945945944, "loss": 0.448, "step": 1798 }, { "epoch": 0.5393292111673225, "grad_norm": 0.25397536158561707, "learning_rate": 0.0002194144144144144, "loss": 0.479, "step": 1799 }, { "epoch": 0.5396290050590219, "grad_norm": 0.2324562668800354, "learning_rate": 0.00021936936936936934, "loss": 0.4077, "step": 1800 }, { "epoch": 0.5399287989507214, "grad_norm": 0.26509541273117065, "learning_rate": 0.0002193243243243243, "loss": 0.4628, "step": 1801 }, { "epoch": 0.5402285928424209, "grad_norm": 0.24714629352092743, "learning_rate": 0.00021927927927927927, "loss": 0.4393, "step": 1802 }, { "epoch": 0.5405283867341203, "grad_norm": 0.2634679079055786, "learning_rate": 0.00021923423423423423, "loss": 0.4508, "step": 1803 }, { "epoch": 0.5408281806258197, "grad_norm": 0.2673392593860626, "learning_rate": 0.00021918918918918917, "loss": 0.4587, "step": 1804 }, { "epoch": 0.5411279745175192, "grad_norm": 0.2438841462135315, "learning_rate": 0.00021914414414414413, "loss": 0.4271, "step": 1805 }, { "epoch": 0.5414277684092187, "grad_norm": 0.2463088482618332, "learning_rate": 0.0002190990990990991, "loss": 0.4216, "step": 1806 }, { "epoch": 0.5417275623009181, "grad_norm": 0.22443120181560516, "learning_rate": 0.00021905405405405403, "loss": 0.3935, "step": 1807 }, { "epoch": 0.5420273561926175, "grad_norm": 0.2647371292114258, "learning_rate": 0.00021900900900900897, "loss": 0.4474, "step": 1808 }, { "epoch": 0.542327150084317, "grad_norm": 0.2505868077278137, "learning_rate": 0.00021896396396396396, "loss": 0.4623, "step": 1809 }, { "epoch": 0.5426269439760165, "grad_norm": 0.2395186722278595, "learning_rate": 0.0002189189189189189, "loss": 0.4469, "step": 1810 }, { "epoch": 0.542926737867716, "grad_norm": 0.2345222383737564, "learning_rate": 0.00021887387387387383, "loss": 0.4127, "step": 1811 }, { "epoch": 0.5432265317594154, "grad_norm": 0.25592970848083496, "learning_rate": 0.00021882882882882882, "loss": 0.4642, "step": 1812 }, { "epoch": 0.5435263256511148, "grad_norm": 0.2604975700378418, "learning_rate": 0.00021878378378378376, "loss": 0.461, "step": 1813 }, { "epoch": 0.5438261195428143, "grad_norm": 0.2413945347070694, "learning_rate": 0.0002187387387387387, "loss": 0.4281, "step": 1814 }, { "epoch": 0.5441259134345138, "grad_norm": 0.2543574571609497, "learning_rate": 0.0002186936936936937, "loss": 0.4657, "step": 1815 }, { "epoch": 0.5444257073262132, "grad_norm": 0.2783868610858917, "learning_rate": 0.00021864864864864863, "loss": 0.4917, "step": 1816 }, { "epoch": 0.5447255012179126, "grad_norm": 0.23379693925380707, "learning_rate": 0.00021860360360360356, "loss": 0.4324, "step": 1817 }, { "epoch": 0.5450252951096122, "grad_norm": 0.22307537496089935, "learning_rate": 0.00021855855855855855, "loss": 0.4175, "step": 1818 }, { "epoch": 0.5453250890013116, "grad_norm": 0.23716312646865845, "learning_rate": 0.0002185135135135135, "loss": 0.4195, "step": 1819 }, { "epoch": 0.545624882893011, "grad_norm": 0.2469712346792221, "learning_rate": 0.00021846846846846843, "loss": 0.4386, "step": 1820 }, { "epoch": 0.5459246767847105, "grad_norm": 0.24840980768203735, "learning_rate": 0.00021842342342342342, "loss": 0.4193, "step": 1821 }, { "epoch": 0.54622447067641, "grad_norm": 0.25362369418144226, "learning_rate": 0.00021837837837837835, "loss": 0.445, "step": 1822 }, { "epoch": 0.5465242645681094, "grad_norm": 0.23812243342399597, "learning_rate": 0.0002183333333333333, "loss": 0.4511, "step": 1823 }, { "epoch": 0.5468240584598089, "grad_norm": 0.25310468673706055, "learning_rate": 0.00021828828828828828, "loss": 0.4463, "step": 1824 }, { "epoch": 0.5471238523515083, "grad_norm": 0.25943437218666077, "learning_rate": 0.00021824324324324322, "loss": 0.42, "step": 1825 }, { "epoch": 0.5474236462432078, "grad_norm": 0.270178884267807, "learning_rate": 0.00021819819819819818, "loss": 0.4666, "step": 1826 }, { "epoch": 0.5477234401349073, "grad_norm": 0.23301197588443756, "learning_rate": 0.00021815315315315315, "loss": 0.407, "step": 1827 }, { "epoch": 0.5480232340266067, "grad_norm": 0.2634783983230591, "learning_rate": 0.00021810810810810808, "loss": 0.4596, "step": 1828 }, { "epoch": 0.5483230279183061, "grad_norm": 0.24297159910202026, "learning_rate": 0.00021806306306306305, "loss": 0.4107, "step": 1829 }, { "epoch": 0.5486228218100057, "grad_norm": 0.2781638503074646, "learning_rate": 0.000218018018018018, "loss": 0.4686, "step": 1830 }, { "epoch": 0.5489226157017051, "grad_norm": 0.25801247358322144, "learning_rate": 0.00021797297297297295, "loss": 0.4644, "step": 1831 }, { "epoch": 0.5492224095934045, "grad_norm": 0.27080485224723816, "learning_rate": 0.0002179279279279279, "loss": 0.428, "step": 1832 }, { "epoch": 0.5495222034851039, "grad_norm": 0.2617061138153076, "learning_rate": 0.00021788288288288285, "loss": 0.4437, "step": 1833 }, { "epoch": 0.5498219973768035, "grad_norm": 0.26249265670776367, "learning_rate": 0.0002178378378378378, "loss": 0.4413, "step": 1834 }, { "epoch": 0.5501217912685029, "grad_norm": 0.2705504894256592, "learning_rate": 0.00021779279279279278, "loss": 0.4716, "step": 1835 }, { "epoch": 0.5504215851602023, "grad_norm": 0.25836360454559326, "learning_rate": 0.0002177477477477477, "loss": 0.454, "step": 1836 }, { "epoch": 0.5507213790519018, "grad_norm": 0.23872222006320953, "learning_rate": 0.0002177027027027027, "loss": 0.4173, "step": 1837 }, { "epoch": 0.5510211729436013, "grad_norm": 0.2438534051179886, "learning_rate": 0.00021765765765765764, "loss": 0.4114, "step": 1838 }, { "epoch": 0.5513209668353007, "grad_norm": 0.2600691318511963, "learning_rate": 0.00021761261261261258, "loss": 0.4364, "step": 1839 }, { "epoch": 0.5516207607270002, "grad_norm": 0.2483833283185959, "learning_rate": 0.00021756756756756757, "loss": 0.4544, "step": 1840 }, { "epoch": 0.5519205546186996, "grad_norm": 0.2506955564022064, "learning_rate": 0.0002175225225225225, "loss": 0.4561, "step": 1841 }, { "epoch": 0.5522203485103991, "grad_norm": 0.2800386846065521, "learning_rate": 0.00021747747747747744, "loss": 0.438, "step": 1842 }, { "epoch": 0.5525201424020986, "grad_norm": 0.2458580881357193, "learning_rate": 0.00021743243243243243, "loss": 0.4431, "step": 1843 }, { "epoch": 0.552819936293798, "grad_norm": 0.24995484948158264, "learning_rate": 0.00021738738738738737, "loss": 0.4349, "step": 1844 }, { "epoch": 0.5531197301854974, "grad_norm": 0.2502653896808624, "learning_rate": 0.0002173423423423423, "loss": 0.4366, "step": 1845 }, { "epoch": 0.553419524077197, "grad_norm": 0.2538207471370697, "learning_rate": 0.0002172972972972973, "loss": 0.4381, "step": 1846 }, { "epoch": 0.5537193179688964, "grad_norm": 0.2417684644460678, "learning_rate": 0.00021725225225225223, "loss": 0.4277, "step": 1847 }, { "epoch": 0.5540191118605958, "grad_norm": 0.2531186044216156, "learning_rate": 0.00021720720720720717, "loss": 0.4361, "step": 1848 }, { "epoch": 0.5543189057522953, "grad_norm": 0.2551022171974182, "learning_rate": 0.00021716216216216216, "loss": 0.4483, "step": 1849 }, { "epoch": 0.5546186996439948, "grad_norm": 0.23375505208969116, "learning_rate": 0.0002171171171171171, "loss": 0.4157, "step": 1850 }, { "epoch": 0.5549184935356942, "grad_norm": 0.23428985476493835, "learning_rate": 0.00021707207207207204, "loss": 0.4092, "step": 1851 }, { "epoch": 0.5552182874273937, "grad_norm": 0.24895919859409332, "learning_rate": 0.00021702702702702703, "loss": 0.4377, "step": 1852 }, { "epoch": 0.5555180813190931, "grad_norm": 0.23306754231452942, "learning_rate": 0.00021698198198198196, "loss": 0.4316, "step": 1853 }, { "epoch": 0.5558178752107926, "grad_norm": 0.2527741491794586, "learning_rate": 0.0002169369369369369, "loss": 0.4245, "step": 1854 }, { "epoch": 0.556117669102492, "grad_norm": 0.2319868952035904, "learning_rate": 0.0002168918918918919, "loss": 0.4074, "step": 1855 }, { "epoch": 0.5564174629941915, "grad_norm": 0.25132691860198975, "learning_rate": 0.00021684684684684683, "loss": 0.4378, "step": 1856 }, { "epoch": 0.5567172568858909, "grad_norm": 0.24814215302467346, "learning_rate": 0.00021680180180180177, "loss": 0.4639, "step": 1857 }, { "epoch": 0.5570170507775905, "grad_norm": 0.23483987152576447, "learning_rate": 0.00021675675675675673, "loss": 0.4419, "step": 1858 }, { "epoch": 0.5573168446692899, "grad_norm": 0.24670268595218658, "learning_rate": 0.0002167117117117117, "loss": 0.436, "step": 1859 }, { "epoch": 0.5576166385609893, "grad_norm": 0.23737584054470062, "learning_rate": 0.00021666666666666666, "loss": 0.4246, "step": 1860 }, { "epoch": 0.5579164324526887, "grad_norm": 0.2634913921356201, "learning_rate": 0.0002166216216216216, "loss": 0.4252, "step": 1861 }, { "epoch": 0.5582162263443883, "grad_norm": 0.25565293431282043, "learning_rate": 0.00021657657657657656, "loss": 0.4287, "step": 1862 }, { "epoch": 0.5585160202360877, "grad_norm": 0.2408502846956253, "learning_rate": 0.00021653153153153152, "loss": 0.4259, "step": 1863 }, { "epoch": 0.5588158141277871, "grad_norm": 0.2516701817512512, "learning_rate": 0.00021648648648648646, "loss": 0.4361, "step": 1864 }, { "epoch": 0.5591156080194866, "grad_norm": 0.2645619213581085, "learning_rate": 0.00021644144144144142, "loss": 0.4336, "step": 1865 }, { "epoch": 0.5594154019111861, "grad_norm": 0.27260729670524597, "learning_rate": 0.00021639639639639639, "loss": 0.4494, "step": 1866 }, { "epoch": 0.5597151958028855, "grad_norm": 0.2506016492843628, "learning_rate": 0.00021635135135135132, "loss": 0.4027, "step": 1867 }, { "epoch": 0.560014989694585, "grad_norm": 0.2552817463874817, "learning_rate": 0.00021630630630630629, "loss": 0.426, "step": 1868 }, { "epoch": 0.5603147835862844, "grad_norm": 0.2529433071613312, "learning_rate": 0.00021626126126126125, "loss": 0.4327, "step": 1869 }, { "epoch": 0.5606145774779839, "grad_norm": 0.2483612447977066, "learning_rate": 0.0002162162162162162, "loss": 0.4492, "step": 1870 }, { "epoch": 0.5609143713696834, "grad_norm": 0.25298991799354553, "learning_rate": 0.00021617117117117118, "loss": 0.4442, "step": 1871 }, { "epoch": 0.5612141652613828, "grad_norm": 0.243782639503479, "learning_rate": 0.00021612612612612611, "loss": 0.4462, "step": 1872 }, { "epoch": 0.5615139591530822, "grad_norm": 0.24677562713623047, "learning_rate": 0.00021608108108108105, "loss": 0.433, "step": 1873 }, { "epoch": 0.5618137530447818, "grad_norm": 0.2761828601360321, "learning_rate": 0.00021603603603603604, "loss": 0.4436, "step": 1874 }, { "epoch": 0.5621135469364812, "grad_norm": 0.24887466430664062, "learning_rate": 0.00021599099099099098, "loss": 0.4211, "step": 1875 }, { "epoch": 0.5624133408281806, "grad_norm": 0.2687954902648926, "learning_rate": 0.00021594594594594592, "loss": 0.4545, "step": 1876 }, { "epoch": 0.56271313471988, "grad_norm": 0.2376958280801773, "learning_rate": 0.0002159009009009009, "loss": 0.4062, "step": 1877 }, { "epoch": 0.5630129286115796, "grad_norm": 0.25497403740882874, "learning_rate": 0.00021585585585585584, "loss": 0.4512, "step": 1878 }, { "epoch": 0.563312722503279, "grad_norm": 0.27644020318984985, "learning_rate": 0.00021581081081081078, "loss": 0.4317, "step": 1879 }, { "epoch": 0.5636125163949784, "grad_norm": 0.274746298789978, "learning_rate": 0.00021576576576576577, "loss": 0.4523, "step": 1880 }, { "epoch": 0.5639123102866779, "grad_norm": 0.24522797763347626, "learning_rate": 0.0002157207207207207, "loss": 0.4193, "step": 1881 }, { "epoch": 0.5642121041783774, "grad_norm": 0.26614081859588623, "learning_rate": 0.00021567567567567565, "loss": 0.4474, "step": 1882 }, { "epoch": 0.5645118980700768, "grad_norm": 0.25119736790657043, "learning_rate": 0.0002156306306306306, "loss": 0.4349, "step": 1883 }, { "epoch": 0.5648116919617763, "grad_norm": 0.2535664737224579, "learning_rate": 0.00021558558558558557, "loss": 0.4395, "step": 1884 }, { "epoch": 0.5651114858534757, "grad_norm": 0.27441659569740295, "learning_rate": 0.0002155405405405405, "loss": 0.4591, "step": 1885 }, { "epoch": 0.5654112797451752, "grad_norm": 0.2535187900066376, "learning_rate": 0.00021549549549549547, "loss": 0.4256, "step": 1886 }, { "epoch": 0.5657110736368747, "grad_norm": 0.2877647876739502, "learning_rate": 0.00021545045045045044, "loss": 0.4877, "step": 1887 }, { "epoch": 0.5660108675285741, "grad_norm": 0.23665641248226166, "learning_rate": 0.00021540540540540537, "loss": 0.4506, "step": 1888 }, { "epoch": 0.5663106614202735, "grad_norm": 0.2620941996574402, "learning_rate": 0.00021536036036036034, "loss": 0.4415, "step": 1889 }, { "epoch": 0.5666104553119731, "grad_norm": 0.2704925239086151, "learning_rate": 0.0002153153153153153, "loss": 0.4855, "step": 1890 }, { "epoch": 0.5669102492036725, "grad_norm": 0.2636096775531769, "learning_rate": 0.00021527027027027024, "loss": 0.4196, "step": 1891 }, { "epoch": 0.5672100430953719, "grad_norm": 0.2897530496120453, "learning_rate": 0.0002152252252252252, "loss": 0.4556, "step": 1892 }, { "epoch": 0.5675098369870714, "grad_norm": 0.2407667189836502, "learning_rate": 0.00021518018018018017, "loss": 0.4406, "step": 1893 }, { "epoch": 0.5678096308787709, "grad_norm": 0.277865469455719, "learning_rate": 0.00021513513513513513, "loss": 0.4033, "step": 1894 }, { "epoch": 0.5681094247704703, "grad_norm": 0.2540576159954071, "learning_rate": 0.00021509009009009007, "loss": 0.4355, "step": 1895 }, { "epoch": 0.5684092186621698, "grad_norm": 0.26155397295951843, "learning_rate": 0.00021504504504504503, "loss": 0.4235, "step": 1896 }, { "epoch": 0.5687090125538692, "grad_norm": 0.25544247031211853, "learning_rate": 0.000215, "loss": 0.4279, "step": 1897 }, { "epoch": 0.5690088064455687, "grad_norm": 0.2406405806541443, "learning_rate": 0.00021495495495495493, "loss": 0.4208, "step": 1898 }, { "epoch": 0.5693086003372682, "grad_norm": 0.24712449312210083, "learning_rate": 0.0002149099099099099, "loss": 0.4344, "step": 1899 }, { "epoch": 0.5696083942289676, "grad_norm": 0.26752716302871704, "learning_rate": 0.00021486486486486486, "loss": 0.4934, "step": 1900 }, { "epoch": 0.569908188120667, "grad_norm": 0.24652232229709625, "learning_rate": 0.0002148198198198198, "loss": 0.4546, "step": 1901 }, { "epoch": 0.5702079820123664, "grad_norm": 0.251396507024765, "learning_rate": 0.00021477477477477476, "loss": 0.4738, "step": 1902 }, { "epoch": 0.570507775904066, "grad_norm": 0.24406296014785767, "learning_rate": 0.00021472972972972972, "loss": 0.4536, "step": 1903 }, { "epoch": 0.5708075697957654, "grad_norm": 0.2634785771369934, "learning_rate": 0.00021468468468468466, "loss": 0.4653, "step": 1904 }, { "epoch": 0.5711073636874648, "grad_norm": 0.24010764062404633, "learning_rate": 0.0002146396396396396, "loss": 0.4188, "step": 1905 }, { "epoch": 0.5714071575791643, "grad_norm": 0.24869616329669952, "learning_rate": 0.0002145945945945946, "loss": 0.4474, "step": 1906 }, { "epoch": 0.5717069514708638, "grad_norm": 0.24654364585876465, "learning_rate": 0.00021454954954954953, "loss": 0.4409, "step": 1907 }, { "epoch": 0.5720067453625632, "grad_norm": 0.29856958985328674, "learning_rate": 0.00021450450450450446, "loss": 0.4912, "step": 1908 }, { "epoch": 0.5723065392542627, "grad_norm": 0.2449256181716919, "learning_rate": 0.00021445945945945945, "loss": 0.4501, "step": 1909 }, { "epoch": 0.5726063331459621, "grad_norm": 0.29776662588119507, "learning_rate": 0.0002144144144144144, "loss": 0.4403, "step": 1910 }, { "epoch": 0.5729061270376616, "grad_norm": 0.26075392961502075, "learning_rate": 0.00021436936936936933, "loss": 0.4325, "step": 1911 }, { "epoch": 0.5732059209293611, "grad_norm": 0.23287932574748993, "learning_rate": 0.00021432432432432432, "loss": 0.4265, "step": 1912 }, { "epoch": 0.5735057148210605, "grad_norm": 0.2624457776546478, "learning_rate": 0.00021427927927927925, "loss": 0.4433, "step": 1913 }, { "epoch": 0.5738055087127599, "grad_norm": 0.23440878093242645, "learning_rate": 0.0002142342342342342, "loss": 0.4136, "step": 1914 }, { "epoch": 0.5741053026044595, "grad_norm": 0.2694714069366455, "learning_rate": 0.00021418918918918918, "loss": 0.4508, "step": 1915 }, { "epoch": 0.5744050964961589, "grad_norm": 0.261952668428421, "learning_rate": 0.00021414414414414412, "loss": 0.4697, "step": 1916 }, { "epoch": 0.5747048903878583, "grad_norm": 0.2526634931564331, "learning_rate": 0.00021409909909909908, "loss": 0.4518, "step": 1917 }, { "epoch": 0.5750046842795578, "grad_norm": 0.2554527223110199, "learning_rate": 0.00021405405405405405, "loss": 0.4802, "step": 1918 }, { "epoch": 0.5753044781712573, "grad_norm": 0.2729927599430084, "learning_rate": 0.00021400900900900898, "loss": 0.4622, "step": 1919 }, { "epoch": 0.5756042720629567, "grad_norm": 0.26347601413726807, "learning_rate": 0.00021396396396396395, "loss": 0.4709, "step": 1920 }, { "epoch": 0.5759040659546562, "grad_norm": 0.27795466780662537, "learning_rate": 0.0002139189189189189, "loss": 0.4555, "step": 1921 }, { "epoch": 0.5762038598463556, "grad_norm": 0.25687262415885925, "learning_rate": 0.00021387387387387385, "loss": 0.4459, "step": 1922 }, { "epoch": 0.5765036537380551, "grad_norm": 0.24855007231235504, "learning_rate": 0.0002138288288288288, "loss": 0.3944, "step": 1923 }, { "epoch": 0.5768034476297546, "grad_norm": 0.26772525906562805, "learning_rate": 0.00021378378378378378, "loss": 0.4557, "step": 1924 }, { "epoch": 0.577103241521454, "grad_norm": 0.2783243954181671, "learning_rate": 0.0002137387387387387, "loss": 0.4507, "step": 1925 }, { "epoch": 0.5774030354131534, "grad_norm": 0.26769769191741943, "learning_rate": 0.00021369369369369368, "loss": 0.4371, "step": 1926 }, { "epoch": 0.577702829304853, "grad_norm": 0.2783718705177307, "learning_rate": 0.00021364864864864864, "loss": 0.4269, "step": 1927 }, { "epoch": 0.5780026231965524, "grad_norm": 0.2654764652252197, "learning_rate": 0.0002136036036036036, "loss": 0.4407, "step": 1928 }, { "epoch": 0.5783024170882518, "grad_norm": 0.2694533169269562, "learning_rate": 0.00021355855855855854, "loss": 0.4822, "step": 1929 }, { "epoch": 0.5786022109799512, "grad_norm": 0.2403596192598343, "learning_rate": 0.00021351351351351348, "loss": 0.3985, "step": 1930 }, { "epoch": 0.5789020048716508, "grad_norm": 0.26435697078704834, "learning_rate": 0.00021346846846846847, "loss": 0.4547, "step": 1931 }, { "epoch": 0.5792017987633502, "grad_norm": 0.2416696548461914, "learning_rate": 0.0002134234234234234, "loss": 0.406, "step": 1932 }, { "epoch": 0.5795015926550496, "grad_norm": 0.2664091885089874, "learning_rate": 0.00021337837837837834, "loss": 0.412, "step": 1933 }, { "epoch": 0.5798013865467491, "grad_norm": 0.24384742975234985, "learning_rate": 0.00021333333333333333, "loss": 0.4621, "step": 1934 }, { "epoch": 0.5801011804384486, "grad_norm": 0.27569761872291565, "learning_rate": 0.00021328828828828827, "loss": 0.4643, "step": 1935 }, { "epoch": 0.580400974330148, "grad_norm": 0.2567230761051178, "learning_rate": 0.0002132432432432432, "loss": 0.4231, "step": 1936 }, { "epoch": 0.5807007682218475, "grad_norm": 0.25014641880989075, "learning_rate": 0.0002131981981981982, "loss": 0.4461, "step": 1937 }, { "epoch": 0.5810005621135469, "grad_norm": 0.2679264545440674, "learning_rate": 0.00021315315315315313, "loss": 0.4321, "step": 1938 }, { "epoch": 0.5813003560052464, "grad_norm": 0.2585947513580322, "learning_rate": 0.00021310810810810807, "loss": 0.4486, "step": 1939 }, { "epoch": 0.5816001498969459, "grad_norm": 0.2640276253223419, "learning_rate": 0.00021306306306306306, "loss": 0.4439, "step": 1940 }, { "epoch": 0.5818999437886453, "grad_norm": 0.2556924521923065, "learning_rate": 0.000213018018018018, "loss": 0.4557, "step": 1941 }, { "epoch": 0.5821997376803447, "grad_norm": 0.2560097873210907, "learning_rate": 0.00021297297297297294, "loss": 0.4384, "step": 1942 }, { "epoch": 0.5824995315720443, "grad_norm": 0.25773030519485474, "learning_rate": 0.00021292792792792793, "loss": 0.4289, "step": 1943 }, { "epoch": 0.5827993254637437, "grad_norm": 0.26476195454597473, "learning_rate": 0.00021288288288288286, "loss": 0.4513, "step": 1944 }, { "epoch": 0.5830991193554431, "grad_norm": 0.26929306983947754, "learning_rate": 0.0002128378378378378, "loss": 0.4464, "step": 1945 }, { "epoch": 0.5833989132471425, "grad_norm": 0.25713875889778137, "learning_rate": 0.0002127927927927928, "loss": 0.4422, "step": 1946 }, { "epoch": 0.5836987071388421, "grad_norm": 0.24906396865844727, "learning_rate": 0.00021274774774774773, "loss": 0.4454, "step": 1947 }, { "epoch": 0.5839985010305415, "grad_norm": 0.24760214984416962, "learning_rate": 0.00021270270270270266, "loss": 0.4343, "step": 1948 }, { "epoch": 0.584298294922241, "grad_norm": 0.25164082646369934, "learning_rate": 0.00021265765765765766, "loss": 0.4455, "step": 1949 }, { "epoch": 0.5845980888139404, "grad_norm": 0.24564093351364136, "learning_rate": 0.0002126126126126126, "loss": 0.4389, "step": 1950 }, { "epoch": 0.5848978827056399, "grad_norm": 0.2754795551300049, "learning_rate": 0.00021256756756756756, "loss": 0.466, "step": 1951 }, { "epoch": 0.5851976765973393, "grad_norm": 0.2439223974943161, "learning_rate": 0.00021252252252252252, "loss": 0.4425, "step": 1952 }, { "epoch": 0.5854974704890388, "grad_norm": 0.24320489168167114, "learning_rate": 0.00021247747747747746, "loss": 0.4085, "step": 1953 }, { "epoch": 0.5857972643807382, "grad_norm": 0.2758175730705261, "learning_rate": 0.00021243243243243242, "loss": 0.4606, "step": 1954 }, { "epoch": 0.5860970582724377, "grad_norm": 0.2623588442802429, "learning_rate": 0.00021238738738738736, "loss": 0.4167, "step": 1955 }, { "epoch": 0.5863968521641372, "grad_norm": 0.2561275064945221, "learning_rate": 0.00021234234234234232, "loss": 0.4399, "step": 1956 }, { "epoch": 0.5866966460558366, "grad_norm": 0.2818892300128937, "learning_rate": 0.00021229729729729728, "loss": 0.4628, "step": 1957 }, { "epoch": 0.586996439947536, "grad_norm": 0.26609155535697937, "learning_rate": 0.00021225225225225222, "loss": 0.4285, "step": 1958 }, { "epoch": 0.5872962338392356, "grad_norm": 0.2503769099712372, "learning_rate": 0.00021220720720720719, "loss": 0.4632, "step": 1959 }, { "epoch": 0.587596027730935, "grad_norm": 0.2810426950454712, "learning_rate": 0.00021216216216216215, "loss": 0.4591, "step": 1960 }, { "epoch": 0.5878958216226344, "grad_norm": 0.2517390251159668, "learning_rate": 0.00021211711711711709, "loss": 0.4323, "step": 1961 }, { "epoch": 0.5881956155143339, "grad_norm": 0.26425543427467346, "learning_rate": 0.00021207207207207205, "loss": 0.4466, "step": 1962 }, { "epoch": 0.5884954094060334, "grad_norm": 0.25192978978157043, "learning_rate": 0.00021202702702702701, "loss": 0.4524, "step": 1963 }, { "epoch": 0.5887952032977328, "grad_norm": 0.26266035437583923, "learning_rate": 0.00021198198198198195, "loss": 0.4417, "step": 1964 }, { "epoch": 0.5890949971894323, "grad_norm": 0.31183677911758423, "learning_rate": 0.00021193693693693694, "loss": 0.4473, "step": 1965 }, { "epoch": 0.5893947910811317, "grad_norm": 0.24742548167705536, "learning_rate": 0.00021189189189189188, "loss": 0.4209, "step": 1966 }, { "epoch": 0.5896945849728312, "grad_norm": 0.27282243967056274, "learning_rate": 0.00021184684684684682, "loss": 0.4316, "step": 1967 }, { "epoch": 0.5899943788645307, "grad_norm": 0.2748444974422455, "learning_rate": 0.0002118018018018018, "loss": 0.4276, "step": 1968 }, { "epoch": 0.5902941727562301, "grad_norm": 0.2746492624282837, "learning_rate": 0.00021175675675675674, "loss": 0.4597, "step": 1969 }, { "epoch": 0.5905939666479295, "grad_norm": 0.25905507802963257, "learning_rate": 0.00021171171171171168, "loss": 0.4668, "step": 1970 }, { "epoch": 0.590893760539629, "grad_norm": 0.2456134557723999, "learning_rate": 0.00021166666666666667, "loss": 0.4057, "step": 1971 }, { "epoch": 0.5911935544313285, "grad_norm": 0.25221529603004456, "learning_rate": 0.0002116216216216216, "loss": 0.4427, "step": 1972 }, { "epoch": 0.5914933483230279, "grad_norm": 0.24493472278118134, "learning_rate": 0.00021157657657657654, "loss": 0.4206, "step": 1973 }, { "epoch": 0.5917931422147273, "grad_norm": 0.2561238408088684, "learning_rate": 0.00021153153153153154, "loss": 0.4537, "step": 1974 }, { "epoch": 0.5920929361064269, "grad_norm": 0.2350313663482666, "learning_rate": 0.00021148648648648647, "loss": 0.4163, "step": 1975 }, { "epoch": 0.5923927299981263, "grad_norm": 0.2535955309867859, "learning_rate": 0.0002114414414414414, "loss": 0.4435, "step": 1976 }, { "epoch": 0.5926925238898257, "grad_norm": 0.2498706728219986, "learning_rate": 0.00021139639639639637, "loss": 0.4507, "step": 1977 }, { "epoch": 0.5929923177815252, "grad_norm": 0.2510056495666504, "learning_rate": 0.00021135135135135134, "loss": 0.4268, "step": 1978 }, { "epoch": 0.5932921116732247, "grad_norm": 0.252605676651001, "learning_rate": 0.00021130630630630627, "loss": 0.4457, "step": 1979 }, { "epoch": 0.5935919055649241, "grad_norm": 0.2618269622325897, "learning_rate": 0.00021126126126126124, "loss": 0.4447, "step": 1980 }, { "epoch": 0.5938916994566236, "grad_norm": 0.25301989912986755, "learning_rate": 0.0002112162162162162, "loss": 0.4468, "step": 1981 }, { "epoch": 0.594191493348323, "grad_norm": 0.2434222400188446, "learning_rate": 0.00021117117117117114, "loss": 0.4348, "step": 1982 }, { "epoch": 0.5944912872400225, "grad_norm": 0.24123191833496094, "learning_rate": 0.0002111261261261261, "loss": 0.4485, "step": 1983 }, { "epoch": 0.594791081131722, "grad_norm": 0.2643167972564697, "learning_rate": 0.00021108108108108107, "loss": 0.4464, "step": 1984 }, { "epoch": 0.5950908750234214, "grad_norm": 0.2503741979598999, "learning_rate": 0.00021103603603603603, "loss": 0.4247, "step": 1985 }, { "epoch": 0.5953906689151208, "grad_norm": 0.23599058389663696, "learning_rate": 0.00021099099099099097, "loss": 0.4089, "step": 1986 }, { "epoch": 0.5956904628068204, "grad_norm": 0.25657573342323303, "learning_rate": 0.00021094594594594593, "loss": 0.445, "step": 1987 }, { "epoch": 0.5959902566985198, "grad_norm": 0.26530593633651733, "learning_rate": 0.0002109009009009009, "loss": 0.4386, "step": 1988 }, { "epoch": 0.5962900505902192, "grad_norm": 0.2542692720890045, "learning_rate": 0.00021085585585585583, "loss": 0.4431, "step": 1989 }, { "epoch": 0.5965898444819187, "grad_norm": 0.2595093548297882, "learning_rate": 0.0002108108108108108, "loss": 0.4243, "step": 1990 }, { "epoch": 0.5968896383736182, "grad_norm": 0.2481933832168579, "learning_rate": 0.00021076576576576576, "loss": 0.4215, "step": 1991 }, { "epoch": 0.5971894322653176, "grad_norm": 0.2607382535934448, "learning_rate": 0.0002107207207207207, "loss": 0.4505, "step": 1992 }, { "epoch": 0.597489226157017, "grad_norm": 0.26596030592918396, "learning_rate": 0.00021067567567567566, "loss": 0.4703, "step": 1993 }, { "epoch": 0.5977890200487165, "grad_norm": 0.24884359538555145, "learning_rate": 0.00021063063063063062, "loss": 0.4237, "step": 1994 }, { "epoch": 0.5980888139404159, "grad_norm": 0.23510870337486267, "learning_rate": 0.00021058558558558556, "loss": 0.4235, "step": 1995 }, { "epoch": 0.5983886078321154, "grad_norm": 0.25982344150543213, "learning_rate": 0.00021054054054054052, "loss": 0.4262, "step": 1996 }, { "epoch": 0.5986884017238149, "grad_norm": 0.24918124079704285, "learning_rate": 0.0002104954954954955, "loss": 0.4225, "step": 1997 }, { "epoch": 0.5989881956155143, "grad_norm": 0.26578548550605774, "learning_rate": 0.00021045045045045042, "loss": 0.4685, "step": 1998 }, { "epoch": 0.5992879895072137, "grad_norm": 0.2682873010635376, "learning_rate": 0.00021040540540540542, "loss": 0.4291, "step": 1999 }, { "epoch": 0.5995877833989133, "grad_norm": 0.247017040848732, "learning_rate": 0.00021036036036036035, "loss": 0.4457, "step": 2000 }, { "epoch": 0.5995877833989133, "eval_loss": 0.4406769275665283, "eval_runtime": 564.9558, "eval_samples_per_second": 3.822, "eval_steps_per_second": 0.478, "step": 2000 }, { "epoch": 0.5998875772906127, "grad_norm": 0.25887760519981384, "learning_rate": 0.0002103153153153153, "loss": 0.4261, "step": 2001 }, { "epoch": 0.6001873711823121, "grad_norm": 0.26813459396362305, "learning_rate": 0.00021027027027027023, "loss": 0.4091, "step": 2002 }, { "epoch": 0.6004871650740116, "grad_norm": 0.2717922329902649, "learning_rate": 0.00021022522522522522, "loss": 0.4257, "step": 2003 }, { "epoch": 0.6007869589657111, "grad_norm": 0.2423432320356369, "learning_rate": 0.00021018018018018015, "loss": 0.4187, "step": 2004 }, { "epoch": 0.6010867528574105, "grad_norm": 0.2616721987724304, "learning_rate": 0.0002101351351351351, "loss": 0.4414, "step": 2005 }, { "epoch": 0.60138654674911, "grad_norm": 0.2668519914150238, "learning_rate": 0.00021009009009009008, "loss": 0.4566, "step": 2006 }, { "epoch": 0.6016863406408094, "grad_norm": 0.24378737807273865, "learning_rate": 0.00021004504504504502, "loss": 0.4379, "step": 2007 }, { "epoch": 0.6019861345325089, "grad_norm": 0.24571847915649414, "learning_rate": 0.00020999999999999998, "loss": 0.4491, "step": 2008 }, { "epoch": 0.6022859284242084, "grad_norm": 0.23369182646274567, "learning_rate": 0.00020995495495495495, "loss": 0.4023, "step": 2009 }, { "epoch": 0.6025857223159078, "grad_norm": 0.27274230122566223, "learning_rate": 0.00020990990990990988, "loss": 0.4514, "step": 2010 }, { "epoch": 0.6028855162076072, "grad_norm": 0.29650700092315674, "learning_rate": 0.00020986486486486485, "loss": 0.4553, "step": 2011 }, { "epoch": 0.6031853100993068, "grad_norm": 0.2543351650238037, "learning_rate": 0.0002098198198198198, "loss": 0.4522, "step": 2012 }, { "epoch": 0.6034851039910062, "grad_norm": 0.25208958983421326, "learning_rate": 0.00020977477477477475, "loss": 0.4342, "step": 2013 }, { "epoch": 0.6037848978827056, "grad_norm": 0.24142181873321533, "learning_rate": 0.0002097297297297297, "loss": 0.4106, "step": 2014 }, { "epoch": 0.604084691774405, "grad_norm": 0.2537544071674347, "learning_rate": 0.00020968468468468467, "loss": 0.4373, "step": 2015 }, { "epoch": 0.6043844856661046, "grad_norm": 0.27682605385780334, "learning_rate": 0.0002096396396396396, "loss": 0.4491, "step": 2016 }, { "epoch": 0.604684279557804, "grad_norm": 0.24604292213916779, "learning_rate": 0.00020959459459459458, "loss": 0.4059, "step": 2017 }, { "epoch": 0.6049840734495034, "grad_norm": 0.24552318453788757, "learning_rate": 0.00020954954954954954, "loss": 0.4361, "step": 2018 }, { "epoch": 0.6052838673412029, "grad_norm": 0.24892392754554749, "learning_rate": 0.00020950450450450448, "loss": 0.4579, "step": 2019 }, { "epoch": 0.6055836612329024, "grad_norm": 0.26011762022972107, "learning_rate": 0.00020945945945945944, "loss": 0.4444, "step": 2020 }, { "epoch": 0.6058834551246018, "grad_norm": 0.25486037135124207, "learning_rate": 0.0002094144144144144, "loss": 0.4256, "step": 2021 }, { "epoch": 0.6061832490163013, "grad_norm": 0.24915416538715363, "learning_rate": 0.00020936936936936937, "loss": 0.4295, "step": 2022 }, { "epoch": 0.6064830429080007, "grad_norm": 0.24448558688163757, "learning_rate": 0.0002093243243243243, "loss": 0.422, "step": 2023 }, { "epoch": 0.6067828367997002, "grad_norm": 0.24826547503471375, "learning_rate": 0.00020927927927927927, "loss": 0.3927, "step": 2024 }, { "epoch": 0.6070826306913997, "grad_norm": 0.24415460228919983, "learning_rate": 0.00020923423423423423, "loss": 0.4128, "step": 2025 }, { "epoch": 0.6073824245830991, "grad_norm": 0.25376150012016296, "learning_rate": 0.00020918918918918917, "loss": 0.4363, "step": 2026 }, { "epoch": 0.6076822184747985, "grad_norm": 0.2526264786720276, "learning_rate": 0.0002091441441441441, "loss": 0.4354, "step": 2027 }, { "epoch": 0.6079820123664981, "grad_norm": 0.25586017966270447, "learning_rate": 0.0002090990990990991, "loss": 0.401, "step": 2028 }, { "epoch": 0.6082818062581975, "grad_norm": 0.2642875909805298, "learning_rate": 0.00020905405405405403, "loss": 0.4476, "step": 2029 }, { "epoch": 0.6085816001498969, "grad_norm": 0.2795594334602356, "learning_rate": 0.00020900900900900897, "loss": 0.4845, "step": 2030 }, { "epoch": 0.6088813940415964, "grad_norm": 0.23246948421001434, "learning_rate": 0.00020896396396396396, "loss": 0.3906, "step": 2031 }, { "epoch": 0.6091811879332959, "grad_norm": 0.25595536828041077, "learning_rate": 0.0002089189189189189, "loss": 0.4331, "step": 2032 }, { "epoch": 0.6094809818249953, "grad_norm": 0.24884217977523804, "learning_rate": 0.00020887387387387383, "loss": 0.4361, "step": 2033 }, { "epoch": 0.6097807757166948, "grad_norm": 0.25283509492874146, "learning_rate": 0.00020882882882882883, "loss": 0.422, "step": 2034 }, { "epoch": 0.6100805696083942, "grad_norm": 0.25681042671203613, "learning_rate": 0.00020878378378378376, "loss": 0.4301, "step": 2035 }, { "epoch": 0.6103803635000937, "grad_norm": 0.2695556879043579, "learning_rate": 0.0002087387387387387, "loss": 0.4624, "step": 2036 }, { "epoch": 0.6106801573917932, "grad_norm": 0.28260117769241333, "learning_rate": 0.0002086936936936937, "loss": 0.4608, "step": 2037 }, { "epoch": 0.6109799512834926, "grad_norm": 0.2578640878200531, "learning_rate": 0.00020864864864864863, "loss": 0.4638, "step": 2038 }, { "epoch": 0.611279745175192, "grad_norm": 0.2544034421443939, "learning_rate": 0.00020860360360360356, "loss": 0.4222, "step": 2039 }, { "epoch": 0.6115795390668916, "grad_norm": 0.2726188600063324, "learning_rate": 0.00020855855855855855, "loss": 0.4802, "step": 2040 }, { "epoch": 0.611879332958591, "grad_norm": 0.26486557722091675, "learning_rate": 0.0002085135135135135, "loss": 0.4475, "step": 2041 }, { "epoch": 0.6121791268502904, "grad_norm": 0.2517718970775604, "learning_rate": 0.00020846846846846843, "loss": 0.4445, "step": 2042 }, { "epoch": 0.6124789207419898, "grad_norm": 0.2507137060165405, "learning_rate": 0.00020842342342342342, "loss": 0.4243, "step": 2043 }, { "epoch": 0.6127787146336894, "grad_norm": 0.2707221806049347, "learning_rate": 0.00020837837837837836, "loss": 0.4706, "step": 2044 }, { "epoch": 0.6130785085253888, "grad_norm": 0.24258100986480713, "learning_rate": 0.00020833333333333332, "loss": 0.423, "step": 2045 }, { "epoch": 0.6133783024170882, "grad_norm": 0.25611022114753723, "learning_rate": 0.00020828828828828828, "loss": 0.4553, "step": 2046 }, { "epoch": 0.6136780963087877, "grad_norm": 0.26037782430648804, "learning_rate": 0.00020824324324324322, "loss": 0.4173, "step": 2047 }, { "epoch": 0.6139778902004872, "grad_norm": 0.26126980781555176, "learning_rate": 0.00020819819819819818, "loss": 0.4556, "step": 2048 }, { "epoch": 0.6142776840921866, "grad_norm": 0.283407598733902, "learning_rate": 0.00020815315315315315, "loss": 0.4421, "step": 2049 }, { "epoch": 0.6145774779838861, "grad_norm": 0.2533104717731476, "learning_rate": 0.00020810810810810808, "loss": 0.4253, "step": 2050 }, { "epoch": 0.6148772718755855, "grad_norm": 0.24108561873435974, "learning_rate": 0.00020806306306306305, "loss": 0.438, "step": 2051 }, { "epoch": 0.615177065767285, "grad_norm": 0.2516781687736511, "learning_rate": 0.00020801801801801799, "loss": 0.4245, "step": 2052 }, { "epoch": 0.6154768596589845, "grad_norm": 0.25689882040023804, "learning_rate": 0.00020797297297297295, "loss": 0.4412, "step": 2053 }, { "epoch": 0.6157766535506839, "grad_norm": 0.2661590576171875, "learning_rate": 0.0002079279279279279, "loss": 0.47, "step": 2054 }, { "epoch": 0.6160764474423833, "grad_norm": 0.2580914795398712, "learning_rate": 0.00020788288288288285, "loss": 0.4537, "step": 2055 }, { "epoch": 0.6163762413340829, "grad_norm": 0.23960340023040771, "learning_rate": 0.00020783783783783784, "loss": 0.4535, "step": 2056 }, { "epoch": 0.6166760352257823, "grad_norm": 0.2658163905143738, "learning_rate": 0.00020779279279279278, "loss": 0.4483, "step": 2057 }, { "epoch": 0.6169758291174817, "grad_norm": 0.2633172869682312, "learning_rate": 0.00020774774774774771, "loss": 0.4408, "step": 2058 }, { "epoch": 0.6172756230091812, "grad_norm": 0.2618810832500458, "learning_rate": 0.0002077027027027027, "loss": 0.4497, "step": 2059 }, { "epoch": 0.6175754169008807, "grad_norm": 0.2505929470062256, "learning_rate": 0.00020765765765765764, "loss": 0.4411, "step": 2060 }, { "epoch": 0.6178752107925801, "grad_norm": 0.2556394338607788, "learning_rate": 0.00020761261261261258, "loss": 0.4427, "step": 2061 }, { "epoch": 0.6181750046842795, "grad_norm": 0.2584443688392639, "learning_rate": 0.00020756756756756757, "loss": 0.4707, "step": 2062 }, { "epoch": 0.618474798575979, "grad_norm": 0.2633087635040283, "learning_rate": 0.0002075225225225225, "loss": 0.4576, "step": 2063 }, { "epoch": 0.6187745924676785, "grad_norm": 0.2487655133008957, "learning_rate": 0.00020747747747747744, "loss": 0.4374, "step": 2064 }, { "epoch": 0.619074386359378, "grad_norm": 0.24193575978279114, "learning_rate": 0.00020743243243243243, "loss": 0.4267, "step": 2065 }, { "epoch": 0.6193741802510774, "grad_norm": 0.2395676076412201, "learning_rate": 0.00020738738738738737, "loss": 0.4236, "step": 2066 }, { "epoch": 0.6196739741427768, "grad_norm": 0.24150055646896362, "learning_rate": 0.0002073423423423423, "loss": 0.4138, "step": 2067 }, { "epoch": 0.6199737680344763, "grad_norm": 0.2652229368686676, "learning_rate": 0.0002072972972972973, "loss": 0.4248, "step": 2068 }, { "epoch": 0.6202735619261758, "grad_norm": 0.24750439822673798, "learning_rate": 0.00020725225225225224, "loss": 0.4273, "step": 2069 }, { "epoch": 0.6205733558178752, "grad_norm": 0.25777891278266907, "learning_rate": 0.00020720720720720717, "loss": 0.4471, "step": 2070 }, { "epoch": 0.6208731497095746, "grad_norm": 0.25587761402130127, "learning_rate": 0.00020716216216216216, "loss": 0.4246, "step": 2071 }, { "epoch": 0.6211729436012742, "grad_norm": 0.2518714368343353, "learning_rate": 0.0002071171171171171, "loss": 0.4645, "step": 2072 }, { "epoch": 0.6214727374929736, "grad_norm": 0.24348096549510956, "learning_rate": 0.00020707207207207204, "loss": 0.4454, "step": 2073 }, { "epoch": 0.621772531384673, "grad_norm": 0.2423403263092041, "learning_rate": 0.000207027027027027, "loss": 0.405, "step": 2074 }, { "epoch": 0.6220723252763725, "grad_norm": 0.26894888281822205, "learning_rate": 0.00020698198198198196, "loss": 0.4699, "step": 2075 }, { "epoch": 0.622372119168072, "grad_norm": 0.2578108608722687, "learning_rate": 0.0002069369369369369, "loss": 0.4365, "step": 2076 }, { "epoch": 0.6226719130597714, "grad_norm": 0.27144232392311096, "learning_rate": 0.00020689189189189187, "loss": 0.4291, "step": 2077 }, { "epoch": 0.6229717069514709, "grad_norm": 0.25405609607696533, "learning_rate": 0.00020684684684684683, "loss": 0.4389, "step": 2078 }, { "epoch": 0.6232715008431703, "grad_norm": 0.24775098264217377, "learning_rate": 0.0002068018018018018, "loss": 0.416, "step": 2079 }, { "epoch": 0.6235712947348698, "grad_norm": 0.25767782330513, "learning_rate": 0.00020675675675675673, "loss": 0.455, "step": 2080 }, { "epoch": 0.6238710886265693, "grad_norm": 0.2590571641921997, "learning_rate": 0.0002067117117117117, "loss": 0.4436, "step": 2081 }, { "epoch": 0.6241708825182687, "grad_norm": 0.24577729403972626, "learning_rate": 0.00020666666666666666, "loss": 0.4389, "step": 2082 }, { "epoch": 0.6244706764099681, "grad_norm": 0.25743165612220764, "learning_rate": 0.0002066216216216216, "loss": 0.4467, "step": 2083 }, { "epoch": 0.6247704703016677, "grad_norm": 0.24127769470214844, "learning_rate": 0.00020657657657657656, "loss": 0.4056, "step": 2084 }, { "epoch": 0.6250702641933671, "grad_norm": 0.25930923223495483, "learning_rate": 0.00020653153153153152, "loss": 0.4352, "step": 2085 }, { "epoch": 0.6253700580850665, "grad_norm": 0.23111720383167267, "learning_rate": 0.00020648648648648646, "loss": 0.4035, "step": 2086 }, { "epoch": 0.625669851976766, "grad_norm": 0.25705838203430176, "learning_rate": 0.00020644144144144142, "loss": 0.4402, "step": 2087 }, { "epoch": 0.6259696458684655, "grad_norm": 0.260232150554657, "learning_rate": 0.0002063963963963964, "loss": 0.444, "step": 2088 }, { "epoch": 0.6262694397601649, "grad_norm": 0.24525994062423706, "learning_rate": 0.00020635135135135132, "loss": 0.4461, "step": 2089 }, { "epoch": 0.6265692336518643, "grad_norm": 0.2550069987773895, "learning_rate": 0.00020630630630630631, "loss": 0.4391, "step": 2090 }, { "epoch": 0.6268690275435638, "grad_norm": 0.24561873078346252, "learning_rate": 0.00020626126126126125, "loss": 0.424, "step": 2091 }, { "epoch": 0.6271688214352632, "grad_norm": 0.2500348687171936, "learning_rate": 0.0002062162162162162, "loss": 0.4415, "step": 2092 }, { "epoch": 0.6274686153269627, "grad_norm": 0.23684681951999664, "learning_rate": 0.00020617117117117118, "loss": 0.4282, "step": 2093 }, { "epoch": 0.6277684092186622, "grad_norm": 0.2459501475095749, "learning_rate": 0.00020612612612612612, "loss": 0.4229, "step": 2094 }, { "epoch": 0.6280682031103616, "grad_norm": 0.23220065236091614, "learning_rate": 0.00020608108108108105, "loss": 0.4083, "step": 2095 }, { "epoch": 0.628367997002061, "grad_norm": 0.26996707916259766, "learning_rate": 0.00020603603603603604, "loss": 0.4581, "step": 2096 }, { "epoch": 0.6286677908937606, "grad_norm": 0.2551998496055603, "learning_rate": 0.00020599099099099098, "loss": 0.4444, "step": 2097 }, { "epoch": 0.62896758478546, "grad_norm": 0.2604631781578064, "learning_rate": 0.00020594594594594592, "loss": 0.4367, "step": 2098 }, { "epoch": 0.6292673786771594, "grad_norm": 0.25958266854286194, "learning_rate": 0.00020590090090090085, "loss": 0.4618, "step": 2099 }, { "epoch": 0.6295671725688589, "grad_norm": 0.2714870274066925, "learning_rate": 0.00020585585585585584, "loss": 0.4382, "step": 2100 }, { "epoch": 0.6298669664605584, "grad_norm": 0.2739773392677307, "learning_rate": 0.00020581081081081078, "loss": 0.4752, "step": 2101 }, { "epoch": 0.6301667603522578, "grad_norm": 0.24829277396202087, "learning_rate": 0.00020576576576576575, "loss": 0.4285, "step": 2102 }, { "epoch": 0.6304665542439573, "grad_norm": 0.2551855742931366, "learning_rate": 0.0002057207207207207, "loss": 0.4259, "step": 2103 }, { "epoch": 0.6307663481356567, "grad_norm": 0.243735671043396, "learning_rate": 0.00020567567567567565, "loss": 0.4201, "step": 2104 }, { "epoch": 0.6310661420273562, "grad_norm": 0.2511364817619324, "learning_rate": 0.0002056306306306306, "loss": 0.4351, "step": 2105 }, { "epoch": 0.6313659359190557, "grad_norm": 0.2456447184085846, "learning_rate": 0.00020558558558558557, "loss": 0.448, "step": 2106 }, { "epoch": 0.6316657298107551, "grad_norm": 0.26450565457344055, "learning_rate": 0.0002055405405405405, "loss": 0.4576, "step": 2107 }, { "epoch": 0.6319655237024545, "grad_norm": 0.25267186760902405, "learning_rate": 0.00020549549549549547, "loss": 0.4511, "step": 2108 }, { "epoch": 0.632265317594154, "grad_norm": 0.2436206340789795, "learning_rate": 0.00020545045045045044, "loss": 0.4147, "step": 2109 }, { "epoch": 0.6325651114858535, "grad_norm": 0.27077367901802063, "learning_rate": 0.00020540540540540537, "loss": 0.4687, "step": 2110 }, { "epoch": 0.6328649053775529, "grad_norm": 0.25476735830307007, "learning_rate": 0.00020536036036036034, "loss": 0.4357, "step": 2111 }, { "epoch": 0.6331646992692523, "grad_norm": 0.23889677226543427, "learning_rate": 0.0002053153153153153, "loss": 0.4109, "step": 2112 }, { "epoch": 0.6334644931609519, "grad_norm": 0.2620011270046234, "learning_rate": 0.00020527027027027027, "loss": 0.4418, "step": 2113 }, { "epoch": 0.6337642870526513, "grad_norm": 0.24259789288043976, "learning_rate": 0.0002052252252252252, "loss": 0.4158, "step": 2114 }, { "epoch": 0.6340640809443507, "grad_norm": 0.26212331652641296, "learning_rate": 0.00020518018018018017, "loss": 0.4407, "step": 2115 }, { "epoch": 0.6343638748360502, "grad_norm": 0.2421627789735794, "learning_rate": 0.00020513513513513513, "loss": 0.4162, "step": 2116 }, { "epoch": 0.6346636687277497, "grad_norm": 0.25949686765670776, "learning_rate": 0.00020509009009009007, "loss": 0.4436, "step": 2117 }, { "epoch": 0.6349634626194491, "grad_norm": 0.26797404885292053, "learning_rate": 0.00020504504504504503, "loss": 0.4684, "step": 2118 }, { "epoch": 0.6352632565111486, "grad_norm": 0.2433563470840454, "learning_rate": 0.000205, "loss": 0.4312, "step": 2119 }, { "epoch": 0.635563050402848, "grad_norm": 0.25377362966537476, "learning_rate": 0.00020495495495495493, "loss": 0.4583, "step": 2120 }, { "epoch": 0.6358628442945475, "grad_norm": 0.2583523392677307, "learning_rate": 0.0002049099099099099, "loss": 0.4587, "step": 2121 }, { "epoch": 0.636162638186247, "grad_norm": 0.2512962520122528, "learning_rate": 0.00020486486486486486, "loss": 0.4148, "step": 2122 }, { "epoch": 0.6364624320779464, "grad_norm": 0.24238212406635284, "learning_rate": 0.0002048198198198198, "loss": 0.4057, "step": 2123 }, { "epoch": 0.6367622259696458, "grad_norm": 0.25228351354599, "learning_rate": 0.00020477477477477473, "loss": 0.4291, "step": 2124 }, { "epoch": 0.6370620198613454, "grad_norm": 0.26437970995903015, "learning_rate": 0.00020472972972972972, "loss": 0.4425, "step": 2125 }, { "epoch": 0.6373618137530448, "grad_norm": 0.2502360939979553, "learning_rate": 0.00020468468468468466, "loss": 0.4102, "step": 2126 }, { "epoch": 0.6376616076447442, "grad_norm": 0.24895761907100677, "learning_rate": 0.0002046396396396396, "loss": 0.4423, "step": 2127 }, { "epoch": 0.6379614015364437, "grad_norm": 0.2674795985221863, "learning_rate": 0.0002045945945945946, "loss": 0.4572, "step": 2128 }, { "epoch": 0.6382611954281432, "grad_norm": 0.2468012422323227, "learning_rate": 0.00020454954954954953, "loss": 0.4525, "step": 2129 }, { "epoch": 0.6385609893198426, "grad_norm": 0.2560267448425293, "learning_rate": 0.00020450450450450446, "loss": 0.4553, "step": 2130 }, { "epoch": 0.638860783211542, "grad_norm": 0.25216519832611084, "learning_rate": 0.00020445945945945945, "loss": 0.4319, "step": 2131 }, { "epoch": 0.6391605771032415, "grad_norm": 0.22927191853523254, "learning_rate": 0.0002044144144144144, "loss": 0.4088, "step": 2132 }, { "epoch": 0.639460370994941, "grad_norm": 0.2495734840631485, "learning_rate": 0.00020436936936936933, "loss": 0.4301, "step": 2133 }, { "epoch": 0.6397601648866404, "grad_norm": 0.2604648470878601, "learning_rate": 0.00020432432432432432, "loss": 0.4115, "step": 2134 }, { "epoch": 0.6400599587783399, "grad_norm": 0.27185025811195374, "learning_rate": 0.00020427927927927925, "loss": 0.4452, "step": 2135 }, { "epoch": 0.6403597526700393, "grad_norm": 0.26344043016433716, "learning_rate": 0.00020423423423423422, "loss": 0.4178, "step": 2136 }, { "epoch": 0.6406595465617388, "grad_norm": 0.26041179895401, "learning_rate": 0.00020418918918918918, "loss": 0.4384, "step": 2137 }, { "epoch": 0.6409593404534383, "grad_norm": 0.25143757462501526, "learning_rate": 0.00020414414414414412, "loss": 0.4486, "step": 2138 }, { "epoch": 0.6412591343451377, "grad_norm": 0.2869662642478943, "learning_rate": 0.00020409909909909908, "loss": 0.487, "step": 2139 }, { "epoch": 0.6415589282368371, "grad_norm": 0.2788044214248657, "learning_rate": 0.00020405405405405405, "loss": 0.4255, "step": 2140 }, { "epoch": 0.6418587221285367, "grad_norm": 0.27385109663009644, "learning_rate": 0.00020400900900900898, "loss": 0.4535, "step": 2141 }, { "epoch": 0.6421585160202361, "grad_norm": 0.26231497526168823, "learning_rate": 0.00020396396396396395, "loss": 0.4435, "step": 2142 }, { "epoch": 0.6424583099119355, "grad_norm": 0.26582983136177063, "learning_rate": 0.0002039189189189189, "loss": 0.4394, "step": 2143 }, { "epoch": 0.642758103803635, "grad_norm": 0.2665957510471344, "learning_rate": 0.00020387387387387385, "loss": 0.4401, "step": 2144 }, { "epoch": 0.6430578976953345, "grad_norm": 0.2581312358379364, "learning_rate": 0.0002038288288288288, "loss": 0.4302, "step": 2145 }, { "epoch": 0.6433576915870339, "grad_norm": 0.25063759088516235, "learning_rate": 0.00020378378378378375, "loss": 0.4558, "step": 2146 }, { "epoch": 0.6436574854787334, "grad_norm": 0.2655949890613556, "learning_rate": 0.00020373873873873874, "loss": 0.4559, "step": 2147 }, { "epoch": 0.6439572793704328, "grad_norm": 0.25797712802886963, "learning_rate": 0.00020369369369369368, "loss": 0.4127, "step": 2148 }, { "epoch": 0.6442570732621323, "grad_norm": 0.2774755358695984, "learning_rate": 0.00020364864864864861, "loss": 0.4807, "step": 2149 }, { "epoch": 0.6445568671538318, "grad_norm": 0.27283889055252075, "learning_rate": 0.0002036036036036036, "loss": 0.4483, "step": 2150 }, { "epoch": 0.6448566610455312, "grad_norm": 0.2647114396095276, "learning_rate": 0.00020355855855855854, "loss": 0.4629, "step": 2151 }, { "epoch": 0.6451564549372306, "grad_norm": 0.2683508098125458, "learning_rate": 0.00020351351351351348, "loss": 0.4755, "step": 2152 }, { "epoch": 0.6454562488289302, "grad_norm": 0.26375094056129456, "learning_rate": 0.00020346846846846847, "loss": 0.4354, "step": 2153 }, { "epoch": 0.6457560427206296, "grad_norm": 0.2688734233379364, "learning_rate": 0.0002034234234234234, "loss": 0.4515, "step": 2154 }, { "epoch": 0.646055836612329, "grad_norm": 0.2545178532600403, "learning_rate": 0.00020337837837837834, "loss": 0.4281, "step": 2155 }, { "epoch": 0.6463556305040284, "grad_norm": 0.24753254652023315, "learning_rate": 0.00020333333333333333, "loss": 0.4332, "step": 2156 }, { "epoch": 0.646655424395728, "grad_norm": 0.2619148790836334, "learning_rate": 0.00020328828828828827, "loss": 0.4531, "step": 2157 }, { "epoch": 0.6469552182874274, "grad_norm": 0.2698518633842468, "learning_rate": 0.0002032432432432432, "loss": 0.4807, "step": 2158 }, { "epoch": 0.6472550121791268, "grad_norm": 0.2625516355037689, "learning_rate": 0.0002031981981981982, "loss": 0.4368, "step": 2159 }, { "epoch": 0.6475548060708263, "grad_norm": 0.25750574469566345, "learning_rate": 0.00020315315315315313, "loss": 0.4489, "step": 2160 }, { "epoch": 0.6478545999625258, "grad_norm": 0.2963887155056, "learning_rate": 0.00020310810810810807, "loss": 0.4567, "step": 2161 }, { "epoch": 0.6481543938542252, "grad_norm": 0.28631022572517395, "learning_rate": 0.00020306306306306306, "loss": 0.457, "step": 2162 }, { "epoch": 0.6484541877459247, "grad_norm": 0.2652076184749603, "learning_rate": 0.000203018018018018, "loss": 0.4008, "step": 2163 }, { "epoch": 0.6487539816376241, "grad_norm": 0.25599247217178345, "learning_rate": 0.00020297297297297294, "loss": 0.4236, "step": 2164 }, { "epoch": 0.6490537755293236, "grad_norm": 0.2655317783355713, "learning_rate": 0.00020292792792792793, "loss": 0.4401, "step": 2165 }, { "epoch": 0.6493535694210231, "grad_norm": 0.27608954906463623, "learning_rate": 0.00020288288288288286, "loss": 0.4469, "step": 2166 }, { "epoch": 0.6496533633127225, "grad_norm": 0.2523987293243408, "learning_rate": 0.0002028378378378378, "loss": 0.4134, "step": 2167 }, { "epoch": 0.6499531572044219, "grad_norm": 0.26536789536476135, "learning_rate": 0.0002027927927927928, "loss": 0.4416, "step": 2168 }, { "epoch": 0.6502529510961215, "grad_norm": 0.24977469444274902, "learning_rate": 0.00020274774774774773, "loss": 0.4318, "step": 2169 }, { "epoch": 0.6505527449878209, "grad_norm": 0.27510321140289307, "learning_rate": 0.0002027027027027027, "loss": 0.4535, "step": 2170 }, { "epoch": 0.6508525388795203, "grad_norm": 0.24680355191230774, "learning_rate": 0.00020265765765765763, "loss": 0.4247, "step": 2171 }, { "epoch": 0.6511523327712198, "grad_norm": 0.24580539762973785, "learning_rate": 0.0002026126126126126, "loss": 0.4356, "step": 2172 }, { "epoch": 0.6514521266629193, "grad_norm": 0.2560003101825714, "learning_rate": 0.00020256756756756756, "loss": 0.4019, "step": 2173 }, { "epoch": 0.6517519205546187, "grad_norm": 0.2403692603111267, "learning_rate": 0.0002025225225225225, "loss": 0.4088, "step": 2174 }, { "epoch": 0.6520517144463182, "grad_norm": 0.25952261686325073, "learning_rate": 0.00020247747747747746, "loss": 0.4585, "step": 2175 }, { "epoch": 0.6523515083380176, "grad_norm": 0.2858710289001465, "learning_rate": 0.00020243243243243242, "loss": 0.4616, "step": 2176 }, { "epoch": 0.6526513022297171, "grad_norm": 0.2761286795139313, "learning_rate": 0.00020238738738738736, "loss": 0.4694, "step": 2177 }, { "epoch": 0.6529510961214166, "grad_norm": 0.2620023190975189, "learning_rate": 0.00020234234234234232, "loss": 0.4441, "step": 2178 }, { "epoch": 0.653250890013116, "grad_norm": 0.2743069529533386, "learning_rate": 0.00020229729729729729, "loss": 0.4433, "step": 2179 }, { "epoch": 0.6535506839048154, "grad_norm": 0.25842079520225525, "learning_rate": 0.00020225225225225222, "loss": 0.475, "step": 2180 }, { "epoch": 0.653850477796515, "grad_norm": 0.267971932888031, "learning_rate": 0.0002022072072072072, "loss": 0.4384, "step": 2181 }, { "epoch": 0.6541502716882144, "grad_norm": 0.26229405403137207, "learning_rate": 0.00020216216216216215, "loss": 0.4525, "step": 2182 }, { "epoch": 0.6544500655799138, "grad_norm": 0.26667454838752747, "learning_rate": 0.0002021171171171171, "loss": 0.4384, "step": 2183 }, { "epoch": 0.6547498594716132, "grad_norm": 0.2867937982082367, "learning_rate": 0.00020207207207207208, "loss": 0.4575, "step": 2184 }, { "epoch": 0.6550496533633127, "grad_norm": 0.2772350013256073, "learning_rate": 0.00020202702702702701, "loss": 0.4515, "step": 2185 }, { "epoch": 0.6553494472550122, "grad_norm": 0.2599480450153351, "learning_rate": 0.00020198198198198195, "loss": 0.4614, "step": 2186 }, { "epoch": 0.6556492411467116, "grad_norm": 0.25755733251571655, "learning_rate": 0.00020193693693693694, "loss": 0.4543, "step": 2187 }, { "epoch": 0.6559490350384111, "grad_norm": 0.2630176246166229, "learning_rate": 0.00020189189189189188, "loss": 0.4317, "step": 2188 }, { "epoch": 0.6562488289301105, "grad_norm": 0.25234025716781616, "learning_rate": 0.00020184684684684682, "loss": 0.4187, "step": 2189 }, { "epoch": 0.65654862282181, "grad_norm": 0.2525855302810669, "learning_rate": 0.0002018018018018018, "loss": 0.4364, "step": 2190 }, { "epoch": 0.6568484167135095, "grad_norm": 0.2502008378505707, "learning_rate": 0.00020175675675675674, "loss": 0.4162, "step": 2191 }, { "epoch": 0.6571482106052089, "grad_norm": 0.24894630908966064, "learning_rate": 0.00020171171171171168, "loss": 0.4287, "step": 2192 }, { "epoch": 0.6574480044969083, "grad_norm": 0.24618011713027954, "learning_rate": 0.00020166666666666667, "loss": 0.4225, "step": 2193 }, { "epoch": 0.6577477983886079, "grad_norm": 0.3233836591243744, "learning_rate": 0.0002016216216216216, "loss": 0.5051, "step": 2194 }, { "epoch": 0.6580475922803073, "grad_norm": 0.2600880265235901, "learning_rate": 0.00020157657657657655, "loss": 0.4437, "step": 2195 }, { "epoch": 0.6583473861720067, "grad_norm": 0.2576145529747009, "learning_rate": 0.0002015315315315315, "loss": 0.4035, "step": 2196 }, { "epoch": 0.6586471800637062, "grad_norm": 0.2555258572101593, "learning_rate": 0.00020148648648648647, "loss": 0.409, "step": 2197 }, { "epoch": 0.6589469739554057, "grad_norm": 0.2691539525985718, "learning_rate": 0.0002014414414414414, "loss": 0.4281, "step": 2198 }, { "epoch": 0.6592467678471051, "grad_norm": 0.2896745502948761, "learning_rate": 0.00020139639639639637, "loss": 0.4529, "step": 2199 }, { "epoch": 0.6595465617388045, "grad_norm": 0.2579841911792755, "learning_rate": 0.00020135135135135134, "loss": 0.4479, "step": 2200 }, { "epoch": 0.659846355630504, "grad_norm": 0.24643990397453308, "learning_rate": 0.00020130630630630627, "loss": 0.4205, "step": 2201 }, { "epoch": 0.6601461495222035, "grad_norm": 0.2523176968097687, "learning_rate": 0.00020126126126126124, "loss": 0.428, "step": 2202 }, { "epoch": 0.660445943413903, "grad_norm": 0.27141350507736206, "learning_rate": 0.0002012162162162162, "loss": 0.4345, "step": 2203 }, { "epoch": 0.6607457373056024, "grad_norm": 0.2594592571258545, "learning_rate": 0.00020117117117117117, "loss": 0.428, "step": 2204 }, { "epoch": 0.6610455311973018, "grad_norm": 0.2673409581184387, "learning_rate": 0.0002011261261261261, "loss": 0.4229, "step": 2205 }, { "epoch": 0.6613453250890013, "grad_norm": 0.26660290360450745, "learning_rate": 0.00020108108108108107, "loss": 0.4635, "step": 2206 }, { "epoch": 0.6616451189807008, "grad_norm": 0.2599898874759674, "learning_rate": 0.00020103603603603603, "loss": 0.4398, "step": 2207 }, { "epoch": 0.6619449128724002, "grad_norm": 0.2786708474159241, "learning_rate": 0.00020099099099099097, "loss": 0.4221, "step": 2208 }, { "epoch": 0.6622447067640996, "grad_norm": 0.2445315271615982, "learning_rate": 0.00020094594594594593, "loss": 0.4143, "step": 2209 }, { "epoch": 0.6625445006557992, "grad_norm": 0.2615225315093994, "learning_rate": 0.0002009009009009009, "loss": 0.4388, "step": 2210 }, { "epoch": 0.6628442945474986, "grad_norm": 0.25724494457244873, "learning_rate": 0.00020085585585585583, "loss": 0.4359, "step": 2211 }, { "epoch": 0.663144088439198, "grad_norm": 0.2562429904937744, "learning_rate": 0.0002008108108108108, "loss": 0.4213, "step": 2212 }, { "epoch": 0.6634438823308975, "grad_norm": 0.25428321957588196, "learning_rate": 0.00020076576576576576, "loss": 0.4398, "step": 2213 }, { "epoch": 0.663743676222597, "grad_norm": 0.2503153383731842, "learning_rate": 0.0002007207207207207, "loss": 0.4266, "step": 2214 }, { "epoch": 0.6640434701142964, "grad_norm": 0.26181626319885254, "learning_rate": 0.00020067567567567566, "loss": 0.4398, "step": 2215 }, { "epoch": 0.6643432640059959, "grad_norm": 0.26600703597068787, "learning_rate": 0.00020063063063063062, "loss": 0.4418, "step": 2216 }, { "epoch": 0.6646430578976953, "grad_norm": 0.27476897835731506, "learning_rate": 0.00020058558558558556, "loss": 0.4405, "step": 2217 }, { "epoch": 0.6649428517893948, "grad_norm": 0.23994912207126617, "learning_rate": 0.0002005405405405405, "loss": 0.4171, "step": 2218 }, { "epoch": 0.6652426456810943, "grad_norm": 0.2647401988506317, "learning_rate": 0.0002004954954954955, "loss": 0.4514, "step": 2219 }, { "epoch": 0.6655424395727937, "grad_norm": 0.27040109038352966, "learning_rate": 0.00020045045045045043, "loss": 0.4665, "step": 2220 }, { "epoch": 0.6658422334644931, "grad_norm": 0.252128005027771, "learning_rate": 0.00020040540540540536, "loss": 0.4316, "step": 2221 }, { "epoch": 0.6661420273561927, "grad_norm": 0.2605067193508148, "learning_rate": 0.00020036036036036035, "loss": 0.4242, "step": 2222 }, { "epoch": 0.6664418212478921, "grad_norm": 0.26456427574157715, "learning_rate": 0.0002003153153153153, "loss": 0.4379, "step": 2223 }, { "epoch": 0.6667416151395915, "grad_norm": 0.24853096902370453, "learning_rate": 0.00020027027027027023, "loss": 0.4339, "step": 2224 }, { "epoch": 0.6670414090312909, "grad_norm": 0.2573966979980469, "learning_rate": 0.00020022522522522522, "loss": 0.4427, "step": 2225 }, { "epoch": 0.6673412029229905, "grad_norm": 0.2402806282043457, "learning_rate": 0.00020018018018018015, "loss": 0.4195, "step": 2226 }, { "epoch": 0.6676409968146899, "grad_norm": 0.2738892138004303, "learning_rate": 0.00020013513513513512, "loss": 0.4374, "step": 2227 }, { "epoch": 0.6679407907063893, "grad_norm": 0.2548046410083771, "learning_rate": 0.00020009009009009008, "loss": 0.4312, "step": 2228 }, { "epoch": 0.6682405845980888, "grad_norm": 0.24718345701694489, "learning_rate": 0.00020004504504504502, "loss": 0.4533, "step": 2229 }, { "epoch": 0.6685403784897883, "grad_norm": 0.2544947862625122, "learning_rate": 0.00019999999999999998, "loss": 0.4255, "step": 2230 }, { "epoch": 0.6688401723814877, "grad_norm": 0.2689761817455292, "learning_rate": 0.00019995495495495495, "loss": 0.451, "step": 2231 }, { "epoch": 0.6691399662731872, "grad_norm": 0.2824248969554901, "learning_rate": 0.00019990990990990988, "loss": 0.4374, "step": 2232 }, { "epoch": 0.6694397601648866, "grad_norm": 0.25299206376075745, "learning_rate": 0.00019986486486486485, "loss": 0.4301, "step": 2233 }, { "epoch": 0.6697395540565861, "grad_norm": 0.26550740003585815, "learning_rate": 0.0001998198198198198, "loss": 0.4332, "step": 2234 }, { "epoch": 0.6700393479482856, "grad_norm": 0.2446885108947754, "learning_rate": 0.00019977477477477475, "loss": 0.4207, "step": 2235 }, { "epoch": 0.670339141839985, "grad_norm": 0.23779337108135223, "learning_rate": 0.0001997297297297297, "loss": 0.432, "step": 2236 }, { "epoch": 0.6706389357316844, "grad_norm": 0.2619169354438782, "learning_rate": 0.00019968468468468468, "loss": 0.4332, "step": 2237 }, { "epoch": 0.670938729623384, "grad_norm": 0.258789598941803, "learning_rate": 0.00019963963963963964, "loss": 0.4328, "step": 2238 }, { "epoch": 0.6712385235150834, "grad_norm": 0.27282267808914185, "learning_rate": 0.00019959459459459458, "loss": 0.4423, "step": 2239 }, { "epoch": 0.6715383174067828, "grad_norm": 0.2593368887901306, "learning_rate": 0.00019954954954954954, "loss": 0.4473, "step": 2240 }, { "epoch": 0.6718381112984823, "grad_norm": 0.25508931279182434, "learning_rate": 0.0001995045045045045, "loss": 0.4371, "step": 2241 }, { "epoch": 0.6721379051901818, "grad_norm": 0.25891220569610596, "learning_rate": 0.00019945945945945944, "loss": 0.4143, "step": 2242 }, { "epoch": 0.6724376990818812, "grad_norm": 0.2661759853363037, "learning_rate": 0.00019941441441441438, "loss": 0.4585, "step": 2243 }, { "epoch": 0.6727374929735807, "grad_norm": 0.25630587339401245, "learning_rate": 0.00019936936936936937, "loss": 0.4442, "step": 2244 }, { "epoch": 0.6730372868652801, "grad_norm": 0.27050697803497314, "learning_rate": 0.0001993243243243243, "loss": 0.4437, "step": 2245 }, { "epoch": 0.6733370807569796, "grad_norm": 0.2564866244792938, "learning_rate": 0.00019927927927927924, "loss": 0.4352, "step": 2246 }, { "epoch": 0.673636874648679, "grad_norm": 0.2438693344593048, "learning_rate": 0.00019923423423423423, "loss": 0.4138, "step": 2247 }, { "epoch": 0.6739366685403785, "grad_norm": 0.26038050651550293, "learning_rate": 0.00019918918918918917, "loss": 0.4309, "step": 2248 }, { "epoch": 0.6742364624320779, "grad_norm": 0.2429644614458084, "learning_rate": 0.0001991441441441441, "loss": 0.4036, "step": 2249 }, { "epoch": 0.6745362563237775, "grad_norm": 0.24670763313770294, "learning_rate": 0.0001990990990990991, "loss": 0.4054, "step": 2250 }, { "epoch": 0.6748360502154769, "grad_norm": 0.2641808092594147, "learning_rate": 0.00019905405405405403, "loss": 0.4431, "step": 2251 }, { "epoch": 0.6751358441071763, "grad_norm": 0.2601335644721985, "learning_rate": 0.00019900900900900897, "loss": 0.403, "step": 2252 }, { "epoch": 0.6754356379988757, "grad_norm": 0.2698741555213928, "learning_rate": 0.00019896396396396396, "loss": 0.4328, "step": 2253 }, { "epoch": 0.6757354318905753, "grad_norm": 0.26246562600135803, "learning_rate": 0.0001989189189189189, "loss": 0.4264, "step": 2254 }, { "epoch": 0.6760352257822747, "grad_norm": 0.26453712582588196, "learning_rate": 0.00019887387387387384, "loss": 0.4552, "step": 2255 }, { "epoch": 0.6763350196739741, "grad_norm": 0.24887052178382874, "learning_rate": 0.00019882882882882883, "loss": 0.3918, "step": 2256 }, { "epoch": 0.6766348135656736, "grad_norm": 0.26789602637290955, "learning_rate": 0.00019878378378378376, "loss": 0.4416, "step": 2257 }, { "epoch": 0.6769346074573731, "grad_norm": 0.24345183372497559, "learning_rate": 0.0001987387387387387, "loss": 0.4355, "step": 2258 }, { "epoch": 0.6772344013490725, "grad_norm": 0.2475103735923767, "learning_rate": 0.0001986936936936937, "loss": 0.4277, "step": 2259 }, { "epoch": 0.677534195240772, "grad_norm": 0.2740587294101715, "learning_rate": 0.00019864864864864863, "loss": 0.4536, "step": 2260 }, { "epoch": 0.6778339891324714, "grad_norm": 0.23657365143299103, "learning_rate": 0.0001986036036036036, "loss": 0.3887, "step": 2261 }, { "epoch": 0.6781337830241709, "grad_norm": 0.267630398273468, "learning_rate": 0.00019855855855855856, "loss": 0.443, "step": 2262 }, { "epoch": 0.6784335769158704, "grad_norm": 0.2708898186683655, "learning_rate": 0.0001985135135135135, "loss": 0.4398, "step": 2263 }, { "epoch": 0.6787333708075698, "grad_norm": 0.26607415080070496, "learning_rate": 0.00019846846846846846, "loss": 0.4249, "step": 2264 }, { "epoch": 0.6790331646992692, "grad_norm": 0.2398756742477417, "learning_rate": 0.00019842342342342342, "loss": 0.419, "step": 2265 }, { "epoch": 0.6793329585909688, "grad_norm": 0.2509295344352722, "learning_rate": 0.00019837837837837836, "loss": 0.3858, "step": 2266 }, { "epoch": 0.6796327524826682, "grad_norm": 0.30269870162010193, "learning_rate": 0.00019833333333333332, "loss": 0.4564, "step": 2267 }, { "epoch": 0.6799325463743676, "grad_norm": 0.2576700448989868, "learning_rate": 0.00019828828828828826, "loss": 0.4296, "step": 2268 }, { "epoch": 0.680232340266067, "grad_norm": 0.29139164090156555, "learning_rate": 0.00019824324324324322, "loss": 0.4583, "step": 2269 }, { "epoch": 0.6805321341577666, "grad_norm": 0.2578124701976776, "learning_rate": 0.00019819819819819818, "loss": 0.4419, "step": 2270 }, { "epoch": 0.680831928049466, "grad_norm": 0.2546633780002594, "learning_rate": 0.00019815315315315312, "loss": 0.4311, "step": 2271 }, { "epoch": 0.6811317219411654, "grad_norm": 0.293409526348114, "learning_rate": 0.00019810810810810809, "loss": 0.4756, "step": 2272 }, { "epoch": 0.6814315158328649, "grad_norm": 0.249635249376297, "learning_rate": 0.00019806306306306305, "loss": 0.4434, "step": 2273 }, { "epoch": 0.6817313097245644, "grad_norm": 0.2729664146900177, "learning_rate": 0.00019801801801801799, "loss": 0.4721, "step": 2274 }, { "epoch": 0.6820311036162638, "grad_norm": 0.24961845576763153, "learning_rate": 0.00019797297297297298, "loss": 0.4124, "step": 2275 }, { "epoch": 0.6823308975079633, "grad_norm": 0.26508617401123047, "learning_rate": 0.00019792792792792791, "loss": 0.4311, "step": 2276 }, { "epoch": 0.6826306913996627, "grad_norm": 0.24888217449188232, "learning_rate": 0.00019788288288288285, "loss": 0.4334, "step": 2277 }, { "epoch": 0.6829304852913622, "grad_norm": 0.2550651431083679, "learning_rate": 0.00019783783783783784, "loss": 0.4289, "step": 2278 }, { "epoch": 0.6832302791830617, "grad_norm": 0.25816190242767334, "learning_rate": 0.00019779279279279278, "loss": 0.4425, "step": 2279 }, { "epoch": 0.6835300730747611, "grad_norm": 0.25145018100738525, "learning_rate": 0.00019774774774774772, "loss": 0.4123, "step": 2280 }, { "epoch": 0.6838298669664605, "grad_norm": 0.24678850173950195, "learning_rate": 0.0001977027027027027, "loss": 0.4309, "step": 2281 }, { "epoch": 0.68412966085816, "grad_norm": 0.2629925012588501, "learning_rate": 0.00019765765765765764, "loss": 0.4184, "step": 2282 }, { "epoch": 0.6844294547498595, "grad_norm": 0.2568414807319641, "learning_rate": 0.00019761261261261258, "loss": 0.4164, "step": 2283 }, { "epoch": 0.6847292486415589, "grad_norm": 0.25906744599342346, "learning_rate": 0.00019756756756756757, "loss": 0.4547, "step": 2284 }, { "epoch": 0.6850290425332584, "grad_norm": 0.2697434723377228, "learning_rate": 0.0001975225225225225, "loss": 0.4444, "step": 2285 }, { "epoch": 0.6853288364249578, "grad_norm": 0.2573794424533844, "learning_rate": 0.00019747747747747744, "loss": 0.4309, "step": 2286 }, { "epoch": 0.6856286303166573, "grad_norm": 0.2532881796360016, "learning_rate": 0.00019743243243243244, "loss": 0.4172, "step": 2287 }, { "epoch": 0.6859284242083568, "grad_norm": 0.253292977809906, "learning_rate": 0.00019738738738738737, "loss": 0.4305, "step": 2288 }, { "epoch": 0.6862282181000562, "grad_norm": 0.246769517660141, "learning_rate": 0.0001973423423423423, "loss": 0.4106, "step": 2289 }, { "epoch": 0.6865280119917556, "grad_norm": 0.2593647539615631, "learning_rate": 0.0001972972972972973, "loss": 0.4537, "step": 2290 }, { "epoch": 0.6868278058834552, "grad_norm": 0.25611796975135803, "learning_rate": 0.00019725225225225224, "loss": 0.43, "step": 2291 }, { "epoch": 0.6871275997751546, "grad_norm": 0.25119319558143616, "learning_rate": 0.00019720720720720717, "loss": 0.4142, "step": 2292 }, { "epoch": 0.687427393666854, "grad_norm": 0.250675231218338, "learning_rate": 0.00019716216216216214, "loss": 0.4134, "step": 2293 }, { "epoch": 0.6877271875585534, "grad_norm": 0.2680164873600006, "learning_rate": 0.0001971171171171171, "loss": 0.4201, "step": 2294 }, { "epoch": 0.688026981450253, "grad_norm": 0.26599201560020447, "learning_rate": 0.00019707207207207206, "loss": 0.45, "step": 2295 }, { "epoch": 0.6883267753419524, "grad_norm": 0.24248278141021729, "learning_rate": 0.000197027027027027, "loss": 0.4129, "step": 2296 }, { "epoch": 0.6886265692336518, "grad_norm": 0.25668129324913025, "learning_rate": 0.00019698198198198197, "loss": 0.4354, "step": 2297 }, { "epoch": 0.6889263631253513, "grad_norm": 0.26304370164871216, "learning_rate": 0.00019693693693693693, "loss": 0.4423, "step": 2298 }, { "epoch": 0.6892261570170508, "grad_norm": 0.2509578466415405, "learning_rate": 0.00019689189189189187, "loss": 0.4263, "step": 2299 }, { "epoch": 0.6895259509087502, "grad_norm": 0.2629247009754181, "learning_rate": 0.00019684684684684683, "loss": 0.4323, "step": 2300 }, { "epoch": 0.6898257448004497, "grad_norm": 0.24706493318080902, "learning_rate": 0.0001968018018018018, "loss": 0.3913, "step": 2301 }, { "epoch": 0.6901255386921491, "grad_norm": 0.29551559686660767, "learning_rate": 0.00019675675675675673, "loss": 0.419, "step": 2302 }, { "epoch": 0.6904253325838486, "grad_norm": 0.2612929046154022, "learning_rate": 0.0001967117117117117, "loss": 0.4223, "step": 2303 }, { "epoch": 0.6907251264755481, "grad_norm": 0.28399109840393066, "learning_rate": 0.00019666666666666666, "loss": 0.4715, "step": 2304 }, { "epoch": 0.6910249203672475, "grad_norm": 0.24555319547653198, "learning_rate": 0.0001966216216216216, "loss": 0.4559, "step": 2305 }, { "epoch": 0.6913247142589469, "grad_norm": 0.2576359808444977, "learning_rate": 0.00019657657657657656, "loss": 0.4314, "step": 2306 }, { "epoch": 0.6916245081506465, "grad_norm": 0.25595325231552124, "learning_rate": 0.00019653153153153152, "loss": 0.4286, "step": 2307 }, { "epoch": 0.6919243020423459, "grad_norm": 0.23903168737888336, "learning_rate": 0.00019648648648648646, "loss": 0.4259, "step": 2308 }, { "epoch": 0.6922240959340453, "grad_norm": 0.2797984480857849, "learning_rate": 0.00019644144144144145, "loss": 0.4621, "step": 2309 }, { "epoch": 0.6925238898257448, "grad_norm": 0.25375935435295105, "learning_rate": 0.0001963963963963964, "loss": 0.4356, "step": 2310 }, { "epoch": 0.6928236837174443, "grad_norm": 0.2765314280986786, "learning_rate": 0.00019635135135135132, "loss": 0.4474, "step": 2311 }, { "epoch": 0.6931234776091437, "grad_norm": 0.23902222514152527, "learning_rate": 0.00019630630630630632, "loss": 0.4176, "step": 2312 }, { "epoch": 0.6934232715008432, "grad_norm": 0.278622031211853, "learning_rate": 0.00019626126126126125, "loss": 0.4177, "step": 2313 }, { "epoch": 0.6937230653925426, "grad_norm": 0.25161993503570557, "learning_rate": 0.0001962162162162162, "loss": 0.4235, "step": 2314 }, { "epoch": 0.6940228592842421, "grad_norm": 0.28174108266830444, "learning_rate": 0.00019617117117117113, "loss": 0.4469, "step": 2315 }, { "epoch": 0.6943226531759416, "grad_norm": 0.24297156929969788, "learning_rate": 0.00019612612612612612, "loss": 0.4348, "step": 2316 }, { "epoch": 0.694622447067641, "grad_norm": 0.2582569122314453, "learning_rate": 0.00019608108108108105, "loss": 0.4373, "step": 2317 }, { "epoch": 0.6949222409593404, "grad_norm": 0.2808705270290375, "learning_rate": 0.00019603603603603602, "loss": 0.4574, "step": 2318 }, { "epoch": 0.69522203485104, "grad_norm": 0.27071914076805115, "learning_rate": 0.00019599099099099098, "loss": 0.4449, "step": 2319 }, { "epoch": 0.6955218287427394, "grad_norm": 0.27735450863838196, "learning_rate": 0.00019594594594594592, "loss": 0.4154, "step": 2320 }, { "epoch": 0.6958216226344388, "grad_norm": 0.25535905361175537, "learning_rate": 0.00019590090090090088, "loss": 0.4432, "step": 2321 }, { "epoch": 0.6961214165261382, "grad_norm": 0.24208863079547882, "learning_rate": 0.00019585585585585585, "loss": 0.4248, "step": 2322 }, { "epoch": 0.6964212104178378, "grad_norm": 0.26040393114089966, "learning_rate": 0.00019581081081081078, "loss": 0.4215, "step": 2323 }, { "epoch": 0.6967210043095372, "grad_norm": 0.24389687180519104, "learning_rate": 0.00019576576576576575, "loss": 0.422, "step": 2324 }, { "epoch": 0.6970207982012366, "grad_norm": 0.2545843720436096, "learning_rate": 0.0001957207207207207, "loss": 0.4384, "step": 2325 }, { "epoch": 0.6973205920929361, "grad_norm": 0.2566373348236084, "learning_rate": 0.00019567567567567565, "loss": 0.4383, "step": 2326 }, { "epoch": 0.6976203859846356, "grad_norm": 0.2538570463657379, "learning_rate": 0.0001956306306306306, "loss": 0.4285, "step": 2327 }, { "epoch": 0.697920179876335, "grad_norm": 0.25821006298065186, "learning_rate": 0.00019558558558558557, "loss": 0.4304, "step": 2328 }, { "epoch": 0.6982199737680345, "grad_norm": 0.26139143109321594, "learning_rate": 0.0001955405405405405, "loss": 0.4416, "step": 2329 }, { "epoch": 0.6985197676597339, "grad_norm": 0.2557656168937683, "learning_rate": 0.00019549549549549548, "loss": 0.4166, "step": 2330 }, { "epoch": 0.6988195615514334, "grad_norm": 0.2611480951309204, "learning_rate": 0.00019545045045045044, "loss": 0.4463, "step": 2331 }, { "epoch": 0.6991193554431329, "grad_norm": 0.24384301900863647, "learning_rate": 0.0001954054054054054, "loss": 0.4027, "step": 2332 }, { "epoch": 0.6994191493348323, "grad_norm": 0.2693532407283783, "learning_rate": 0.00019536036036036034, "loss": 0.4449, "step": 2333 }, { "epoch": 0.6997189432265317, "grad_norm": 0.2669787108898163, "learning_rate": 0.0001953153153153153, "loss": 0.4266, "step": 2334 }, { "epoch": 0.7000187371182313, "grad_norm": 0.23384937644004822, "learning_rate": 0.00019527027027027027, "loss": 0.4103, "step": 2335 }, { "epoch": 0.7003185310099307, "grad_norm": 0.2738743722438812, "learning_rate": 0.0001952252252252252, "loss": 0.4664, "step": 2336 }, { "epoch": 0.7006183249016301, "grad_norm": 0.2557884752750397, "learning_rate": 0.00019518018018018017, "loss": 0.4121, "step": 2337 }, { "epoch": 0.7009181187933295, "grad_norm": 0.24830694496631622, "learning_rate": 0.00019513513513513513, "loss": 0.4, "step": 2338 }, { "epoch": 0.7012179126850291, "grad_norm": 0.2636083960533142, "learning_rate": 0.00019509009009009007, "loss": 0.4418, "step": 2339 }, { "epoch": 0.7015177065767285, "grad_norm": 0.252029687166214, "learning_rate": 0.000195045045045045, "loss": 0.4205, "step": 2340 }, { "epoch": 0.701817500468428, "grad_norm": 0.25448256731033325, "learning_rate": 0.000195, "loss": 0.4517, "step": 2341 }, { "epoch": 0.7021172943601274, "grad_norm": 0.26609039306640625, "learning_rate": 0.00019495495495495493, "loss": 0.4213, "step": 2342 }, { "epoch": 0.7024170882518269, "grad_norm": 0.2746337652206421, "learning_rate": 0.00019490990990990987, "loss": 0.4183, "step": 2343 }, { "epoch": 0.7027168821435263, "grad_norm": 0.2514724135398865, "learning_rate": 0.00019486486486486486, "loss": 0.4329, "step": 2344 }, { "epoch": 0.7030166760352258, "grad_norm": 0.27683892846107483, "learning_rate": 0.0001948198198198198, "loss": 0.4207, "step": 2345 }, { "epoch": 0.7033164699269252, "grad_norm": 0.2525181174278259, "learning_rate": 0.00019477477477477473, "loss": 0.4193, "step": 2346 }, { "epoch": 0.7036162638186247, "grad_norm": 0.2759072482585907, "learning_rate": 0.00019472972972972973, "loss": 0.4508, "step": 2347 }, { "epoch": 0.7039160577103242, "grad_norm": 0.2594849169254303, "learning_rate": 0.00019468468468468466, "loss": 0.4289, "step": 2348 }, { "epoch": 0.7042158516020236, "grad_norm": 0.26971113681793213, "learning_rate": 0.0001946396396396396, "loss": 0.4514, "step": 2349 }, { "epoch": 0.704515645493723, "grad_norm": 0.25291457772254944, "learning_rate": 0.0001945945945945946, "loss": 0.4664, "step": 2350 }, { "epoch": 0.7048154393854226, "grad_norm": 0.2617851495742798, "learning_rate": 0.00019454954954954953, "loss": 0.4301, "step": 2351 }, { "epoch": 0.705115233277122, "grad_norm": 0.24216975271701813, "learning_rate": 0.00019450450450450446, "loss": 0.4144, "step": 2352 }, { "epoch": 0.7054150271688214, "grad_norm": 0.2737904489040375, "learning_rate": 0.00019445945945945945, "loss": 0.417, "step": 2353 }, { "epoch": 0.7057148210605209, "grad_norm": 0.27587682008743286, "learning_rate": 0.0001944144144144144, "loss": 0.4421, "step": 2354 }, { "epoch": 0.7060146149522204, "grad_norm": 0.24917447566986084, "learning_rate": 0.00019436936936936936, "loss": 0.406, "step": 2355 }, { "epoch": 0.7063144088439198, "grad_norm": 0.27958497405052185, "learning_rate": 0.00019432432432432432, "loss": 0.4591, "step": 2356 }, { "epoch": 0.7066142027356193, "grad_norm": 0.27273818850517273, "learning_rate": 0.00019427927927927926, "loss": 0.4219, "step": 2357 }, { "epoch": 0.7069139966273187, "grad_norm": 0.24517607688903809, "learning_rate": 0.00019423423423423422, "loss": 0.4063, "step": 2358 }, { "epoch": 0.7072137905190182, "grad_norm": 0.28854820132255554, "learning_rate": 0.00019418918918918918, "loss": 0.4383, "step": 2359 }, { "epoch": 0.7075135844107177, "grad_norm": 0.27321329712867737, "learning_rate": 0.00019414414414414412, "loss": 0.4479, "step": 2360 }, { "epoch": 0.7078133783024171, "grad_norm": 0.2749727666378021, "learning_rate": 0.00019409909909909908, "loss": 0.4272, "step": 2361 }, { "epoch": 0.7081131721941165, "grad_norm": 0.27384045720100403, "learning_rate": 0.00019405405405405405, "loss": 0.444, "step": 2362 }, { "epoch": 0.708412966085816, "grad_norm": 0.2604135274887085, "learning_rate": 0.00019400900900900898, "loss": 0.4248, "step": 2363 }, { "epoch": 0.7087127599775155, "grad_norm": 0.2598932385444641, "learning_rate": 0.00019396396396396395, "loss": 0.4401, "step": 2364 }, { "epoch": 0.7090125538692149, "grad_norm": 0.253755122423172, "learning_rate": 0.00019391891891891889, "loss": 0.4316, "step": 2365 }, { "epoch": 0.7093123477609143, "grad_norm": 0.2677047848701477, "learning_rate": 0.00019387387387387388, "loss": 0.4264, "step": 2366 }, { "epoch": 0.7096121416526139, "grad_norm": 0.24191899597644806, "learning_rate": 0.0001938288288288288, "loss": 0.4172, "step": 2367 }, { "epoch": 0.7099119355443133, "grad_norm": 0.2684822082519531, "learning_rate": 0.00019378378378378375, "loss": 0.4466, "step": 2368 }, { "epoch": 0.7102117294360127, "grad_norm": 0.2859225273132324, "learning_rate": 0.00019373873873873874, "loss": 0.4348, "step": 2369 }, { "epoch": 0.7105115233277122, "grad_norm": 0.2676656246185303, "learning_rate": 0.00019369369369369368, "loss": 0.4322, "step": 2370 }, { "epoch": 0.7108113172194117, "grad_norm": 0.27145159244537354, "learning_rate": 0.00019364864864864861, "loss": 0.4485, "step": 2371 }, { "epoch": 0.7111111111111111, "grad_norm": 0.2646559178829193, "learning_rate": 0.0001936036036036036, "loss": 0.4255, "step": 2372 }, { "epoch": 0.7114109050028106, "grad_norm": 0.26495832204818726, "learning_rate": 0.00019355855855855854, "loss": 0.4681, "step": 2373 }, { "epoch": 0.71171069889451, "grad_norm": 0.26360276341438293, "learning_rate": 0.00019351351351351348, "loss": 0.4219, "step": 2374 }, { "epoch": 0.7120104927862094, "grad_norm": 0.25372758507728577, "learning_rate": 0.00019346846846846847, "loss": 0.4542, "step": 2375 }, { "epoch": 0.712310286677909, "grad_norm": 0.25859972834587097, "learning_rate": 0.0001934234234234234, "loss": 0.4121, "step": 2376 }, { "epoch": 0.7126100805696084, "grad_norm": 0.2402067929506302, "learning_rate": 0.00019337837837837834, "loss": 0.4132, "step": 2377 }, { "epoch": 0.7129098744613078, "grad_norm": 0.25183209776878357, "learning_rate": 0.00019333333333333333, "loss": 0.4106, "step": 2378 }, { "epoch": 0.7132096683530073, "grad_norm": 0.24842600524425507, "learning_rate": 0.00019328828828828827, "loss": 0.3895, "step": 2379 }, { "epoch": 0.7135094622447068, "grad_norm": 0.2635684609413147, "learning_rate": 0.0001932432432432432, "loss": 0.4112, "step": 2380 }, { "epoch": 0.7138092561364062, "grad_norm": 0.2578394412994385, "learning_rate": 0.0001931981981981982, "loss": 0.4429, "step": 2381 }, { "epoch": 0.7141090500281057, "grad_norm": 0.2912173867225647, "learning_rate": 0.00019315315315315314, "loss": 0.4256, "step": 2382 }, { "epoch": 0.7144088439198051, "grad_norm": 0.2592369019985199, "learning_rate": 0.00019310810810810807, "loss": 0.4357, "step": 2383 }, { "epoch": 0.7147086378115046, "grad_norm": 0.25853514671325684, "learning_rate": 0.00019306306306306306, "loss": 0.3816, "step": 2384 }, { "epoch": 0.715008431703204, "grad_norm": 0.26601412892341614, "learning_rate": 0.000193018018018018, "loss": 0.4344, "step": 2385 }, { "epoch": 0.7153082255949035, "grad_norm": 0.2789902091026306, "learning_rate": 0.00019297297297297294, "loss": 0.435, "step": 2386 }, { "epoch": 0.7156080194866029, "grad_norm": 0.285659521818161, "learning_rate": 0.0001929279279279279, "loss": 0.4194, "step": 2387 }, { "epoch": 0.7159078133783024, "grad_norm": 0.24971207976341248, "learning_rate": 0.00019288288288288286, "loss": 0.4322, "step": 2388 }, { "epoch": 0.7162076072700019, "grad_norm": 0.30142828822135925, "learning_rate": 0.00019283783783783783, "loss": 0.415, "step": 2389 }, { "epoch": 0.7165074011617013, "grad_norm": 0.2843458950519562, "learning_rate": 0.00019279279279279277, "loss": 0.4605, "step": 2390 }, { "epoch": 0.7168071950534007, "grad_norm": 0.24795542657375336, "learning_rate": 0.00019274774774774773, "loss": 0.4147, "step": 2391 }, { "epoch": 0.7171069889451003, "grad_norm": 0.2908228635787964, "learning_rate": 0.0001927027027027027, "loss": 0.4573, "step": 2392 }, { "epoch": 0.7174067828367997, "grad_norm": 0.2770592272281647, "learning_rate": 0.00019265765765765763, "loss": 0.4248, "step": 2393 }, { "epoch": 0.7177065767284991, "grad_norm": 0.2638701796531677, "learning_rate": 0.0001926126126126126, "loss": 0.4409, "step": 2394 }, { "epoch": 0.7180063706201986, "grad_norm": 0.2952423393726349, "learning_rate": 0.00019256756756756756, "loss": 0.4651, "step": 2395 }, { "epoch": 0.7183061645118981, "grad_norm": 0.2776016294956207, "learning_rate": 0.0001925225225225225, "loss": 0.4453, "step": 2396 }, { "epoch": 0.7186059584035975, "grad_norm": 0.2560594081878662, "learning_rate": 0.00019247747747747746, "loss": 0.4441, "step": 2397 }, { "epoch": 0.718905752295297, "grad_norm": 0.2397725135087967, "learning_rate": 0.00019243243243243242, "loss": 0.3933, "step": 2398 }, { "epoch": 0.7192055461869964, "grad_norm": 0.2630067467689514, "learning_rate": 0.00019238738738738736, "loss": 0.4334, "step": 2399 }, { "epoch": 0.7195053400786959, "grad_norm": 0.2648056149482727, "learning_rate": 0.00019234234234234235, "loss": 0.4456, "step": 2400 }, { "epoch": 0.7198051339703954, "grad_norm": 0.27124911546707153, "learning_rate": 0.0001922972972972973, "loss": 0.4324, "step": 2401 }, { "epoch": 0.7201049278620948, "grad_norm": 0.2809448540210724, "learning_rate": 0.00019225225225225222, "loss": 0.4419, "step": 2402 }, { "epoch": 0.7204047217537942, "grad_norm": 0.2577565610408783, "learning_rate": 0.00019220720720720721, "loss": 0.441, "step": 2403 }, { "epoch": 0.7207045156454938, "grad_norm": 0.261111855506897, "learning_rate": 0.00019216216216216215, "loss": 0.4234, "step": 2404 }, { "epoch": 0.7210043095371932, "grad_norm": 0.2598218321800232, "learning_rate": 0.0001921171171171171, "loss": 0.4183, "step": 2405 }, { "epoch": 0.7213041034288926, "grad_norm": 0.2712027132511139, "learning_rate": 0.00019207207207207208, "loss": 0.4354, "step": 2406 }, { "epoch": 0.721603897320592, "grad_norm": 0.27912774682044983, "learning_rate": 0.00019202702702702702, "loss": 0.4594, "step": 2407 }, { "epoch": 0.7219036912122916, "grad_norm": 0.25328657031059265, "learning_rate": 0.00019198198198198195, "loss": 0.408, "step": 2408 }, { "epoch": 0.722203485103991, "grad_norm": 0.2425694465637207, "learning_rate": 0.00019193693693693694, "loss": 0.4197, "step": 2409 }, { "epoch": 0.7225032789956904, "grad_norm": 0.2552039623260498, "learning_rate": 0.00019189189189189188, "loss": 0.4383, "step": 2410 }, { "epoch": 0.7228030728873899, "grad_norm": 0.26128917932510376, "learning_rate": 0.00019184684684684682, "loss": 0.4204, "step": 2411 }, { "epoch": 0.7231028667790894, "grad_norm": 0.2713935077190399, "learning_rate": 0.00019180180180180178, "loss": 0.4325, "step": 2412 }, { "epoch": 0.7234026606707888, "grad_norm": 0.257618248462677, "learning_rate": 0.00019175675675675674, "loss": 0.4443, "step": 2413 }, { "epoch": 0.7237024545624883, "grad_norm": 0.2597353458404541, "learning_rate": 0.00019171171171171168, "loss": 0.4415, "step": 2414 }, { "epoch": 0.7240022484541877, "grad_norm": 0.26509061455726624, "learning_rate": 0.00019166666666666665, "loss": 0.4711, "step": 2415 }, { "epoch": 0.7243020423458872, "grad_norm": 0.26354658603668213, "learning_rate": 0.0001916216216216216, "loss": 0.4341, "step": 2416 }, { "epoch": 0.7246018362375867, "grad_norm": 0.2364490032196045, "learning_rate": 0.00019157657657657655, "loss": 0.4016, "step": 2417 }, { "epoch": 0.7249016301292861, "grad_norm": 0.24982942640781403, "learning_rate": 0.0001915315315315315, "loss": 0.427, "step": 2418 }, { "epoch": 0.7252014240209855, "grad_norm": 0.27166748046875, "learning_rate": 0.00019148648648648647, "loss": 0.4807, "step": 2419 }, { "epoch": 0.7255012179126851, "grad_norm": 0.24789117276668549, "learning_rate": 0.0001914414414414414, "loss": 0.4265, "step": 2420 }, { "epoch": 0.7258010118043845, "grad_norm": 0.2433491349220276, "learning_rate": 0.00019139639639639637, "loss": 0.4405, "step": 2421 }, { "epoch": 0.7261008056960839, "grad_norm": 0.24121679365634918, "learning_rate": 0.00019135135135135134, "loss": 0.4215, "step": 2422 }, { "epoch": 0.7264005995877834, "grad_norm": 0.25895169377326965, "learning_rate": 0.0001913063063063063, "loss": 0.4168, "step": 2423 }, { "epoch": 0.7267003934794829, "grad_norm": 0.24981217086315155, "learning_rate": 0.00019126126126126124, "loss": 0.4268, "step": 2424 }, { "epoch": 0.7270001873711823, "grad_norm": 0.25490307807922363, "learning_rate": 0.0001912162162162162, "loss": 0.4464, "step": 2425 }, { "epoch": 0.7272999812628818, "grad_norm": 0.2552802562713623, "learning_rate": 0.00019117117117117117, "loss": 0.4461, "step": 2426 }, { "epoch": 0.7275997751545812, "grad_norm": 0.27454614639282227, "learning_rate": 0.0001911261261261261, "loss": 0.4683, "step": 2427 }, { "epoch": 0.7278995690462807, "grad_norm": 0.2501683831214905, "learning_rate": 0.00019108108108108107, "loss": 0.4244, "step": 2428 }, { "epoch": 0.7281993629379802, "grad_norm": 0.24820026755332947, "learning_rate": 0.00019103603603603603, "loss": 0.4416, "step": 2429 }, { "epoch": 0.7284991568296796, "grad_norm": 0.25755947828292847, "learning_rate": 0.00019099099099099097, "loss": 0.4377, "step": 2430 }, { "epoch": 0.728798950721379, "grad_norm": 0.268839031457901, "learning_rate": 0.00019094594594594593, "loss": 0.46, "step": 2431 }, { "epoch": 0.7290987446130786, "grad_norm": 0.2707115113735199, "learning_rate": 0.0001909009009009009, "loss": 0.4375, "step": 2432 }, { "epoch": 0.729398538504778, "grad_norm": 0.25406280159950256, "learning_rate": 0.00019085585585585583, "loss": 0.441, "step": 2433 }, { "epoch": 0.7296983323964774, "grad_norm": 0.2569238841533661, "learning_rate": 0.00019081081081081082, "loss": 0.4446, "step": 2434 }, { "epoch": 0.7299981262881768, "grad_norm": 0.2784389555454254, "learning_rate": 0.00019076576576576576, "loss": 0.4396, "step": 2435 }, { "epoch": 0.7302979201798764, "grad_norm": 0.25094011425971985, "learning_rate": 0.0001907207207207207, "loss": 0.4189, "step": 2436 }, { "epoch": 0.7305977140715758, "grad_norm": 0.2696321904659271, "learning_rate": 0.00019067567567567563, "loss": 0.4575, "step": 2437 }, { "epoch": 0.7308975079632752, "grad_norm": 0.2526487112045288, "learning_rate": 0.00019063063063063062, "loss": 0.4081, "step": 2438 }, { "epoch": 0.7311973018549747, "grad_norm": 0.2577098608016968, "learning_rate": 0.00019058558558558556, "loss": 0.4478, "step": 2439 }, { "epoch": 0.7314970957466742, "grad_norm": 0.26381629705429077, "learning_rate": 0.0001905405405405405, "loss": 0.4573, "step": 2440 }, { "epoch": 0.7317968896383736, "grad_norm": 0.26065248250961304, "learning_rate": 0.0001904954954954955, "loss": 0.4413, "step": 2441 }, { "epoch": 0.7320966835300731, "grad_norm": 0.24994462728500366, "learning_rate": 0.00019045045045045043, "loss": 0.4016, "step": 2442 }, { "epoch": 0.7323964774217725, "grad_norm": 0.2742599844932556, "learning_rate": 0.00019040540540540536, "loss": 0.4578, "step": 2443 }, { "epoch": 0.732696271313472, "grad_norm": 0.23844504356384277, "learning_rate": 0.00019036036036036035, "loss": 0.4078, "step": 2444 }, { "epoch": 0.7329960652051715, "grad_norm": 0.2562139630317688, "learning_rate": 0.0001903153153153153, "loss": 0.4363, "step": 2445 }, { "epoch": 0.7332958590968709, "grad_norm": 0.2525213956832886, "learning_rate": 0.00019027027027027025, "loss": 0.4585, "step": 2446 }, { "epoch": 0.7335956529885703, "grad_norm": 0.2562119662761688, "learning_rate": 0.00019022522522522522, "loss": 0.4158, "step": 2447 }, { "epoch": 0.7338954468802699, "grad_norm": 0.2771570682525635, "learning_rate": 0.00019018018018018015, "loss": 0.4601, "step": 2448 }, { "epoch": 0.7341952407719693, "grad_norm": 0.2594900131225586, "learning_rate": 0.00019013513513513512, "loss": 0.4252, "step": 2449 }, { "epoch": 0.7344950346636687, "grad_norm": 0.27634164690971375, "learning_rate": 0.00019009009009009008, "loss": 0.4614, "step": 2450 }, { "epoch": 0.7347948285553682, "grad_norm": 0.27118000388145447, "learning_rate": 0.00019004504504504502, "loss": 0.425, "step": 2451 }, { "epoch": 0.7350946224470677, "grad_norm": 0.26404282450675964, "learning_rate": 0.00018999999999999998, "loss": 0.435, "step": 2452 }, { "epoch": 0.7353944163387671, "grad_norm": 0.24286137521266937, "learning_rate": 0.00018995495495495495, "loss": 0.4099, "step": 2453 }, { "epoch": 0.7356942102304665, "grad_norm": 0.2554706335067749, "learning_rate": 0.00018990990990990988, "loss": 0.4031, "step": 2454 }, { "epoch": 0.735994004122166, "grad_norm": 0.2666279375553131, "learning_rate": 0.00018986486486486485, "loss": 0.4397, "step": 2455 }, { "epoch": 0.7362937980138655, "grad_norm": 0.24479645490646362, "learning_rate": 0.0001898198198198198, "loss": 0.4059, "step": 2456 }, { "epoch": 0.736593591905565, "grad_norm": 0.27331724762916565, "learning_rate": 0.00018977477477477478, "loss": 0.4337, "step": 2457 }, { "epoch": 0.7368933857972644, "grad_norm": 0.2546418309211731, "learning_rate": 0.0001897297297297297, "loss": 0.4206, "step": 2458 }, { "epoch": 0.7371931796889638, "grad_norm": 0.2593313753604889, "learning_rate": 0.00018968468468468468, "loss": 0.39, "step": 2459 }, { "epoch": 0.7374929735806633, "grad_norm": 0.2757156789302826, "learning_rate": 0.00018963963963963964, "loss": 0.441, "step": 2460 }, { "epoch": 0.7377927674723628, "grad_norm": 0.2826617956161499, "learning_rate": 0.00018959459459459458, "loss": 0.4497, "step": 2461 }, { "epoch": 0.7380925613640622, "grad_norm": 0.26498305797576904, "learning_rate": 0.00018954954954954951, "loss": 0.4429, "step": 2462 }, { "epoch": 0.7383923552557616, "grad_norm": 0.22784557938575745, "learning_rate": 0.0001895045045045045, "loss": 0.4069, "step": 2463 }, { "epoch": 0.7386921491474612, "grad_norm": 0.277037113904953, "learning_rate": 0.00018945945945945944, "loss": 0.4257, "step": 2464 }, { "epoch": 0.7389919430391606, "grad_norm": 0.25758159160614014, "learning_rate": 0.00018941441441441438, "loss": 0.4255, "step": 2465 }, { "epoch": 0.73929173693086, "grad_norm": 0.24654820561408997, "learning_rate": 0.00018936936936936937, "loss": 0.402, "step": 2466 }, { "epoch": 0.7395915308225595, "grad_norm": 0.259376585483551, "learning_rate": 0.0001893243243243243, "loss": 0.416, "step": 2467 }, { "epoch": 0.739891324714259, "grad_norm": 0.28223109245300293, "learning_rate": 0.00018927927927927924, "loss": 0.4675, "step": 2468 }, { "epoch": 0.7401911186059584, "grad_norm": 0.2680475413799286, "learning_rate": 0.00018923423423423423, "loss": 0.4147, "step": 2469 }, { "epoch": 0.7404909124976579, "grad_norm": 0.2528432309627533, "learning_rate": 0.00018918918918918917, "loss": 0.4374, "step": 2470 }, { "epoch": 0.7407907063893573, "grad_norm": 0.26637372374534607, "learning_rate": 0.0001891441441441441, "loss": 0.4189, "step": 2471 }, { "epoch": 0.7410905002810567, "grad_norm": 0.2570081055164337, "learning_rate": 0.0001890990990990991, "loss": 0.4388, "step": 2472 }, { "epoch": 0.7413902941727563, "grad_norm": 0.27075570821762085, "learning_rate": 0.00018905405405405403, "loss": 0.4599, "step": 2473 }, { "epoch": 0.7416900880644557, "grad_norm": 0.2676197290420532, "learning_rate": 0.00018900900900900897, "loss": 0.4105, "step": 2474 }, { "epoch": 0.7419898819561551, "grad_norm": 0.24458040297031403, "learning_rate": 0.00018896396396396396, "loss": 0.4152, "step": 2475 }, { "epoch": 0.7422896758478545, "grad_norm": 0.2793339788913727, "learning_rate": 0.0001889189189189189, "loss": 0.4468, "step": 2476 }, { "epoch": 0.7425894697395541, "grad_norm": 0.25252237915992737, "learning_rate": 0.00018887387387387384, "loss": 0.4215, "step": 2477 }, { "epoch": 0.7428892636312535, "grad_norm": 0.27801933884620667, "learning_rate": 0.00018882882882882883, "loss": 0.4362, "step": 2478 }, { "epoch": 0.743189057522953, "grad_norm": 0.26056137681007385, "learning_rate": 0.00018878378378378376, "loss": 0.4342, "step": 2479 }, { "epoch": 0.7434888514146524, "grad_norm": 0.26250821352005005, "learning_rate": 0.00018873873873873873, "loss": 0.4196, "step": 2480 }, { "epoch": 0.7437886453063519, "grad_norm": 0.2682492733001709, "learning_rate": 0.0001886936936936937, "loss": 0.4368, "step": 2481 }, { "epoch": 0.7440884391980513, "grad_norm": 0.2572811245918274, "learning_rate": 0.00018864864864864863, "loss": 0.4363, "step": 2482 }, { "epoch": 0.7443882330897508, "grad_norm": 0.25746074318885803, "learning_rate": 0.0001886036036036036, "loss": 0.4045, "step": 2483 }, { "epoch": 0.7446880269814502, "grad_norm": 0.25470736622810364, "learning_rate": 0.00018855855855855853, "loss": 0.4062, "step": 2484 }, { "epoch": 0.7449878208731497, "grad_norm": 0.2766227722167969, "learning_rate": 0.0001885135135135135, "loss": 0.4605, "step": 2485 }, { "epoch": 0.7452876147648492, "grad_norm": 0.28737902641296387, "learning_rate": 0.00018846846846846846, "loss": 0.4481, "step": 2486 }, { "epoch": 0.7455874086565486, "grad_norm": 0.2646963894367218, "learning_rate": 0.0001884234234234234, "loss": 0.4212, "step": 2487 }, { "epoch": 0.745887202548248, "grad_norm": 0.2569124698638916, "learning_rate": 0.00018837837837837836, "loss": 0.4268, "step": 2488 }, { "epoch": 0.7461869964399476, "grad_norm": 0.25343701243400574, "learning_rate": 0.00018833333333333332, "loss": 0.4285, "step": 2489 }, { "epoch": 0.746486790331647, "grad_norm": 0.27101901173591614, "learning_rate": 0.00018828828828828826, "loss": 0.4426, "step": 2490 }, { "epoch": 0.7467865842233464, "grad_norm": 0.2594289779663086, "learning_rate": 0.00018824324324324325, "loss": 0.4333, "step": 2491 }, { "epoch": 0.7470863781150459, "grad_norm": 0.2643277049064636, "learning_rate": 0.00018819819819819819, "loss": 0.4407, "step": 2492 }, { "epoch": 0.7473861720067454, "grad_norm": 0.267240047454834, "learning_rate": 0.00018815315315315312, "loss": 0.4448, "step": 2493 }, { "epoch": 0.7476859658984448, "grad_norm": 0.24963083863258362, "learning_rate": 0.00018810810810810811, "loss": 0.4313, "step": 2494 }, { "epoch": 0.7479857597901443, "grad_norm": 0.2673603892326355, "learning_rate": 0.00018806306306306305, "loss": 0.4327, "step": 2495 }, { "epoch": 0.7482855536818437, "grad_norm": 0.25436538457870483, "learning_rate": 0.000188018018018018, "loss": 0.4251, "step": 2496 }, { "epoch": 0.7485853475735432, "grad_norm": 0.25511813163757324, "learning_rate": 0.00018797297297297298, "loss": 0.441, "step": 2497 }, { "epoch": 0.7488851414652427, "grad_norm": 0.26634255051612854, "learning_rate": 0.00018792792792792791, "loss": 0.4142, "step": 2498 }, { "epoch": 0.7491849353569421, "grad_norm": 0.2738245129585266, "learning_rate": 0.00018788288288288285, "loss": 0.446, "step": 2499 }, { "epoch": 0.7494847292486415, "grad_norm": 0.2478281557559967, "learning_rate": 0.00018783783783783784, "loss": 0.4355, "step": 2500 }, { "epoch": 0.7494847292486415, "eval_loss": 0.43330296874046326, "eval_runtime": 567.7062, "eval_samples_per_second": 3.803, "eval_steps_per_second": 0.476, "step": 2500 }, { "epoch": 0.749784523140341, "grad_norm": 0.24915559589862823, "learning_rate": 0.00018779279279279278, "loss": 0.431, "step": 2501 }, { "epoch": 0.7500843170320405, "grad_norm": 0.25979626178741455, "learning_rate": 0.00018774774774774772, "loss": 0.4336, "step": 2502 }, { "epoch": 0.7503841109237399, "grad_norm": 0.2514503002166748, "learning_rate": 0.0001877027027027027, "loss": 0.4104, "step": 2503 }, { "epoch": 0.7506839048154393, "grad_norm": 0.2693893015384674, "learning_rate": 0.00018765765765765764, "loss": 0.4706, "step": 2504 }, { "epoch": 0.7509836987071389, "grad_norm": 0.2881157696247101, "learning_rate": 0.00018761261261261258, "loss": 0.4346, "step": 2505 }, { "epoch": 0.7512834925988383, "grad_norm": 0.232576385140419, "learning_rate": 0.00018756756756756757, "loss": 0.42, "step": 2506 }, { "epoch": 0.7515832864905377, "grad_norm": 0.30108994245529175, "learning_rate": 0.0001875225225225225, "loss": 0.425, "step": 2507 }, { "epoch": 0.7518830803822372, "grad_norm": 0.28761547803878784, "learning_rate": 0.00018747747747747745, "loss": 0.47, "step": 2508 }, { "epoch": 0.7521828742739367, "grad_norm": 0.2556571662425995, "learning_rate": 0.0001874324324324324, "loss": 0.4236, "step": 2509 }, { "epoch": 0.7524826681656361, "grad_norm": 0.27593177556991577, "learning_rate": 0.00018738738738738737, "loss": 0.4209, "step": 2510 }, { "epoch": 0.7527824620573356, "grad_norm": 0.26506245136260986, "learning_rate": 0.0001873423423423423, "loss": 0.4182, "step": 2511 }, { "epoch": 0.753082255949035, "grad_norm": 0.31767213344573975, "learning_rate": 0.00018729729729729727, "loss": 0.4574, "step": 2512 }, { "epoch": 0.7533820498407345, "grad_norm": 0.25470229983329773, "learning_rate": 0.00018725225225225224, "loss": 0.4064, "step": 2513 }, { "epoch": 0.753681843732434, "grad_norm": 0.25668561458587646, "learning_rate": 0.0001872072072072072, "loss": 0.4247, "step": 2514 }, { "epoch": 0.7539816376241334, "grad_norm": 0.29275453090667725, "learning_rate": 0.00018716216216216214, "loss": 0.4232, "step": 2515 }, { "epoch": 0.7542814315158328, "grad_norm": 0.2815520167350769, "learning_rate": 0.0001871171171171171, "loss": 0.4402, "step": 2516 }, { "epoch": 0.7545812254075324, "grad_norm": 0.24637946486473083, "learning_rate": 0.00018707207207207207, "loss": 0.4209, "step": 2517 }, { "epoch": 0.7548810192992318, "grad_norm": 0.27894946932792664, "learning_rate": 0.000187027027027027, "loss": 0.453, "step": 2518 }, { "epoch": 0.7551808131909312, "grad_norm": 0.3131442070007324, "learning_rate": 0.00018698198198198197, "loss": 0.4728, "step": 2519 }, { "epoch": 0.7554806070826307, "grad_norm": 0.2698810398578644, "learning_rate": 0.00018693693693693693, "loss": 0.4387, "step": 2520 }, { "epoch": 0.7557804009743302, "grad_norm": 0.24818141758441925, "learning_rate": 0.00018689189189189187, "loss": 0.4065, "step": 2521 }, { "epoch": 0.7560801948660296, "grad_norm": 0.2903098464012146, "learning_rate": 0.00018684684684684683, "loss": 0.426, "step": 2522 }, { "epoch": 0.756379988757729, "grad_norm": 0.2602495551109314, "learning_rate": 0.0001868018018018018, "loss": 0.4144, "step": 2523 }, { "epoch": 0.7566797826494285, "grad_norm": 0.2648625373840332, "learning_rate": 0.00018675675675675673, "loss": 0.4476, "step": 2524 }, { "epoch": 0.756979576541128, "grad_norm": 0.26782098412513733, "learning_rate": 0.0001867117117117117, "loss": 0.4238, "step": 2525 }, { "epoch": 0.7572793704328274, "grad_norm": 0.27317121624946594, "learning_rate": 0.00018666666666666666, "loss": 0.4251, "step": 2526 }, { "epoch": 0.7575791643245269, "grad_norm": 0.2720593214035034, "learning_rate": 0.0001866216216216216, "loss": 0.4667, "step": 2527 }, { "epoch": 0.7578789582162263, "grad_norm": 0.2541276812553406, "learning_rate": 0.0001865765765765766, "loss": 0.431, "step": 2528 }, { "epoch": 0.7581787521079258, "grad_norm": 0.27258971333503723, "learning_rate": 0.00018653153153153152, "loss": 0.4263, "step": 2529 }, { "epoch": 0.7584785459996253, "grad_norm": 0.28021714091300964, "learning_rate": 0.00018648648648648646, "loss": 0.4276, "step": 2530 }, { "epoch": 0.7587783398913247, "grad_norm": 0.2505019009113312, "learning_rate": 0.00018644144144144145, "loss": 0.4105, "step": 2531 }, { "epoch": 0.7590781337830241, "grad_norm": 0.28030917048454285, "learning_rate": 0.0001863963963963964, "loss": 0.4428, "step": 2532 }, { "epoch": 0.7593779276747237, "grad_norm": 0.27447059750556946, "learning_rate": 0.00018635135135135133, "loss": 0.4551, "step": 2533 }, { "epoch": 0.7596777215664231, "grad_norm": 0.26824313402175903, "learning_rate": 0.00018630630630630626, "loss": 0.4445, "step": 2534 }, { "epoch": 0.7599775154581225, "grad_norm": 0.2674945294857025, "learning_rate": 0.00018626126126126125, "loss": 0.4513, "step": 2535 }, { "epoch": 0.760277309349822, "grad_norm": 0.2604798972606659, "learning_rate": 0.0001862162162162162, "loss": 0.4381, "step": 2536 }, { "epoch": 0.7605771032415215, "grad_norm": 0.27609342336654663, "learning_rate": 0.00018617117117117115, "loss": 0.4441, "step": 2537 }, { "epoch": 0.7608768971332209, "grad_norm": 0.2614879012107849, "learning_rate": 0.00018612612612612612, "loss": 0.435, "step": 2538 }, { "epoch": 0.7611766910249204, "grad_norm": 0.25386688113212585, "learning_rate": 0.00018608108108108105, "loss": 0.4434, "step": 2539 }, { "epoch": 0.7614764849166198, "grad_norm": 0.24181143939495087, "learning_rate": 0.00018603603603603602, "loss": 0.4175, "step": 2540 }, { "epoch": 0.7617762788083193, "grad_norm": 0.2645350694656372, "learning_rate": 0.00018599099099099098, "loss": 0.4136, "step": 2541 }, { "epoch": 0.7620760727000188, "grad_norm": 0.2677913010120392, "learning_rate": 0.00018594594594594592, "loss": 0.4436, "step": 2542 }, { "epoch": 0.7623758665917182, "grad_norm": 0.2717260420322418, "learning_rate": 0.00018590090090090088, "loss": 0.4565, "step": 2543 }, { "epoch": 0.7626756604834176, "grad_norm": 0.25026705861091614, "learning_rate": 0.00018585585585585585, "loss": 0.4119, "step": 2544 }, { "epoch": 0.7629754543751172, "grad_norm": 0.24770689010620117, "learning_rate": 0.00018581081081081078, "loss": 0.4097, "step": 2545 }, { "epoch": 0.7632752482668166, "grad_norm": 0.27625271677970886, "learning_rate": 0.00018576576576576575, "loss": 0.4269, "step": 2546 }, { "epoch": 0.763575042158516, "grad_norm": 0.27056175470352173, "learning_rate": 0.0001857207207207207, "loss": 0.4499, "step": 2547 }, { "epoch": 0.7638748360502154, "grad_norm": 0.2812648415565491, "learning_rate": 0.00018567567567567567, "loss": 0.4736, "step": 2548 }, { "epoch": 0.764174629941915, "grad_norm": 0.26717478036880493, "learning_rate": 0.0001856306306306306, "loss": 0.4072, "step": 2549 }, { "epoch": 0.7644744238336144, "grad_norm": 0.2870055139064789, "learning_rate": 0.00018558558558558558, "loss": 0.4229, "step": 2550 }, { "epoch": 0.7647742177253138, "grad_norm": 0.2580265700817108, "learning_rate": 0.00018554054054054054, "loss": 0.4068, "step": 2551 }, { "epoch": 0.7650740116170133, "grad_norm": 0.28002214431762695, "learning_rate": 0.00018549549549549548, "loss": 0.432, "step": 2552 }, { "epoch": 0.7653738055087128, "grad_norm": 0.27384141087532043, "learning_rate": 0.00018545045045045044, "loss": 0.4363, "step": 2553 }, { "epoch": 0.7656735994004122, "grad_norm": 0.2627524137496948, "learning_rate": 0.0001854054054054054, "loss": 0.4197, "step": 2554 }, { "epoch": 0.7659733932921117, "grad_norm": 0.2666347324848175, "learning_rate": 0.00018536036036036034, "loss": 0.3912, "step": 2555 }, { "epoch": 0.7662731871838111, "grad_norm": 0.2756651043891907, "learning_rate": 0.00018531531531531528, "loss": 0.4364, "step": 2556 }, { "epoch": 0.7665729810755106, "grad_norm": 0.2617150545120239, "learning_rate": 0.00018527027027027027, "loss": 0.4468, "step": 2557 }, { "epoch": 0.7668727749672101, "grad_norm": 0.27227911353111267, "learning_rate": 0.0001852252252252252, "loss": 0.4246, "step": 2558 }, { "epoch": 0.7671725688589095, "grad_norm": 0.2841823697090149, "learning_rate": 0.00018518018018018014, "loss": 0.4363, "step": 2559 }, { "epoch": 0.7674723627506089, "grad_norm": 0.253366082906723, "learning_rate": 0.00018513513513513513, "loss": 0.419, "step": 2560 }, { "epoch": 0.7677721566423085, "grad_norm": 0.2522357106208801, "learning_rate": 0.00018509009009009007, "loss": 0.4124, "step": 2561 }, { "epoch": 0.7680719505340079, "grad_norm": 0.2550141215324402, "learning_rate": 0.000185045045045045, "loss": 0.4256, "step": 2562 }, { "epoch": 0.7683717444257073, "grad_norm": 0.27578258514404297, "learning_rate": 0.000185, "loss": 0.4447, "step": 2563 }, { "epoch": 0.7686715383174068, "grad_norm": 0.2517780661582947, "learning_rate": 0.00018495495495495493, "loss": 0.4167, "step": 2564 }, { "epoch": 0.7689713322091062, "grad_norm": 0.2627197802066803, "learning_rate": 0.00018490990990990987, "loss": 0.421, "step": 2565 }, { "epoch": 0.7692711261008057, "grad_norm": 0.2572929263114929, "learning_rate": 0.00018486486486486486, "loss": 0.4295, "step": 2566 }, { "epoch": 0.7695709199925052, "grad_norm": 0.2549370229244232, "learning_rate": 0.0001848198198198198, "loss": 0.4353, "step": 2567 }, { "epoch": 0.7698707138842046, "grad_norm": 0.25990357995033264, "learning_rate": 0.00018477477477477474, "loss": 0.4598, "step": 2568 }, { "epoch": 0.770170507775904, "grad_norm": 0.26102861762046814, "learning_rate": 0.00018472972972972973, "loss": 0.4342, "step": 2569 }, { "epoch": 0.7704703016676036, "grad_norm": 0.26292112469673157, "learning_rate": 0.00018468468468468466, "loss": 0.4217, "step": 2570 }, { "epoch": 0.770770095559303, "grad_norm": 0.24879471957683563, "learning_rate": 0.00018463963963963963, "loss": 0.4034, "step": 2571 }, { "epoch": 0.7710698894510024, "grad_norm": 0.249162495136261, "learning_rate": 0.0001845945945945946, "loss": 0.4213, "step": 2572 }, { "epoch": 0.7713696833427018, "grad_norm": 0.25036314129829407, "learning_rate": 0.00018454954954954953, "loss": 0.449, "step": 2573 }, { "epoch": 0.7716694772344014, "grad_norm": 0.2511482238769531, "learning_rate": 0.0001845045045045045, "loss": 0.4285, "step": 2574 }, { "epoch": 0.7719692711261008, "grad_norm": 0.25358885526657104, "learning_rate": 0.00018445945945945946, "loss": 0.4392, "step": 2575 }, { "epoch": 0.7722690650178002, "grad_norm": 0.25731760263442993, "learning_rate": 0.0001844144144144144, "loss": 0.4326, "step": 2576 }, { "epoch": 0.7725688589094997, "grad_norm": 0.24017149209976196, "learning_rate": 0.00018436936936936936, "loss": 0.4285, "step": 2577 }, { "epoch": 0.7728686528011992, "grad_norm": 0.24697363376617432, "learning_rate": 0.00018432432432432432, "loss": 0.4326, "step": 2578 }, { "epoch": 0.7731684466928986, "grad_norm": 0.2622368335723877, "learning_rate": 0.00018427927927927926, "loss": 0.4051, "step": 2579 }, { "epoch": 0.7734682405845981, "grad_norm": 0.26079848408699036, "learning_rate": 0.00018423423423423422, "loss": 0.4296, "step": 2580 }, { "epoch": 0.7737680344762975, "grad_norm": 0.26790016889572144, "learning_rate": 0.00018418918918918916, "loss": 0.4487, "step": 2581 }, { "epoch": 0.774067828367997, "grad_norm": 0.26801207661628723, "learning_rate": 0.00018414414414414412, "loss": 0.441, "step": 2582 }, { "epoch": 0.7743676222596965, "grad_norm": 0.2615436911582947, "learning_rate": 0.00018409909909909909, "loss": 0.4356, "step": 2583 }, { "epoch": 0.7746674161513959, "grad_norm": 0.26157858967781067, "learning_rate": 0.00018405405405405402, "loss": 0.4624, "step": 2584 }, { "epoch": 0.7749672100430953, "grad_norm": 0.2570144832134247, "learning_rate": 0.000184009009009009, "loss": 0.4159, "step": 2585 }, { "epoch": 0.7752670039347949, "grad_norm": 0.25635403394699097, "learning_rate": 0.00018396396396396395, "loss": 0.4479, "step": 2586 }, { "epoch": 0.7755667978264943, "grad_norm": 0.24715913832187653, "learning_rate": 0.00018391891891891889, "loss": 0.4258, "step": 2587 }, { "epoch": 0.7758665917181937, "grad_norm": 0.2577861547470093, "learning_rate": 0.00018387387387387388, "loss": 0.4107, "step": 2588 }, { "epoch": 0.7761663856098932, "grad_norm": 0.24768322706222534, "learning_rate": 0.00018382882882882881, "loss": 0.4208, "step": 2589 }, { "epoch": 0.7764661795015927, "grad_norm": 0.24486133456230164, "learning_rate": 0.00018378378378378375, "loss": 0.4128, "step": 2590 }, { "epoch": 0.7767659733932921, "grad_norm": 0.2598220109939575, "learning_rate": 0.00018373873873873874, "loss": 0.4398, "step": 2591 }, { "epoch": 0.7770657672849915, "grad_norm": 0.2616111636161804, "learning_rate": 0.00018369369369369368, "loss": 0.4472, "step": 2592 }, { "epoch": 0.777365561176691, "grad_norm": 0.2481420487165451, "learning_rate": 0.00018364864864864862, "loss": 0.4073, "step": 2593 }, { "epoch": 0.7776653550683905, "grad_norm": 0.26911380887031555, "learning_rate": 0.0001836036036036036, "loss": 0.4584, "step": 2594 }, { "epoch": 0.77796514896009, "grad_norm": 0.2654714584350586, "learning_rate": 0.00018355855855855854, "loss": 0.4541, "step": 2595 }, { "epoch": 0.7782649428517894, "grad_norm": 0.2782737612724304, "learning_rate": 0.00018351351351351348, "loss": 0.4496, "step": 2596 }, { "epoch": 0.7785647367434888, "grad_norm": 0.24328523874282837, "learning_rate": 0.00018346846846846847, "loss": 0.4198, "step": 2597 }, { "epoch": 0.7788645306351883, "grad_norm": 0.2627139985561371, "learning_rate": 0.0001834234234234234, "loss": 0.4383, "step": 2598 }, { "epoch": 0.7791643245268878, "grad_norm": 0.25555041432380676, "learning_rate": 0.00018337837837837834, "loss": 0.4097, "step": 2599 }, { "epoch": 0.7794641184185872, "grad_norm": 0.24429571628570557, "learning_rate": 0.00018333333333333334, "loss": 0.3932, "step": 2600 } ], "logging_steps": 1, "max_steps": 6670, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.918856291720888e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }