{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9876543209876543, "eval_steps": 500, "global_step": 363, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00823045267489712, "grad_norm": 1.4866708861109026, "learning_rate": 5.405405405405406e-06, "loss": 1.7565, "step": 1 }, { "epoch": 0.01646090534979424, "grad_norm": 1.4564242960595204, "learning_rate": 1.0810810810810812e-05, "loss": 1.7428, "step": 2 }, { "epoch": 0.024691358024691357, "grad_norm": 1.5029588271629886, "learning_rate": 1.6216216216216218e-05, "loss": 1.7399, "step": 3 }, { "epoch": 0.03292181069958848, "grad_norm": 1.4275061812775922, "learning_rate": 2.1621621621621624e-05, "loss": 1.7083, "step": 4 }, { "epoch": 0.0411522633744856, "grad_norm": 1.3211892837590813, "learning_rate": 2.702702702702703e-05, "loss": 1.6472, "step": 5 }, { "epoch": 0.04938271604938271, "grad_norm": 1.2954095745224536, "learning_rate": 3.2432432432432436e-05, "loss": 1.5935, "step": 6 }, { "epoch": 0.05761316872427984, "grad_norm": 0.9115980406962145, "learning_rate": 3.783783783783784e-05, "loss": 1.4303, "step": 7 }, { "epoch": 0.06584362139917696, "grad_norm": 0.7713124020101795, "learning_rate": 4.324324324324325e-05, "loss": 1.3661, "step": 8 }, { "epoch": 0.07407407407407407, "grad_norm": 0.7920851957124333, "learning_rate": 4.8648648648648654e-05, "loss": 1.3093, "step": 9 }, { "epoch": 0.0823045267489712, "grad_norm": 0.8823079310051533, "learning_rate": 5.405405405405406e-05, "loss": 1.1587, "step": 10 }, { "epoch": 0.09053497942386832, "grad_norm": 0.8557863491129706, "learning_rate": 5.9459459459459466e-05, "loss": 1.0485, "step": 11 }, { "epoch": 0.09876543209876543, "grad_norm": 0.8802985405555526, "learning_rate": 6.486486486486487e-05, "loss": 0.9429, "step": 12 }, { "epoch": 0.10699588477366255, "grad_norm": 0.8436384566917278, "learning_rate": 7.027027027027028e-05, "loss": 0.8218, "step": 13 }, { "epoch": 0.11522633744855967, "grad_norm": 0.6676398141327192, "learning_rate": 7.567567567567568e-05, "loss": 0.7054, "step": 14 }, { "epoch": 0.12345679012345678, "grad_norm": 0.6471842089986185, "learning_rate": 8.108108108108109e-05, "loss": 0.6758, "step": 15 }, { "epoch": 0.13168724279835392, "grad_norm": 0.4142479004737112, "learning_rate": 8.64864864864865e-05, "loss": 0.6222, "step": 16 }, { "epoch": 0.13991769547325103, "grad_norm": 0.42850243250289555, "learning_rate": 9.18918918918919e-05, "loss": 0.5962, "step": 17 }, { "epoch": 0.14814814814814814, "grad_norm": 0.5568208902875614, "learning_rate": 9.729729729729731e-05, "loss": 0.5984, "step": 18 }, { "epoch": 0.15637860082304528, "grad_norm": 0.5540687236420326, "learning_rate": 0.0001027027027027027, "loss": 0.4938, "step": 19 }, { "epoch": 0.1646090534979424, "grad_norm": 0.33332950971206693, "learning_rate": 0.00010810810810810812, "loss": 0.5161, "step": 20 }, { "epoch": 0.1728395061728395, "grad_norm": 0.2653066423031725, "learning_rate": 0.00011351351351351351, "loss": 0.5136, "step": 21 }, { "epoch": 0.18106995884773663, "grad_norm": 0.2272101382108708, "learning_rate": 0.00011891891891891893, "loss": 0.4873, "step": 22 }, { "epoch": 0.18930041152263374, "grad_norm": 0.24006404965358885, "learning_rate": 0.00012432432432432433, "loss": 0.4844, "step": 23 }, { "epoch": 0.19753086419753085, "grad_norm": 0.20608476733616163, "learning_rate": 0.00012972972972972974, "loss": 0.4821, "step": 24 }, { "epoch": 0.205761316872428, "grad_norm": 0.16150407452920948, "learning_rate": 0.00013513513513513514, "loss": 0.4167, "step": 25 }, { "epoch": 0.2139917695473251, "grad_norm": 0.1538751616515209, "learning_rate": 0.00014054054054054056, "loss": 0.446, "step": 26 }, { "epoch": 0.2222222222222222, "grad_norm": 0.16472183357224798, "learning_rate": 0.00014594594594594595, "loss": 0.4142, "step": 27 }, { "epoch": 0.23045267489711935, "grad_norm": 0.1552702492617925, "learning_rate": 0.00015135135135135137, "loss": 0.4909, "step": 28 }, { "epoch": 0.23868312757201646, "grad_norm": 0.14727903417905372, "learning_rate": 0.00015675675675675676, "loss": 0.4065, "step": 29 }, { "epoch": 0.24691358024691357, "grad_norm": 0.14784235244019464, "learning_rate": 0.00016216216216216218, "loss": 0.3886, "step": 30 }, { "epoch": 0.2551440329218107, "grad_norm": 0.1450616035553425, "learning_rate": 0.00016756756756756757, "loss": 0.3902, "step": 31 }, { "epoch": 0.26337448559670784, "grad_norm": 0.153342097650188, "learning_rate": 0.000172972972972973, "loss": 0.3888, "step": 32 }, { "epoch": 0.2716049382716049, "grad_norm": 0.15900713930790533, "learning_rate": 0.00017837837837837839, "loss": 0.3524, "step": 33 }, { "epoch": 0.27983539094650206, "grad_norm": 0.1546499892536795, "learning_rate": 0.0001837837837837838, "loss": 0.3648, "step": 34 }, { "epoch": 0.2880658436213992, "grad_norm": 0.14320964717963003, "learning_rate": 0.0001891891891891892, "loss": 0.3658, "step": 35 }, { "epoch": 0.2962962962962963, "grad_norm": 0.13103727806567708, "learning_rate": 0.00019459459459459462, "loss": 0.3496, "step": 36 }, { "epoch": 0.3045267489711934, "grad_norm": 0.13308383037195082, "learning_rate": 0.0002, "loss": 0.3572, "step": 37 }, { "epoch": 0.31275720164609055, "grad_norm": 0.1387871382009263, "learning_rate": 0.00019999535665248002, "loss": 0.3214, "step": 38 }, { "epoch": 0.32098765432098764, "grad_norm": 0.14270131722019058, "learning_rate": 0.0001999814270411335, "loss": 0.3445, "step": 39 }, { "epoch": 0.3292181069958848, "grad_norm": 0.1545337969749864, "learning_rate": 0.000199958212459561, "loss": 0.32, "step": 40 }, { "epoch": 0.3374485596707819, "grad_norm": 0.13802182056866283, "learning_rate": 0.00019992571506363, "loss": 0.3449, "step": 41 }, { "epoch": 0.345679012345679, "grad_norm": 0.14162173979281106, "learning_rate": 0.00019988393787127441, "loss": 0.3262, "step": 42 }, { "epoch": 0.35390946502057613, "grad_norm": 0.13594846523403772, "learning_rate": 0.0001998328847622148, "loss": 0.2958, "step": 43 }, { "epoch": 0.36213991769547327, "grad_norm": 0.12596629667831052, "learning_rate": 0.00019977256047759765, "loss": 0.286, "step": 44 }, { "epoch": 0.37037037037037035, "grad_norm": 0.13835455454793713, "learning_rate": 0.00019970297061955533, "loss": 0.2878, "step": 45 }, { "epoch": 0.3786008230452675, "grad_norm": 0.14644104388608956, "learning_rate": 0.00019962412165068573, "loss": 0.2952, "step": 46 }, { "epoch": 0.3868312757201646, "grad_norm": 0.1361181054099992, "learning_rate": 0.00019953602089345217, "loss": 0.267, "step": 47 }, { "epoch": 0.3950617283950617, "grad_norm": 0.1545760777310844, "learning_rate": 0.0001994386765295032, "loss": 0.2823, "step": 48 }, { "epoch": 0.40329218106995884, "grad_norm": 0.16048561280797768, "learning_rate": 0.00019933209759891317, "loss": 0.2598, "step": 49 }, { "epoch": 0.411522633744856, "grad_norm": 0.14585143788717048, "learning_rate": 0.00019921629399934223, "loss": 0.2834, "step": 50 }, { "epoch": 0.41975308641975306, "grad_norm": 0.15590757372746555, "learning_rate": 0.00019909127648511755, "loss": 0.2619, "step": 51 }, { "epoch": 0.4279835390946502, "grad_norm": 0.142205678471833, "learning_rate": 0.0001989570566662345, "loss": 0.2477, "step": 52 }, { "epoch": 0.43621399176954734, "grad_norm": 0.14334288636686987, "learning_rate": 0.00019881364700727823, "loss": 0.2958, "step": 53 }, { "epoch": 0.4444444444444444, "grad_norm": 0.1584716514858334, "learning_rate": 0.0001986610608262665, "loss": 0.2708, "step": 54 }, { "epoch": 0.45267489711934156, "grad_norm": 0.15229425624208448, "learning_rate": 0.00019849931229341258, "loss": 0.2776, "step": 55 }, { "epoch": 0.4609053497942387, "grad_norm": 0.1497514774482752, "learning_rate": 0.00019832841642980945, "loss": 0.2325, "step": 56 }, { "epoch": 0.4691358024691358, "grad_norm": 0.15302882739571017, "learning_rate": 0.00019814838910603481, "loss": 0.2755, "step": 57 }, { "epoch": 0.4773662551440329, "grad_norm": 0.20080266795074445, "learning_rate": 0.00019795924704067721, "loss": 0.2421, "step": 58 }, { "epoch": 0.48559670781893005, "grad_norm": 0.15063814452987448, "learning_rate": 0.00019776100779878345, "loss": 0.2152, "step": 59 }, { "epoch": 0.49382716049382713, "grad_norm": 0.1424158898404743, "learning_rate": 0.00019755368979022732, "loss": 0.2424, "step": 60 }, { "epoch": 0.5020576131687243, "grad_norm": 0.15708199535695755, "learning_rate": 0.00019733731226800015, "loss": 0.2439, "step": 61 }, { "epoch": 0.5102880658436214, "grad_norm": 0.1469505519285418, "learning_rate": 0.00019711189532642243, "loss": 0.2174, "step": 62 }, { "epoch": 0.5185185185185185, "grad_norm": 0.15443629035666892, "learning_rate": 0.00019687745989927823, "loss": 0.2201, "step": 63 }, { "epoch": 0.5267489711934157, "grad_norm": 0.16091185410714737, "learning_rate": 0.00019663402775787066, "loss": 0.2153, "step": 64 }, { "epoch": 0.5349794238683128, "grad_norm": 0.16812247962269722, "learning_rate": 0.00019638162150900027, "loss": 0.2245, "step": 65 }, { "epoch": 0.5432098765432098, "grad_norm": 0.15476006323853086, "learning_rate": 0.00019612026459286578, "loss": 0.2168, "step": 66 }, { "epoch": 0.551440329218107, "grad_norm": 0.14319948869013133, "learning_rate": 0.00019584998128088684, "loss": 0.2102, "step": 67 }, { "epoch": 0.5596707818930041, "grad_norm": 0.1416595781349322, "learning_rate": 0.0001955707966734505, "loss": 0.2109, "step": 68 }, { "epoch": 0.5679012345679012, "grad_norm": 0.15627936343326146, "learning_rate": 0.00019528273669757972, "loss": 0.221, "step": 69 }, { "epoch": 0.5761316872427984, "grad_norm": 0.1355242418955904, "learning_rate": 0.0001949858281045261, "loss": 0.1934, "step": 70 }, { "epoch": 0.5843621399176955, "grad_norm": 0.15837694932534985, "learning_rate": 0.00019468009846728513, "loss": 0.2106, "step": 71 }, { "epoch": 0.5925925925925926, "grad_norm": 0.14575785598463487, "learning_rate": 0.00019436557617803595, "loss": 0.1958, "step": 72 }, { "epoch": 0.6008230452674898, "grad_norm": 0.15004026647926874, "learning_rate": 0.00019404229044550433, "loss": 0.2111, "step": 73 }, { "epoch": 0.6090534979423868, "grad_norm": 0.1531758472218286, "learning_rate": 0.00019371027129225042, "loss": 0.1796, "step": 74 }, { "epoch": 0.6172839506172839, "grad_norm": 0.14522275863691966, "learning_rate": 0.0001933695495518804, "loss": 0.1879, "step": 75 }, { "epoch": 0.6255144032921811, "grad_norm": 0.14778202822977027, "learning_rate": 0.00019302015686618326, "loss": 0.1783, "step": 76 }, { "epoch": 0.6337448559670782, "grad_norm": 0.14202535229938665, "learning_rate": 0.0001926621256821922, "loss": 0.1672, "step": 77 }, { "epoch": 0.6419753086419753, "grad_norm": 0.15532295497905474, "learning_rate": 0.00019229548924917146, "loss": 0.1894, "step": 78 }, { "epoch": 0.6502057613168725, "grad_norm": 0.1298933712297695, "learning_rate": 0.00019192028161552847, "loss": 0.1626, "step": 79 }, { "epoch": 0.6584362139917695, "grad_norm": 0.15486589167638415, "learning_rate": 0.0001915365376256519, "loss": 0.1829, "step": 80 }, { "epoch": 0.6666666666666666, "grad_norm": 0.13198994845547862, "learning_rate": 0.00019114429291667583, "loss": 0.1827, "step": 81 }, { "epoch": 0.6748971193415638, "grad_norm": 0.14486854526136775, "learning_rate": 0.00019074358391517023, "loss": 0.1711, "step": 82 }, { "epoch": 0.6831275720164609, "grad_norm": 0.18136392511258384, "learning_rate": 0.00019033444783375804, "loss": 0.1852, "step": 83 }, { "epoch": 0.691358024691358, "grad_norm": 0.12835927358561786, "learning_rate": 0.00018991692266765947, "loss": 0.1874, "step": 84 }, { "epoch": 0.6995884773662552, "grad_norm": 0.15923203558527596, "learning_rate": 0.00018949104719116332, "loss": 0.1754, "step": 85 }, { "epoch": 0.7078189300411523, "grad_norm": 0.1524830096667991, "learning_rate": 0.00018905686095402647, "loss": 0.1772, "step": 86 }, { "epoch": 0.7160493827160493, "grad_norm": 0.1458630884597557, "learning_rate": 0.0001886144042778006, "loss": 0.1884, "step": 87 }, { "epoch": 0.7242798353909465, "grad_norm": 0.14812531898730455, "learning_rate": 0.00018816371825208789, "loss": 0.1694, "step": 88 }, { "epoch": 0.7325102880658436, "grad_norm": 0.1371838995231899, "learning_rate": 0.0001877048447307252, "loss": 0.175, "step": 89 }, { "epoch": 0.7407407407407407, "grad_norm": 0.12597672532717027, "learning_rate": 0.00018723782632789701, "loss": 0.1663, "step": 90 }, { "epoch": 0.7489711934156379, "grad_norm": 0.13653897293999862, "learning_rate": 0.00018676270641417822, "loss": 0.1902, "step": 91 }, { "epoch": 0.757201646090535, "grad_norm": 0.11918081783294039, "learning_rate": 0.0001862795291125063, "loss": 0.1662, "step": 92 }, { "epoch": 0.7654320987654321, "grad_norm": 0.1333312758081247, "learning_rate": 0.0001857883392940837, "loss": 0.199, "step": 93 }, { "epoch": 0.7736625514403292, "grad_norm": 0.12930618535907987, "learning_rate": 0.000185289182574211, "loss": 0.1697, "step": 94 }, { "epoch": 0.7818930041152263, "grad_norm": 0.1336640654310965, "learning_rate": 0.0001847821053080505, "loss": 0.1852, "step": 95 }, { "epoch": 0.7901234567901234, "grad_norm": 0.14974272672832492, "learning_rate": 0.00018426715458632153, "loss": 0.1819, "step": 96 }, { "epoch": 0.7983539094650206, "grad_norm": 0.11738767625285426, "learning_rate": 0.00018374437823092724, "loss": 0.1628, "step": 97 }, { "epoch": 0.8065843621399177, "grad_norm": 0.1126501945549117, "learning_rate": 0.00018321382479051347, "loss": 0.1574, "step": 98 }, { "epoch": 0.8148148148148148, "grad_norm": 0.14716070780057008, "learning_rate": 0.00018267554353596025, "loss": 0.1671, "step": 99 }, { "epoch": 0.823045267489712, "grad_norm": 0.14236534823623162, "learning_rate": 0.0001821295844558062, "loss": 0.179, "step": 100 }, { "epoch": 0.831275720164609, "grad_norm": 0.14819744720356537, "learning_rate": 0.0001815759982516061, "loss": 0.1765, "step": 101 }, { "epoch": 0.8395061728395061, "grad_norm": 0.1359773653210936, "learning_rate": 0.00018101483633322255, "loss": 0.1736, "step": 102 }, { "epoch": 0.8477366255144033, "grad_norm": 0.12220336917197815, "learning_rate": 0.00018044615081405153, "loss": 0.1559, "step": 103 }, { "epoch": 0.8559670781893004, "grad_norm": 0.12734749296356374, "learning_rate": 0.00017986999450618295, "loss": 0.1598, "step": 104 }, { "epoch": 0.8641975308641975, "grad_norm": 0.13635727971275893, "learning_rate": 0.00017928642091549613, "loss": 0.1716, "step": 105 }, { "epoch": 0.8724279835390947, "grad_norm": 0.1256041209800328, "learning_rate": 0.00017869548423669077, "loss": 0.1694, "step": 106 }, { "epoch": 0.8806584362139918, "grad_norm": 0.156735390007985, "learning_rate": 0.00017809723934825405, "loss": 0.1711, "step": 107 }, { "epoch": 0.8888888888888888, "grad_norm": 0.13613462557969605, "learning_rate": 0.00017749174180736442, "loss": 0.1575, "step": 108 }, { "epoch": 0.897119341563786, "grad_norm": 0.12136550882477753, "learning_rate": 0.00017687904784473188, "loss": 0.1541, "step": 109 }, { "epoch": 0.9053497942386831, "grad_norm": 0.11805606680499708, "learning_rate": 0.00017625921435937637, "loss": 0.153, "step": 110 }, { "epoch": 0.9135802469135802, "grad_norm": 0.12306976139021918, "learning_rate": 0.00017563229891334338, "loss": 0.1723, "step": 111 }, { "epoch": 0.9218106995884774, "grad_norm": 0.12639467162562257, "learning_rate": 0.00017499835972635856, "loss": 0.1637, "step": 112 }, { "epoch": 0.9300411522633745, "grad_norm": 0.11341325633796959, "learning_rate": 0.00017435745567042095, "loss": 0.1471, "step": 113 }, { "epoch": 0.9382716049382716, "grad_norm": 0.12238460464162876, "learning_rate": 0.00017370964626433567, "loss": 0.1682, "step": 114 }, { "epoch": 0.9465020576131687, "grad_norm": 0.12574079950832473, "learning_rate": 0.0001730549916681868, "loss": 0.1493, "step": 115 }, { "epoch": 0.9547325102880658, "grad_norm": 0.13251823016582745, "learning_rate": 0.00017239355267775018, "loss": 0.1649, "step": 116 }, { "epoch": 0.9629629629629629, "grad_norm": 0.11699891536949006, "learning_rate": 0.0001717253907188477, "loss": 0.1628, "step": 117 }, { "epoch": 0.9711934156378601, "grad_norm": 0.11316448680114288, "learning_rate": 0.00017105056784164294, "loss": 0.1434, "step": 118 }, { "epoch": 0.9794238683127572, "grad_norm": 0.11531932007410475, "learning_rate": 0.00017036914671487852, "loss": 0.1565, "step": 119 }, { "epoch": 0.9876543209876543, "grad_norm": 0.11423778544173263, "learning_rate": 0.00016968119062005642, "loss": 0.1481, "step": 120 }, { "epoch": 0.9958847736625515, "grad_norm": 0.10808259099105172, "learning_rate": 0.00016898676344556118, "loss": 0.1393, "step": 121 }, { "epoch": 0.9958847736625515, "eval_loss": 0.1582392454147339, "eval_runtime": 24.3633, "eval_samples_per_second": 33.534, "eval_steps_per_second": 1.067, "step": 121 }, { "epoch": 1.0041152263374487, "grad_norm": 0.12179637517786698, "learning_rate": 0.00016828592968072678, "loss": 0.1367, "step": 122 }, { "epoch": 1.0123456790123457, "grad_norm": 0.12278954055202473, "learning_rate": 0.00016757875440984768, "loss": 0.1352, "step": 123 }, { "epoch": 1.0205761316872428, "grad_norm": 0.11262318129903952, "learning_rate": 0.0001668653033061347, "loss": 0.1319, "step": 124 }, { "epoch": 1.02880658436214, "grad_norm": 0.10685919423230769, "learning_rate": 0.00016614564262561608, "loss": 0.1483, "step": 125 }, { "epoch": 1.037037037037037, "grad_norm": 0.12688594869999706, "learning_rate": 0.0001654198392009846, "loss": 0.1345, "step": 126 }, { "epoch": 1.045267489711934, "grad_norm": 0.11380792656533252, "learning_rate": 0.0001646879604353908, "loss": 0.1435, "step": 127 }, { "epoch": 1.0534979423868314, "grad_norm": 0.11336135523754398, "learning_rate": 0.00016395007429618382, "loss": 0.1496, "step": 128 }, { "epoch": 1.0617283950617284, "grad_norm": 0.11068219074387899, "learning_rate": 0.00016320624930859904, "loss": 0.1412, "step": 129 }, { "epoch": 1.0699588477366255, "grad_norm": 0.10745622525472565, "learning_rate": 0.00016245655454939474, "loss": 0.1294, "step": 130 }, { "epoch": 1.0781893004115226, "grad_norm": 0.10201633324761311, "learning_rate": 0.00016170105964043695, "loss": 0.1443, "step": 131 }, { "epoch": 1.0864197530864197, "grad_norm": 0.1137314706455847, "learning_rate": 0.0001609398347422339, "loss": 0.1389, "step": 132 }, { "epoch": 1.0946502057613168, "grad_norm": 0.11204326746406651, "learning_rate": 0.00016017295054742046, "loss": 0.1422, "step": 133 }, { "epoch": 1.102880658436214, "grad_norm": 0.10142584272145459, "learning_rate": 0.00015940047827419303, "loss": 0.1301, "step": 134 }, { "epoch": 1.1111111111111112, "grad_norm": 0.10382986472663408, "learning_rate": 0.00015862248965969604, "loss": 0.1388, "step": 135 }, { "epoch": 1.1193415637860082, "grad_norm": 0.11472916493277631, "learning_rate": 0.00015783905695335946, "loss": 0.1406, "step": 136 }, { "epoch": 1.1275720164609053, "grad_norm": 0.11056711973862805, "learning_rate": 0.0001570502529101896, "loss": 0.1295, "step": 137 }, { "epoch": 1.1358024691358024, "grad_norm": 0.11881161089888494, "learning_rate": 0.00015625615078401244, "loss": 0.1491, "step": 138 }, { "epoch": 1.1440329218106995, "grad_norm": 0.10248132676392517, "learning_rate": 0.00015545682432067067, "loss": 0.1235, "step": 139 }, { "epoch": 1.1522633744855968, "grad_norm": 0.10737832299551002, "learning_rate": 0.0001546523477511754, "loss": 0.15, "step": 140 }, { "epoch": 1.1604938271604939, "grad_norm": 0.10807513117026502, "learning_rate": 0.00015384279578481221, "loss": 0.1302, "step": 141 }, { "epoch": 1.168724279835391, "grad_norm": 0.11235063041052787, "learning_rate": 0.00015302824360220353, "loss": 0.1386, "step": 142 }, { "epoch": 1.176954732510288, "grad_norm": 0.11303603244571206, "learning_rate": 0.00015220876684832638, "loss": 0.1354, "step": 143 }, { "epoch": 1.1851851851851851, "grad_norm": 0.10834449382941344, "learning_rate": 0.0001513844416254879, "loss": 0.1329, "step": 144 }, { "epoch": 1.1934156378600824, "grad_norm": 0.1079013280016857, "learning_rate": 0.00015055534448625766, "loss": 0.1395, "step": 145 }, { "epoch": 1.2016460905349795, "grad_norm": 0.10261607273228789, "learning_rate": 0.00014972155242635852, "loss": 0.129, "step": 146 }, { "epoch": 1.2098765432098766, "grad_norm": 0.11363708966450349, "learning_rate": 0.0001488831428775164, "loss": 0.1461, "step": 147 }, { "epoch": 1.2181069958847737, "grad_norm": 0.11418227389762935, "learning_rate": 0.00014804019370026926, "loss": 0.1408, "step": 148 }, { "epoch": 1.2263374485596708, "grad_norm": 0.11642847183223218, "learning_rate": 0.00014719278317673655, "loss": 0.1462, "step": 149 }, { "epoch": 1.2345679012345678, "grad_norm": 0.11219874980224723, "learning_rate": 0.0001463409900033493, "loss": 0.1302, "step": 150 }, { "epoch": 1.242798353909465, "grad_norm": 0.11039753961672125, "learning_rate": 0.00014548489328354195, "loss": 0.1349, "step": 151 }, { "epoch": 1.2510288065843622, "grad_norm": 0.10772127499535779, "learning_rate": 0.00014462457252040607, "loss": 0.134, "step": 152 }, { "epoch": 1.2592592592592593, "grad_norm": 0.10693684319005581, "learning_rate": 0.00014376010760930728, "loss": 0.1314, "step": 153 }, { "epoch": 1.2674897119341564, "grad_norm": 0.10099951149470732, "learning_rate": 0.00014289157883046568, "loss": 0.1314, "step": 154 }, { "epoch": 1.2757201646090535, "grad_norm": 0.10714073475937398, "learning_rate": 0.0001420190668415002, "loss": 0.1168, "step": 155 }, { "epoch": 1.2839506172839505, "grad_norm": 0.11431029472007842, "learning_rate": 0.00014114265266993846, "loss": 0.1457, "step": 156 }, { "epoch": 1.2921810699588478, "grad_norm": 0.11035801526331707, "learning_rate": 0.00014026241770569197, "loss": 0.1496, "step": 157 }, { "epoch": 1.300411522633745, "grad_norm": 0.10419031063352954, "learning_rate": 0.00013937844369349734, "loss": 0.1323, "step": 158 }, { "epoch": 1.308641975308642, "grad_norm": 0.10823681149718124, "learning_rate": 0.00013849081272532544, "loss": 0.1264, "step": 159 }, { "epoch": 1.316872427983539, "grad_norm": 0.11224858342990347, "learning_rate": 0.00013759960723275732, "loss": 0.1494, "step": 160 }, { "epoch": 1.3251028806584362, "grad_norm": 0.11222637320000997, "learning_rate": 0.00013670490997932922, "loss": 0.1446, "step": 161 }, { "epoch": 1.3333333333333333, "grad_norm": 0.11350101469976674, "learning_rate": 0.00013580680405284664, "loss": 0.1501, "step": 162 }, { "epoch": 1.3415637860082303, "grad_norm": 0.11216990215296897, "learning_rate": 0.00013490537285766808, "loss": 0.1518, "step": 163 }, { "epoch": 1.3497942386831276, "grad_norm": 0.10385126129598882, "learning_rate": 0.00013400070010695966, "loss": 0.1326, "step": 164 }, { "epoch": 1.3580246913580247, "grad_norm": 0.09801204121771982, "learning_rate": 0.00013309286981492085, "loss": 0.1385, "step": 165 }, { "epoch": 1.3662551440329218, "grad_norm": 0.10515015522554948, "learning_rate": 0.00013218196628898233, "loss": 0.1435, "step": 166 }, { "epoch": 1.374485596707819, "grad_norm": 0.11167870227165867, "learning_rate": 0.00013126807412197665, "loss": 0.1469, "step": 167 }, { "epoch": 1.382716049382716, "grad_norm": 0.110692438020908, "learning_rate": 0.0001303512781842824, "loss": 0.1267, "step": 168 }, { "epoch": 1.3909465020576133, "grad_norm": 0.11426160677474222, "learning_rate": 0.00012943166361594242, "loss": 0.1308, "step": 169 }, { "epoch": 1.3991769547325104, "grad_norm": 0.11583690935003194, "learning_rate": 0.00012850931581875723, "loss": 0.1484, "step": 170 }, { "epoch": 1.4074074074074074, "grad_norm": 0.10018605187361156, "learning_rate": 0.00012758432044835392, "loss": 0.141, "step": 171 }, { "epoch": 1.4156378600823045, "grad_norm": 0.10604765462173282, "learning_rate": 0.0001266567634062317, "loss": 0.1291, "step": 172 }, { "epoch": 1.4238683127572016, "grad_norm": 0.10158287607968718, "learning_rate": 0.0001257267308317845, "loss": 0.1276, "step": 173 }, { "epoch": 1.4320987654320987, "grad_norm": 0.10299808028319524, "learning_rate": 0.00012479430909430108, "loss": 0.1317, "step": 174 }, { "epoch": 1.4403292181069958, "grad_norm": 0.1118568576111719, "learning_rate": 0.00012385958478494487, "loss": 0.1288, "step": 175 }, { "epoch": 1.448559670781893, "grad_norm": 0.10353944126895748, "learning_rate": 0.00012292264470871182, "loss": 0.1175, "step": 176 }, { "epoch": 1.4567901234567902, "grad_norm": 0.11318325883515341, "learning_rate": 0.00012198357587636957, "loss": 0.1304, "step": 177 }, { "epoch": 1.4650205761316872, "grad_norm": 0.11698708468284778, "learning_rate": 0.00012104246549637683, "loss": 0.143, "step": 178 }, { "epoch": 1.4732510288065843, "grad_norm": 0.12408074070049697, "learning_rate": 0.00012009940096678452, "loss": 0.1583, "step": 179 }, { "epoch": 1.4814814814814814, "grad_norm": 0.115849428171122, "learning_rate": 0.00011915446986711953, "loss": 0.1401, "step": 180 }, { "epoch": 1.4897119341563787, "grad_norm": 0.10498841318355592, "learning_rate": 0.00011820775995025147, "loss": 0.1404, "step": 181 }, { "epoch": 1.4979423868312758, "grad_norm": 0.11284118297477287, "learning_rate": 0.0001172593591342432, "loss": 0.1479, "step": 182 }, { "epoch": 1.5061728395061729, "grad_norm": 0.1063063021523787, "learning_rate": 0.00011630935549418627, "loss": 0.1223, "step": 183 }, { "epoch": 1.51440329218107, "grad_norm": 0.09875055061391125, "learning_rate": 0.00011535783725402163, "loss": 0.1177, "step": 184 }, { "epoch": 1.522633744855967, "grad_norm": 0.10114695106701617, "learning_rate": 0.00011440489277834645, "loss": 0.1403, "step": 185 }, { "epoch": 1.5308641975308643, "grad_norm": 0.09017689726172246, "learning_rate": 0.0001134506105642081, "loss": 0.1114, "step": 186 }, { "epoch": 1.5390946502057612, "grad_norm": 0.09727031699745013, "learning_rate": 0.00011249507923288562, "loss": 0.1368, "step": 187 }, { "epoch": 1.5473251028806585, "grad_norm": 0.10131253354694934, "learning_rate": 0.0001115383875216598, "loss": 0.1278, "step": 188 }, { "epoch": 1.5555555555555556, "grad_norm": 0.09766756433474798, "learning_rate": 0.00011058062427557229, "loss": 0.1284, "step": 189 }, { "epoch": 1.5637860082304527, "grad_norm": 0.09911328195816287, "learning_rate": 0.00010962187843917497, "loss": 0.1284, "step": 190 }, { "epoch": 1.5720164609053497, "grad_norm": 0.10809617890378591, "learning_rate": 0.0001086622390482699, "loss": 0.1423, "step": 191 }, { "epoch": 1.5802469135802468, "grad_norm": 0.09456158065899406, "learning_rate": 0.00010770179522164079, "loss": 0.1317, "step": 192 }, { "epoch": 1.5884773662551441, "grad_norm": 0.09799719160217063, "learning_rate": 0.0001067406361527768, "loss": 0.1356, "step": 193 }, { "epoch": 1.596707818930041, "grad_norm": 0.09992479118544549, "learning_rate": 0.00010577885110158958, "loss": 0.1292, "step": 194 }, { "epoch": 1.6049382716049383, "grad_norm": 0.09693089475338096, "learning_rate": 0.00010481652938612374, "loss": 0.1391, "step": 195 }, { "epoch": 1.6131687242798354, "grad_norm": 0.09217578544631694, "learning_rate": 0.00010385376037426226, "loss": 0.1152, "step": 196 }, { "epoch": 1.6213991769547325, "grad_norm": 0.09249375316990154, "learning_rate": 0.00010289063347542726, "loss": 0.1154, "step": 197 }, { "epoch": 1.6296296296296298, "grad_norm": 0.09641811645171969, "learning_rate": 0.00010192723813227672, "loss": 0.1182, "step": 198 }, { "epoch": 1.6378600823045266, "grad_norm": 0.09748485166957398, "learning_rate": 0.00010096366381239808, "loss": 0.1338, "step": 199 }, { "epoch": 1.646090534979424, "grad_norm": 0.09383283234337292, "learning_rate": 0.0001, "loss": 0.1235, "step": 200 }, { "epoch": 1.654320987654321, "grad_norm": 0.09670711594290475, "learning_rate": 9.903633618760195e-05, "loss": 0.1222, "step": 201 }, { "epoch": 1.662551440329218, "grad_norm": 0.09682675756651625, "learning_rate": 9.807276186772333e-05, "loss": 0.1271, "step": 202 }, { "epoch": 1.6707818930041154, "grad_norm": 0.09455470748347092, "learning_rate": 9.710936652457276e-05, "loss": 0.1217, "step": 203 }, { "epoch": 1.6790123456790123, "grad_norm": 0.10988834676545467, "learning_rate": 9.614623962573776e-05, "loss": 0.1288, "step": 204 }, { "epoch": 1.6872427983539096, "grad_norm": 0.10415165974330459, "learning_rate": 9.518347061387628e-05, "loss": 0.1307, "step": 205 }, { "epoch": 1.6954732510288066, "grad_norm": 0.10985786659106267, "learning_rate": 9.422114889841044e-05, "loss": 0.1405, "step": 206 }, { "epoch": 1.7037037037037037, "grad_norm": 0.09921801439791741, "learning_rate": 9.325936384722321e-05, "loss": 0.1421, "step": 207 }, { "epoch": 1.7119341563786008, "grad_norm": 0.09695464903081527, "learning_rate": 9.229820477835927e-05, "loss": 0.1261, "step": 208 }, { "epoch": 1.7201646090534979, "grad_norm": 0.09747696306459716, "learning_rate": 9.133776095173015e-05, "loss": 0.1295, "step": 209 }, { "epoch": 1.7283950617283952, "grad_norm": 0.09028526494291234, "learning_rate": 9.037812156082504e-05, "loss": 0.1148, "step": 210 }, { "epoch": 1.736625514403292, "grad_norm": 0.11512926087416764, "learning_rate": 8.941937572442773e-05, "loss": 0.1385, "step": 211 }, { "epoch": 1.7448559670781894, "grad_norm": 0.09719378966338557, "learning_rate": 8.846161247834024e-05, "loss": 0.1308, "step": 212 }, { "epoch": 1.7530864197530864, "grad_norm": 0.09008557356074057, "learning_rate": 8.750492076711439e-05, "loss": 0.1142, "step": 213 }, { "epoch": 1.7613168724279835, "grad_norm": 0.10577290101156246, "learning_rate": 8.654938943579194e-05, "loss": 0.1409, "step": 214 }, { "epoch": 1.7695473251028808, "grad_norm": 0.10122461461639144, "learning_rate": 8.55951072216536e-05, "loss": 0.1314, "step": 215 }, { "epoch": 1.7777777777777777, "grad_norm": 0.09823573980962558, "learning_rate": 8.464216274597838e-05, "loss": 0.1219, "step": 216 }, { "epoch": 1.786008230452675, "grad_norm": 0.10401921170876789, "learning_rate": 8.369064450581373e-05, "loss": 0.144, "step": 217 }, { "epoch": 1.794238683127572, "grad_norm": 0.09078999043742002, "learning_rate": 8.274064086575681e-05, "loss": 0.1146, "step": 218 }, { "epoch": 1.8024691358024691, "grad_norm": 0.09290736424935679, "learning_rate": 8.179224004974857e-05, "loss": 0.1338, "step": 219 }, { "epoch": 1.8106995884773662, "grad_norm": 0.10089995497723929, "learning_rate": 8.084553013288048e-05, "loss": 0.1299, "step": 220 }, { "epoch": 1.8189300411522633, "grad_norm": 0.11086354657368475, "learning_rate": 7.990059903321553e-05, "loss": 0.1413, "step": 221 }, { "epoch": 1.8271604938271606, "grad_norm": 0.10239856014254489, "learning_rate": 7.89575345036232e-05, "loss": 0.1357, "step": 222 }, { "epoch": 1.8353909465020575, "grad_norm": 0.10469014425315878, "learning_rate": 7.801642412363041e-05, "loss": 0.1386, "step": 223 }, { "epoch": 1.8436213991769548, "grad_norm": 0.09567702676769906, "learning_rate": 7.707735529128819e-05, "loss": 0.1191, "step": 224 }, { "epoch": 1.8518518518518519, "grad_norm": 0.09887833758041985, "learning_rate": 7.614041521505517e-05, "loss": 0.1219, "step": 225 }, { "epoch": 1.860082304526749, "grad_norm": 0.11752510500534394, "learning_rate": 7.520569090569893e-05, "loss": 0.1436, "step": 226 }, { "epoch": 1.8683127572016462, "grad_norm": 0.10388061868632578, "learning_rate": 7.427326916821557e-05, "loss": 0.1521, "step": 227 }, { "epoch": 1.876543209876543, "grad_norm": 0.09472507284073291, "learning_rate": 7.334323659376829e-05, "loss": 0.1309, "step": 228 }, { "epoch": 1.8847736625514404, "grad_norm": 0.09368373509928322, "learning_rate": 7.24156795516461e-05, "loss": 0.1226, "step": 229 }, { "epoch": 1.8930041152263375, "grad_norm": 0.09798127382996634, "learning_rate": 7.149068418124281e-05, "loss": 0.134, "step": 230 }, { "epoch": 1.9012345679012346, "grad_norm": 0.09780244211128043, "learning_rate": 7.056833638405762e-05, "loss": 0.1139, "step": 231 }, { "epoch": 1.9094650205761317, "grad_norm": 0.09955863946262253, "learning_rate": 6.964872181571764e-05, "loss": 0.1372, "step": 232 }, { "epoch": 1.9176954732510287, "grad_norm": 0.0999251460303617, "learning_rate": 6.87319258780234e-05, "loss": 0.1302, "step": 233 }, { "epoch": 1.925925925925926, "grad_norm": 0.10537500302863075, "learning_rate": 6.781803371101774e-05, "loss": 0.1361, "step": 234 }, { "epoch": 1.934156378600823, "grad_norm": 0.10646426579724969, "learning_rate": 6.690713018507918e-05, "loss": 0.1382, "step": 235 }, { "epoch": 1.9423868312757202, "grad_norm": 0.09815506133539018, "learning_rate": 6.599929989304035e-05, "loss": 0.1248, "step": 236 }, { "epoch": 1.9506172839506173, "grad_norm": 0.10476726048050965, "learning_rate": 6.509462714233195e-05, "loss": 0.139, "step": 237 }, { "epoch": 1.9588477366255144, "grad_norm": 0.10022243654627384, "learning_rate": 6.419319594715339e-05, "loss": 0.1285, "step": 238 }, { "epoch": 1.9670781893004117, "grad_norm": 0.09626907084774464, "learning_rate": 6.32950900206708e-05, "loss": 0.1345, "step": 239 }, { "epoch": 1.9753086419753085, "grad_norm": 0.09684684548905444, "learning_rate": 6.240039276724272e-05, "loss": 0.1338, "step": 240 }, { "epoch": 1.9835390946502058, "grad_norm": 0.09617321255871392, "learning_rate": 6.150918727467455e-05, "loss": 0.1475, "step": 241 }, { "epoch": 1.991769547325103, "grad_norm": 0.10288685971512879, "learning_rate": 6.062155630650265e-05, "loss": 0.13, "step": 242 }, { "epoch": 2.0, "grad_norm": 0.09997416312505185, "learning_rate": 5.973758229430806e-05, "loss": 0.1282, "step": 243 }, { "epoch": 2.0, "eval_loss": 0.14014942944049835, "eval_runtime": 21.1655, "eval_samples_per_second": 38.601, "eval_steps_per_second": 1.228, "step": 243 }, { "epoch": 2.0082304526748973, "grad_norm": 0.0846538233930147, "learning_rate": 5.885734733006154e-05, "loss": 0.0975, "step": 244 }, { "epoch": 2.016460905349794, "grad_norm": 0.09549609421876461, "learning_rate": 5.798093315849984e-05, "loss": 0.1102, "step": 245 }, { "epoch": 2.0246913580246915, "grad_norm": 0.09559068048673255, "learning_rate": 5.710842116953438e-05, "loss": 0.1025, "step": 246 }, { "epoch": 2.0329218106995883, "grad_norm": 0.10047776150134546, "learning_rate": 5.623989239069275e-05, "loss": 0.1167, "step": 247 }, { "epoch": 2.0411522633744856, "grad_norm": 0.10670187619168023, "learning_rate": 5.537542747959394e-05, "loss": 0.1115, "step": 248 }, { "epoch": 2.049382716049383, "grad_norm": 0.10560096207358394, "learning_rate": 5.451510671645807e-05, "loss": 0.117, "step": 249 }, { "epoch": 2.05761316872428, "grad_norm": 0.10122771696509619, "learning_rate": 5.36590099966507e-05, "loss": 0.1253, "step": 250 }, { "epoch": 2.065843621399177, "grad_norm": 0.09738370841133107, "learning_rate": 5.2807216823263484e-05, "loss": 0.1188, "step": 251 }, { "epoch": 2.074074074074074, "grad_norm": 0.09886958144813014, "learning_rate": 5.1959806299730774e-05, "loss": 0.1237, "step": 252 }, { "epoch": 2.0823045267489713, "grad_norm": 0.10219049090054223, "learning_rate": 5.111685712248364e-05, "loss": 0.1137, "step": 253 }, { "epoch": 2.090534979423868, "grad_norm": 0.09836202650897583, "learning_rate": 5.0278447573641495e-05, "loss": 0.1196, "step": 254 }, { "epoch": 2.0987654320987654, "grad_norm": 0.09213881530201376, "learning_rate": 4.944465551374238e-05, "loss": 0.0999, "step": 255 }, { "epoch": 2.1069958847736627, "grad_norm": 0.09541725924400123, "learning_rate": 4.861555837451213e-05, "loss": 0.1042, "step": 256 }, { "epoch": 2.1152263374485596, "grad_norm": 0.09856792963956205, "learning_rate": 4.779123315167362e-05, "loss": 0.1139, "step": 257 }, { "epoch": 2.123456790123457, "grad_norm": 0.09596664358323789, "learning_rate": 4.6971756397796504e-05, "loss": 0.1127, "step": 258 }, { "epoch": 2.1316872427983538, "grad_norm": 0.10398664110407933, "learning_rate": 4.61572042151878e-05, "loss": 0.1226, "step": 259 }, { "epoch": 2.139917695473251, "grad_norm": 0.09339956051495163, "learning_rate": 4.5347652248824624e-05, "loss": 0.1099, "step": 260 }, { "epoch": 2.148148148148148, "grad_norm": 0.09890330904002925, "learning_rate": 4.4543175679329344e-05, "loss": 0.1195, "step": 261 }, { "epoch": 2.156378600823045, "grad_norm": 0.09997598174399476, "learning_rate": 4.3743849215987595e-05, "loss": 0.1128, "step": 262 }, { "epoch": 2.1646090534979425, "grad_norm": 0.09613540261918692, "learning_rate": 4.294974708981041e-05, "loss": 0.11, "step": 263 }, { "epoch": 2.1728395061728394, "grad_norm": 0.09811570643997117, "learning_rate": 4.216094304664056e-05, "loss": 0.1221, "step": 264 }, { "epoch": 2.1810699588477367, "grad_norm": 0.10130508026732313, "learning_rate": 4.137751034030399e-05, "loss": 0.1147, "step": 265 }, { "epoch": 2.1893004115226335, "grad_norm": 0.10155247373344696, "learning_rate": 4.059952172580694e-05, "loss": 0.1258, "step": 266 }, { "epoch": 2.197530864197531, "grad_norm": 0.10114305608112335, "learning_rate": 3.982704945257957e-05, "loss": 0.1125, "step": 267 }, { "epoch": 2.205761316872428, "grad_norm": 0.09834925536076868, "learning_rate": 3.906016525776611e-05, "loss": 0.1178, "step": 268 }, { "epoch": 2.213991769547325, "grad_norm": 0.10693349155074929, "learning_rate": 3.829894035956306e-05, "loss": 0.125, "step": 269 }, { "epoch": 2.2222222222222223, "grad_norm": 0.09467414992649671, "learning_rate": 3.7543445450605285e-05, "loss": 0.1054, "step": 270 }, { "epoch": 2.230452674897119, "grad_norm": 0.09826633722499567, "learning_rate": 3.6793750691400994e-05, "loss": 0.1066, "step": 271 }, { "epoch": 2.2386831275720165, "grad_norm": 0.10348161204951604, "learning_rate": 3.6049925703816214e-05, "loss": 0.1144, "step": 272 }, { "epoch": 2.246913580246914, "grad_norm": 0.10479446585724977, "learning_rate": 3.53120395646092e-05, "loss": 0.1135, "step": 273 }, { "epoch": 2.2551440329218106, "grad_norm": 0.0951654749026979, "learning_rate": 3.458016079901544e-05, "loss": 0.1074, "step": 274 }, { "epoch": 2.263374485596708, "grad_norm": 0.11076793087444725, "learning_rate": 3.38543573743839e-05, "loss": 0.1299, "step": 275 }, { "epoch": 2.271604938271605, "grad_norm": 0.0946857687815803, "learning_rate": 3.3134696693865316e-05, "loss": 0.106, "step": 276 }, { "epoch": 2.279835390946502, "grad_norm": 0.09607169531950183, "learning_rate": 3.242124559015234e-05, "loss": 0.0966, "step": 277 }, { "epoch": 2.288065843621399, "grad_norm": 0.10167592887197249, "learning_rate": 3.171407031927325e-05, "loss": 0.1162, "step": 278 }, { "epoch": 2.2962962962962963, "grad_norm": 0.09726294827804387, "learning_rate": 3.101323655443882e-05, "loss": 0.1139, "step": 279 }, { "epoch": 2.3045267489711936, "grad_norm": 0.09582434907073016, "learning_rate": 3.031880937994359e-05, "loss": 0.1084, "step": 280 }, { "epoch": 2.3127572016460904, "grad_norm": 0.11021289451624465, "learning_rate": 2.9630853285121508e-05, "loss": 0.1231, "step": 281 }, { "epoch": 2.3209876543209877, "grad_norm": 0.09391459178579925, "learning_rate": 2.894943215835708e-05, "loss": 0.1047, "step": 282 }, { "epoch": 2.3292181069958846, "grad_norm": 0.09899700696295954, "learning_rate": 2.827460928115232e-05, "loss": 0.1101, "step": 283 }, { "epoch": 2.337448559670782, "grad_norm": 0.09901505034589045, "learning_rate": 2.7606447322249872e-05, "loss": 0.1067, "step": 284 }, { "epoch": 2.3456790123456788, "grad_norm": 0.10420896589962987, "learning_rate": 2.6945008331813226e-05, "loss": 0.1282, "step": 285 }, { "epoch": 2.353909465020576, "grad_norm": 0.09729393595160353, "learning_rate": 2.629035373566433e-05, "loss": 0.1023, "step": 286 }, { "epoch": 2.3621399176954734, "grad_norm": 0.09829406149572398, "learning_rate": 2.5642544329579088e-05, "loss": 0.1155, "step": 287 }, { "epoch": 2.3703703703703702, "grad_norm": 0.10275430043325989, "learning_rate": 2.500164027364147e-05, "loss": 0.1252, "step": 288 }, { "epoch": 2.3786008230452675, "grad_norm": 0.10367750094719844, "learning_rate": 2.4367701086656624e-05, "loss": 0.1166, "step": 289 }, { "epoch": 2.386831275720165, "grad_norm": 0.10274988439011652, "learning_rate": 2.3740785640623643e-05, "loss": 0.1169, "step": 290 }, { "epoch": 2.3950617283950617, "grad_norm": 0.09057411016580416, "learning_rate": 2.312095215526814e-05, "loss": 0.1026, "step": 291 }, { "epoch": 2.403292181069959, "grad_norm": 0.104243770759663, "learning_rate": 2.2508258192635612e-05, "loss": 0.1127, "step": 292 }, { "epoch": 2.411522633744856, "grad_norm": 0.09595429188209968, "learning_rate": 2.1902760651745958e-05, "loss": 0.1117, "step": 293 }, { "epoch": 2.419753086419753, "grad_norm": 0.10143248145662324, "learning_rate": 2.1304515763309253e-05, "loss": 0.111, "step": 294 }, { "epoch": 2.42798353909465, "grad_norm": 0.1074361000716321, "learning_rate": 2.0713579084503876e-05, "loss": 0.1222, "step": 295 }, { "epoch": 2.4362139917695473, "grad_norm": 0.09389635392057138, "learning_rate": 2.013000549381706e-05, "loss": 0.1009, "step": 296 }, { "epoch": 2.4444444444444446, "grad_norm": 0.0950080424907371, "learning_rate": 1.9553849185948512e-05, "loss": 0.1111, "step": 297 }, { "epoch": 2.4526748971193415, "grad_norm": 0.09761821744754258, "learning_rate": 1.8985163666777473e-05, "loss": 0.1013, "step": 298 }, { "epoch": 2.460905349794239, "grad_norm": 0.10939996129330187, "learning_rate": 1.8424001748393905e-05, "loss": 0.1215, "step": 299 }, { "epoch": 2.4691358024691357, "grad_norm": 0.09848524806568702, "learning_rate": 1.787041554419381e-05, "loss": 0.1, "step": 300 }, { "epoch": 2.477366255144033, "grad_norm": 0.10208382321734329, "learning_rate": 1.7324456464039752e-05, "loss": 0.1156, "step": 301 }, { "epoch": 2.48559670781893, "grad_norm": 0.10860830565047937, "learning_rate": 1.6786175209486566e-05, "loss": 0.1135, "step": 302 }, { "epoch": 2.493827160493827, "grad_norm": 0.11254066618708665, "learning_rate": 1.6255621769072805e-05, "loss": 0.1202, "step": 303 }, { "epoch": 2.5020576131687244, "grad_norm": 0.09949710536532337, "learning_rate": 1.5732845413678477e-05, "loss": 0.108, "step": 304 }, { "epoch": 2.5102880658436213, "grad_norm": 0.09871004589964077, "learning_rate": 1.521789469194952e-05, "loss": 0.1048, "step": 305 }, { "epoch": 2.5185185185185186, "grad_norm": 0.09509589762300931, "learning_rate": 1.4710817425789014e-05, "loss": 0.1108, "step": 306 }, { "epoch": 2.526748971193416, "grad_norm": 0.09874336616077671, "learning_rate": 1.4211660705916285e-05, "loss": 0.1075, "step": 307 }, { "epoch": 2.5349794238683128, "grad_norm": 0.10610686628994136, "learning_rate": 1.3720470887493719e-05, "loss": 0.114, "step": 308 }, { "epoch": 2.5432098765432096, "grad_norm": 0.09371419408053047, "learning_rate": 1.3237293585821786e-05, "loss": 0.1, "step": 309 }, { "epoch": 2.551440329218107, "grad_norm": 0.097056043473284, "learning_rate": 1.2762173672102996e-05, "loss": 0.1091, "step": 310 }, { "epoch": 2.5596707818930042, "grad_norm": 0.10289500706688874, "learning_rate": 1.2295155269274827e-05, "loss": 0.108, "step": 311 }, { "epoch": 2.567901234567901, "grad_norm": 0.10969046879996797, "learning_rate": 1.1836281747912125e-05, "loss": 0.1231, "step": 312 }, { "epoch": 2.5761316872427984, "grad_norm": 0.09910058926653355, "learning_rate": 1.1385595722199438e-05, "loss": 0.1132, "step": 313 }, { "epoch": 2.5843621399176957, "grad_norm": 0.0973576033135528, "learning_rate": 1.0943139045973549e-05, "loss": 0.1152, "step": 314 }, { "epoch": 2.5925925925925926, "grad_norm": 0.10324605327000957, "learning_rate": 1.050895280883668e-05, "loss": 0.119, "step": 315 }, { "epoch": 2.60082304526749, "grad_norm": 0.09940459072200475, "learning_rate": 1.0083077332340562e-05, "loss": 0.1086, "step": 316 }, { "epoch": 2.6090534979423867, "grad_norm": 0.10054320644748665, "learning_rate": 9.665552166241964e-06, "loss": 0.1149, "step": 317 }, { "epoch": 2.617283950617284, "grad_norm": 0.10479612628242368, "learning_rate": 9.256416084829778e-06, "loss": 0.1274, "step": 318 }, { "epoch": 2.625514403292181, "grad_norm": 0.09669311712494659, "learning_rate": 8.855707083324183e-06, "loss": 0.1165, "step": 319 }, { "epoch": 2.633744855967078, "grad_norm": 0.09887270600607351, "learning_rate": 8.46346237434813e-06, "loss": 0.1056, "step": 320 }, { "epoch": 2.6419753086419755, "grad_norm": 0.10455687155682203, "learning_rate": 8.079718384471557e-06, "loss": 0.1152, "step": 321 }, { "epoch": 2.6502057613168724, "grad_norm": 0.10011518680387445, "learning_rate": 7.704510750828542e-06, "loss": 0.1056, "step": 322 }, { "epoch": 2.6584362139917697, "grad_norm": 0.09682701363965382, "learning_rate": 7.337874317807802e-06, "loss": 0.1034, "step": 323 }, { "epoch": 2.6666666666666665, "grad_norm": 0.10000970149912758, "learning_rate": 6.979843133816743e-06, "loss": 0.1119, "step": 324 }, { "epoch": 2.674897119341564, "grad_norm": 0.10465713299866858, "learning_rate": 6.630450448119618e-06, "loss": 0.1183, "step": 325 }, { "epoch": 2.6831275720164607, "grad_norm": 0.10068814150000575, "learning_rate": 6.289728707749609e-06, "loss": 0.11, "step": 326 }, { "epoch": 2.691358024691358, "grad_norm": 0.09310993381960417, "learning_rate": 5.957709554495683e-06, "loss": 0.103, "step": 327 }, { "epoch": 2.6995884773662553, "grad_norm": 0.10472398535370353, "learning_rate": 5.634423821964074e-06, "loss": 0.1161, "step": 328 }, { "epoch": 2.707818930041152, "grad_norm": 0.10645507272996577, "learning_rate": 5.319901532714877e-06, "loss": 0.1217, "step": 329 }, { "epoch": 2.7160493827160495, "grad_norm": 0.10602188133639034, "learning_rate": 5.014171895473929e-06, "loss": 0.1144, "step": 330 }, { "epoch": 2.7242798353909468, "grad_norm": 0.10289659614865782, "learning_rate": 4.717263302420283e-06, "loss": 0.1079, "step": 331 }, { "epoch": 2.7325102880658436, "grad_norm": 0.09777304769239446, "learning_rate": 4.429203326549525e-06, "loss": 0.0999, "step": 332 }, { "epoch": 2.7407407407407405, "grad_norm": 0.1006102230121861, "learning_rate": 4.1500187191131466e-06, "loss": 0.099, "step": 333 }, { "epoch": 2.748971193415638, "grad_norm": 0.10180225031662725, "learning_rate": 3.879735407134244e-06, "loss": 0.1173, "step": 334 }, { "epoch": 2.757201646090535, "grad_norm": 0.10250402250260239, "learning_rate": 3.6183784909997187e-06, "loss": 0.1139, "step": 335 }, { "epoch": 2.765432098765432, "grad_norm": 0.10615733085722019, "learning_rate": 3.3659722421293783e-06, "loss": 0.1133, "step": 336 }, { "epoch": 2.7736625514403292, "grad_norm": 0.10439855178259218, "learning_rate": 3.1225401007217936e-06, "loss": 0.1119, "step": 337 }, { "epoch": 2.7818930041152266, "grad_norm": 0.10526144305704047, "learning_rate": 2.8881046735775742e-06, "loss": 0.1219, "step": 338 }, { "epoch": 2.7901234567901234, "grad_norm": 0.10521105916873999, "learning_rate": 2.66268773199988e-06, "loss": 0.1074, "step": 339 }, { "epoch": 2.7983539094650207, "grad_norm": 0.10748613106096833, "learning_rate": 2.446310209772684e-06, "loss": 0.1128, "step": 340 }, { "epoch": 2.8065843621399176, "grad_norm": 0.09914523804031186, "learning_rate": 2.2389922012165944e-06, "loss": 0.1121, "step": 341 }, { "epoch": 2.814814814814815, "grad_norm": 0.09936886240495578, "learning_rate": 2.0407529593228116e-06, "loss": 0.108, "step": 342 }, { "epoch": 2.8230452674897117, "grad_norm": 0.09150083914451453, "learning_rate": 1.8516108939651945e-06, "loss": 0.1033, "step": 343 }, { "epoch": 2.831275720164609, "grad_norm": 0.10108462806007419, "learning_rate": 1.6715835701905603e-06, "loss": 0.1126, "step": 344 }, { "epoch": 2.8395061728395063, "grad_norm": 0.09387291972239033, "learning_rate": 1.5006877065874336e-06, "loss": 0.1033, "step": 345 }, { "epoch": 2.847736625514403, "grad_norm": 0.10136947076698849, "learning_rate": 1.3389391737335112e-06, "loss": 0.1104, "step": 346 }, { "epoch": 2.8559670781893005, "grad_norm": 0.09974875048963104, "learning_rate": 1.1863529927217732e-06, "loss": 0.1043, "step": 347 }, { "epoch": 2.8641975308641974, "grad_norm": 0.1025752381453182, "learning_rate": 1.0429433337655115e-06, "loss": 0.1122, "step": 348 }, { "epoch": 2.8724279835390947, "grad_norm": 0.10454016619792661, "learning_rate": 9.087235148824368e-07, "loss": 0.1184, "step": 349 }, { "epoch": 2.8806584362139915, "grad_norm": 0.09589319596591109, "learning_rate": 7.837060006577801e-07, "loss": 0.1046, "step": 350 }, { "epoch": 2.888888888888889, "grad_norm": 0.09451088377127861, "learning_rate": 6.679024010868618e-07, "loss": 0.1103, "step": 351 }, { "epoch": 2.897119341563786, "grad_norm": 0.09770471400992828, "learning_rate": 5.613234704967996e-07, "loss": 0.1027, "step": 352 }, { "epoch": 2.905349794238683, "grad_norm": 0.09543457833989027, "learning_rate": 4.639791065478738e-07, "loss": 0.1025, "step": 353 }, { "epoch": 2.9135802469135803, "grad_norm": 0.10382486799240734, "learning_rate": 3.758783493142737e-07, "loss": 0.118, "step": 354 }, { "epoch": 2.9218106995884776, "grad_norm": 0.1030393593220336, "learning_rate": 2.9702938044468e-07, "loss": 0.1179, "step": 355 }, { "epoch": 2.9300411522633745, "grad_norm": 0.10073287500645238, "learning_rate": 2.2743952240236176e-07, "loss": 0.1058, "step": 356 }, { "epoch": 2.9382716049382713, "grad_norm": 0.10064431575396734, "learning_rate": 1.6711523778520921e-07, "loss": 0.1061, "step": 357 }, { "epoch": 2.9465020576131686, "grad_norm": 0.08907660041993512, "learning_rate": 1.1606212872559141e-07, "loss": 0.0985, "step": 358 }, { "epoch": 2.954732510288066, "grad_norm": 0.10069703961300915, "learning_rate": 7.428493637002821e-08, "loss": 0.1107, "step": 359 }, { "epoch": 2.962962962962963, "grad_norm": 0.09834578070623724, "learning_rate": 4.178754043898669e-08, "loss": 0.1056, "step": 360 }, { "epoch": 2.97119341563786, "grad_norm": 0.10056360393735715, "learning_rate": 1.8572958866514e-08, "loss": 0.1123, "step": 361 }, { "epoch": 2.9794238683127574, "grad_norm": 0.0943621498377698, "learning_rate": 4.643347520005836e-09, "loss": 0.1045, "step": 362 }, { "epoch": 2.9876543209876543, "grad_norm": 0.09983652481727213, "learning_rate": 0.0, "loss": 0.1135, "step": 363 }, { "epoch": 2.9876543209876543, "eval_loss": 0.13842210173606873, "eval_runtime": 21.2699, "eval_samples_per_second": 38.411, "eval_steps_per_second": 1.222, "step": 363 }, { "epoch": 2.9876543209876543, "step": 363, "total_flos": 9.618537139981517e+16, "train_loss": 0.21143763487742953, "train_runtime": 3172.33, "train_samples_per_second": 14.668, "train_steps_per_second": 0.114 } ], "logging_steps": 1, "max_steps": 363, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.618537139981517e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }