{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990049751243781, "eval_steps": 100, "global_step": 251, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003980099502487562, "grad_norm": 3.8580713272094727, "learning_rate": 1e-07, "loss": 1.266, "step": 1 }, { "epoch": 0.007960199004975124, "grad_norm": 3.767099618911743, "learning_rate": 1e-07, "loss": 1.2058, "step": 2 }, { "epoch": 0.011940298507462687, "grad_norm": 3.6648917198181152, "learning_rate": 1e-07, "loss": 1.1977, "step": 3 }, { "epoch": 0.015920398009950248, "grad_norm": 4.032405853271484, "learning_rate": 1e-07, "loss": 1.3709, "step": 4 }, { "epoch": 0.01990049751243781, "grad_norm": 3.6155037879943848, "learning_rate": 1e-07, "loss": 1.1766, "step": 5 }, { "epoch": 0.023880597014925373, "grad_norm": 4.01386022567749, "learning_rate": 1e-07, "loss": 1.322, "step": 6 }, { "epoch": 0.027860696517412936, "grad_norm": 3.610419988632202, "learning_rate": 1e-07, "loss": 1.2293, "step": 7 }, { "epoch": 0.031840796019900496, "grad_norm": 3.849774122238159, "learning_rate": 1e-07, "loss": 1.2478, "step": 8 }, { "epoch": 0.03582089552238806, "grad_norm": 4.040029048919678, "learning_rate": 1e-07, "loss": 1.3693, "step": 9 }, { "epoch": 0.03980099502487562, "grad_norm": 3.2718212604522705, "learning_rate": 1e-07, "loss": 1.1815, "step": 10 }, { "epoch": 0.04378109452736319, "grad_norm": 3.513458490371704, "learning_rate": 1e-07, "loss": 1.2456, "step": 11 }, { "epoch": 0.04776119402985075, "grad_norm": 3.62939453125, "learning_rate": 1e-07, "loss": 1.2566, "step": 12 }, { "epoch": 0.051741293532338306, "grad_norm": 3.0019543170928955, "learning_rate": 1e-07, "loss": 1.1008, "step": 13 }, { "epoch": 0.05572139303482587, "grad_norm": 3.506714105606079, "learning_rate": 1e-07, "loss": 1.2656, "step": 14 }, { "epoch": 0.05970149253731343, "grad_norm": 3.310253858566284, "learning_rate": 1e-07, "loss": 1.2106, "step": 15 }, { "epoch": 0.06368159203980099, "grad_norm": 3.259761095046997, "learning_rate": 1e-07, "loss": 1.2235, "step": 16 }, { "epoch": 0.06766169154228856, "grad_norm": 3.4364163875579834, "learning_rate": 1e-07, "loss": 1.2248, "step": 17 }, { "epoch": 0.07164179104477612, "grad_norm": 3.2793073654174805, "learning_rate": 1e-07, "loss": 1.2489, "step": 18 }, { "epoch": 0.07562189054726368, "grad_norm": 3.304138660430908, "learning_rate": 1e-07, "loss": 1.2144, "step": 19 }, { "epoch": 0.07960199004975124, "grad_norm": 3.531829833984375, "learning_rate": 1e-07, "loss": 1.2233, "step": 20 }, { "epoch": 0.08358208955223881, "grad_norm": 2.7901320457458496, "learning_rate": 1e-07, "loss": 1.2514, "step": 21 }, { "epoch": 0.08756218905472637, "grad_norm": 2.325275182723999, "learning_rate": 1e-07, "loss": 1.1432, "step": 22 }, { "epoch": 0.09154228855721393, "grad_norm": 2.518044948577881, "learning_rate": 1e-07, "loss": 1.2058, "step": 23 }, { "epoch": 0.0955223880597015, "grad_norm": 2.309213161468506, "learning_rate": 1e-07, "loss": 1.1712, "step": 24 }, { "epoch": 0.09950248756218906, "grad_norm": 2.2889468669891357, "learning_rate": 1e-07, "loss": 1.1712, "step": 25 }, { "epoch": 0.10348258706467661, "grad_norm": 2.4013020992279053, "learning_rate": 1e-07, "loss": 1.1562, "step": 26 }, { "epoch": 0.10746268656716418, "grad_norm": 2.434525728225708, "learning_rate": 1e-07, "loss": 1.2198, "step": 27 }, { "epoch": 0.11144278606965174, "grad_norm": 2.251544952392578, "learning_rate": 1e-07, "loss": 1.1243, "step": 28 }, { "epoch": 0.1154228855721393, "grad_norm": 2.2087714672088623, "learning_rate": 1e-07, "loss": 1.0644, "step": 29 }, { "epoch": 0.11940298507462686, "grad_norm": 2.489380359649658, "learning_rate": 1e-07, "loss": 1.2045, "step": 30 }, { "epoch": 0.12338308457711443, "grad_norm": 2.4045751094818115, "learning_rate": 1e-07, "loss": 1.2104, "step": 31 }, { "epoch": 0.12736318407960198, "grad_norm": 2.1436238288879395, "learning_rate": 1e-07, "loss": 1.0513, "step": 32 }, { "epoch": 0.13134328358208955, "grad_norm": 2.2431857585906982, "learning_rate": 1e-07, "loss": 1.1339, "step": 33 }, { "epoch": 0.13532338308457711, "grad_norm": 2.311044454574585, "learning_rate": 1e-07, "loss": 1.1645, "step": 34 }, { "epoch": 0.13930348258706468, "grad_norm": 2.4365642070770264, "learning_rate": 1e-07, "loss": 1.1917, "step": 35 }, { "epoch": 0.14328358208955225, "grad_norm": 2.1745855808258057, "learning_rate": 1e-07, "loss": 1.1828, "step": 36 }, { "epoch": 0.1472636815920398, "grad_norm": 2.0474634170532227, "learning_rate": 1e-07, "loss": 1.1491, "step": 37 }, { "epoch": 0.15124378109452735, "grad_norm": 1.9449026584625244, "learning_rate": 1e-07, "loss": 1.08, "step": 38 }, { "epoch": 0.15522388059701492, "grad_norm": 3.04811692237854, "learning_rate": 1e-07, "loss": 1.1285, "step": 39 }, { "epoch": 0.15920398009950248, "grad_norm": 3.6523616313934326, "learning_rate": 1e-07, "loss": 1.1066, "step": 40 }, { "epoch": 0.16318407960199005, "grad_norm": 3.586474657058716, "learning_rate": 1e-07, "loss": 1.0668, "step": 41 }, { "epoch": 0.16716417910447762, "grad_norm": 3.7307212352752686, "learning_rate": 1e-07, "loss": 1.0895, "step": 42 }, { "epoch": 0.17114427860696518, "grad_norm": 3.6780145168304443, "learning_rate": 1e-07, "loss": 1.0969, "step": 43 }, { "epoch": 0.17512437810945275, "grad_norm": 3.404796600341797, "learning_rate": 1e-07, "loss": 1.0623, "step": 44 }, { "epoch": 0.1791044776119403, "grad_norm": 2.9056687355041504, "learning_rate": 1e-07, "loss": 1.0852, "step": 45 }, { "epoch": 0.18308457711442785, "grad_norm": 2.536485195159912, "learning_rate": 1e-07, "loss": 1.1154, "step": 46 }, { "epoch": 0.18706467661691542, "grad_norm": 2.7943172454833984, "learning_rate": 1e-07, "loss": 1.1182, "step": 47 }, { "epoch": 0.191044776119403, "grad_norm": 2.3565714359283447, "learning_rate": 1e-07, "loss": 1.0874, "step": 48 }, { "epoch": 0.19502487562189055, "grad_norm": 2.36067271232605, "learning_rate": 1e-07, "loss": 1.0897, "step": 49 }, { "epoch": 0.19900497512437812, "grad_norm": 1.958436369895935, "learning_rate": 1e-07, "loss": 1.0027, "step": 50 }, { "epoch": 0.20298507462686566, "grad_norm": 2.0697078704833984, "learning_rate": 1e-07, "loss": 1.0524, "step": 51 }, { "epoch": 0.20696517412935322, "grad_norm": 2.2829782962799072, "learning_rate": 1e-07, "loss": 1.0981, "step": 52 }, { "epoch": 0.2109452736318408, "grad_norm": 2.299952507019043, "learning_rate": 1e-07, "loss": 1.0779, "step": 53 }, { "epoch": 0.21492537313432836, "grad_norm": 2.0332202911376953, "learning_rate": 1e-07, "loss": 0.9759, "step": 54 }, { "epoch": 0.21890547263681592, "grad_norm": 2.410606622695923, "learning_rate": 1e-07, "loss": 1.0155, "step": 55 }, { "epoch": 0.2228855721393035, "grad_norm": 2.6469626426696777, "learning_rate": 1e-07, "loss": 1.1728, "step": 56 }, { "epoch": 0.22686567164179106, "grad_norm": 2.503451347351074, "learning_rate": 1e-07, "loss": 1.0717, "step": 57 }, { "epoch": 0.2308457711442786, "grad_norm": 2.1651253700256348, "learning_rate": 1e-07, "loss": 1.0946, "step": 58 }, { "epoch": 0.23482587064676616, "grad_norm": 2.1998214721679688, "learning_rate": 1e-07, "loss": 1.0693, "step": 59 }, { "epoch": 0.23880597014925373, "grad_norm": 2.221885919570923, "learning_rate": 1e-07, "loss": 1.0597, "step": 60 }, { "epoch": 0.2427860696517413, "grad_norm": 2.011728525161743, "learning_rate": 1e-07, "loss": 1.0108, "step": 61 }, { "epoch": 0.24676616915422886, "grad_norm": 2.034902334213257, "learning_rate": 1e-07, "loss": 1.0204, "step": 62 }, { "epoch": 0.2507462686567164, "grad_norm": 2.29103946685791, "learning_rate": 1e-07, "loss": 1.0653, "step": 63 }, { "epoch": 0.25472636815920396, "grad_norm": 2.3828279972076416, "learning_rate": 1e-07, "loss": 1.0372, "step": 64 }, { "epoch": 0.25870646766169153, "grad_norm": 2.386139392852783, "learning_rate": 1e-07, "loss": 1.0268, "step": 65 }, { "epoch": 0.2626865671641791, "grad_norm": 2.423414707183838, "learning_rate": 1e-07, "loss": 1.0457, "step": 66 }, { "epoch": 0.26666666666666666, "grad_norm": 2.455092191696167, "learning_rate": 1e-07, "loss": 1.0568, "step": 67 }, { "epoch": 0.27064676616915423, "grad_norm": 2.3402390480041504, "learning_rate": 1e-07, "loss": 1.016, "step": 68 }, { "epoch": 0.2746268656716418, "grad_norm": 2.493511199951172, "learning_rate": 1e-07, "loss": 1.0211, "step": 69 }, { "epoch": 0.27860696517412936, "grad_norm": 2.6465377807617188, "learning_rate": 1e-07, "loss": 1.0279, "step": 70 }, { "epoch": 0.28258706467661693, "grad_norm": 3.05757999420166, "learning_rate": 1e-07, "loss": 0.9864, "step": 71 }, { "epoch": 0.2865671641791045, "grad_norm": 2.768735408782959, "learning_rate": 1e-07, "loss": 1.0058, "step": 72 }, { "epoch": 0.29054726368159206, "grad_norm": 2.534254550933838, "learning_rate": 1e-07, "loss": 0.9786, "step": 73 }, { "epoch": 0.2945273631840796, "grad_norm": 1.736907720565796, "learning_rate": 1e-07, "loss": 0.9466, "step": 74 }, { "epoch": 0.29850746268656714, "grad_norm": 1.4108011722564697, "learning_rate": 1e-07, "loss": 0.9232, "step": 75 }, { "epoch": 0.3024875621890547, "grad_norm": 1.571252465248108, "learning_rate": 1e-07, "loss": 0.9139, "step": 76 }, { "epoch": 0.30646766169154227, "grad_norm": 1.639280915260315, "learning_rate": 1e-07, "loss": 0.9272, "step": 77 }, { "epoch": 0.31044776119402984, "grad_norm": 1.46147882938385, "learning_rate": 1e-07, "loss": 0.8913, "step": 78 }, { "epoch": 0.3144278606965174, "grad_norm": 1.1830153465270996, "learning_rate": 1e-07, "loss": 0.879, "step": 79 }, { "epoch": 0.31840796019900497, "grad_norm": 1.535416841506958, "learning_rate": 1e-07, "loss": 0.8703, "step": 80 }, { "epoch": 0.32238805970149254, "grad_norm": 1.8527772426605225, "learning_rate": 1e-07, "loss": 0.8882, "step": 81 }, { "epoch": 0.3263681592039801, "grad_norm": 1.4239050149917603, "learning_rate": 1e-07, "loss": 0.9042, "step": 82 }, { "epoch": 0.33034825870646767, "grad_norm": 1.9072166681289673, "learning_rate": 1e-07, "loss": 0.9118, "step": 83 }, { "epoch": 0.33432835820895523, "grad_norm": 1.8409007787704468, "learning_rate": 1e-07, "loss": 0.9043, "step": 84 }, { "epoch": 0.3383084577114428, "grad_norm": 1.3596872091293335, "learning_rate": 1e-07, "loss": 0.8902, "step": 85 }, { "epoch": 0.34228855721393037, "grad_norm": 1.314673900604248, "learning_rate": 1e-07, "loss": 0.8509, "step": 86 }, { "epoch": 0.34626865671641793, "grad_norm": 1.4804043769836426, "learning_rate": 1e-07, "loss": 0.8603, "step": 87 }, { "epoch": 0.3502487562189055, "grad_norm": 1.3986847400665283, "learning_rate": 1e-07, "loss": 0.8841, "step": 88 }, { "epoch": 0.354228855721393, "grad_norm": 1.3893916606903076, "learning_rate": 1e-07, "loss": 0.8705, "step": 89 }, { "epoch": 0.3582089552238806, "grad_norm": 1.4482388496398926, "learning_rate": 1e-07, "loss": 0.8706, "step": 90 }, { "epoch": 0.36218905472636814, "grad_norm": 1.446069359779358, "learning_rate": 1e-07, "loss": 0.8445, "step": 91 }, { "epoch": 0.3661691542288557, "grad_norm": 1.493567943572998, "learning_rate": 1e-07, "loss": 0.8823, "step": 92 }, { "epoch": 0.3701492537313433, "grad_norm": 1.6904833316802979, "learning_rate": 1e-07, "loss": 0.8494, "step": 93 }, { "epoch": 0.37412935323383084, "grad_norm": 1.469353199005127, "learning_rate": 1e-07, "loss": 0.8808, "step": 94 }, { "epoch": 0.3781094527363184, "grad_norm": 1.498508334159851, "learning_rate": 1e-07, "loss": 0.8738, "step": 95 }, { "epoch": 0.382089552238806, "grad_norm": 1.3189642429351807, "learning_rate": 1e-07, "loss": 0.8181, "step": 96 }, { "epoch": 0.38606965174129354, "grad_norm": 1.2873642444610596, "learning_rate": 1e-07, "loss": 0.8537, "step": 97 }, { "epoch": 0.3900497512437811, "grad_norm": 1.366760492324829, "learning_rate": 1e-07, "loss": 0.7977, "step": 98 }, { "epoch": 0.3940298507462687, "grad_norm": 1.363909125328064, "learning_rate": 1e-07, "loss": 0.9297, "step": 99 }, { "epoch": 0.39800995024875624, "grad_norm": 2.0010488033294678, "learning_rate": 1e-07, "loss": 0.8176, "step": 100 }, { "epoch": 0.4019900497512438, "grad_norm": 1.9680029153823853, "learning_rate": 1e-07, "loss": 0.818, "step": 101 }, { "epoch": 0.4059701492537313, "grad_norm": 1.674877405166626, "learning_rate": 1e-07, "loss": 0.8414, "step": 102 }, { "epoch": 0.4099502487562189, "grad_norm": 1.5867595672607422, "learning_rate": 1e-07, "loss": 0.7887, "step": 103 }, { "epoch": 0.41393034825870645, "grad_norm": 1.6069676876068115, "learning_rate": 1e-07, "loss": 0.9112, "step": 104 }, { "epoch": 0.417910447761194, "grad_norm": 1.5387823581695557, "learning_rate": 1e-07, "loss": 0.8173, "step": 105 }, { "epoch": 0.4218905472636816, "grad_norm": 1.2036306858062744, "learning_rate": 1e-07, "loss": 0.7781, "step": 106 }, { "epoch": 0.42587064676616915, "grad_norm": 1.4114856719970703, "learning_rate": 1e-07, "loss": 0.8078, "step": 107 }, { "epoch": 0.4298507462686567, "grad_norm": 1.2400673627853394, "learning_rate": 1e-07, "loss": 0.8003, "step": 108 }, { "epoch": 0.4338308457711443, "grad_norm": 1.3319138288497925, "learning_rate": 1e-07, "loss": 0.7651, "step": 109 }, { "epoch": 0.43781094527363185, "grad_norm": 1.2857623100280762, "learning_rate": 1e-07, "loss": 0.7556, "step": 110 }, { "epoch": 0.4417910447761194, "grad_norm": 1.5332382917404175, "learning_rate": 1e-07, "loss": 0.7802, "step": 111 }, { "epoch": 0.445771144278607, "grad_norm": 1.331517219543457, "learning_rate": 1e-07, "loss": 0.7785, "step": 112 }, { "epoch": 0.44975124378109455, "grad_norm": 1.271018147468567, "learning_rate": 1e-07, "loss": 0.7571, "step": 113 }, { "epoch": 0.4537313432835821, "grad_norm": 1.3966182470321655, "learning_rate": 1e-07, "loss": 0.7302, "step": 114 }, { "epoch": 0.4577114427860697, "grad_norm": 1.2569191455841064, "learning_rate": 1e-07, "loss": 0.725, "step": 115 }, { "epoch": 0.4616915422885572, "grad_norm": 1.4532155990600586, "learning_rate": 1e-07, "loss": 0.6981, "step": 116 }, { "epoch": 0.46567164179104475, "grad_norm": 1.393580436706543, "learning_rate": 1e-07, "loss": 0.6814, "step": 117 }, { "epoch": 0.4696517412935323, "grad_norm": 1.4711062908172607, "learning_rate": 1e-07, "loss": 0.6821, "step": 118 }, { "epoch": 0.4736318407960199, "grad_norm": 2.436079740524292, "learning_rate": 1e-07, "loss": 0.8096, "step": 119 }, { "epoch": 0.47761194029850745, "grad_norm": 1.2138607501983643, "learning_rate": 1e-07, "loss": 0.7009, "step": 120 }, { "epoch": 0.481592039800995, "grad_norm": 1.053438425064087, "learning_rate": 1e-07, "loss": 0.6848, "step": 121 }, { "epoch": 0.4855721393034826, "grad_norm": 1.3829809427261353, "learning_rate": 1e-07, "loss": 0.7276, "step": 122 }, { "epoch": 0.48955223880597015, "grad_norm": 1.6276544332504272, "learning_rate": 1e-07, "loss": 0.7052, "step": 123 }, { "epoch": 0.4935323383084577, "grad_norm": 1.399953007698059, "learning_rate": 1e-07, "loss": 0.7116, "step": 124 }, { "epoch": 0.4975124378109453, "grad_norm": 1.118137001991272, "learning_rate": 1e-07, "loss": 0.6912, "step": 125 }, { "epoch": 0.5014925373134328, "grad_norm": 0.9562084674835205, "learning_rate": 1e-07, "loss": 0.6726, "step": 126 }, { "epoch": 0.5054726368159204, "grad_norm": 1.0179568529129028, "learning_rate": 1e-07, "loss": 0.6047, "step": 127 }, { "epoch": 0.5094527363184079, "grad_norm": 0.950977087020874, "learning_rate": 1e-07, "loss": 0.6962, "step": 128 }, { "epoch": 0.5134328358208955, "grad_norm": 0.7613106966018677, "learning_rate": 1e-07, "loss": 0.6381, "step": 129 }, { "epoch": 0.5174129353233831, "grad_norm": 0.9950020909309387, "learning_rate": 1e-07, "loss": 0.6478, "step": 130 }, { "epoch": 0.5213930348258706, "grad_norm": 1.0094377994537354, "learning_rate": 1e-07, "loss": 0.6558, "step": 131 }, { "epoch": 0.5253731343283582, "grad_norm": 1.064998745918274, "learning_rate": 1e-07, "loss": 0.6483, "step": 132 }, { "epoch": 0.5293532338308458, "grad_norm": 0.7481135725975037, "learning_rate": 1e-07, "loss": 0.6469, "step": 133 }, { "epoch": 0.5333333333333333, "grad_norm": 0.9318634867668152, "learning_rate": 1e-07, "loss": 0.6822, "step": 134 }, { "epoch": 0.5373134328358209, "grad_norm": 0.7420485615730286, "learning_rate": 1e-07, "loss": 0.5645, "step": 135 }, { "epoch": 0.5412935323383085, "grad_norm": 0.7259682416915894, "learning_rate": 1e-07, "loss": 0.6168, "step": 136 }, { "epoch": 0.545273631840796, "grad_norm": 0.6080716252326965, "learning_rate": 1e-07, "loss": 0.5775, "step": 137 }, { "epoch": 0.5492537313432836, "grad_norm": 0.7300984263420105, "learning_rate": 1e-07, "loss": 0.6141, "step": 138 }, { "epoch": 0.5532338308457712, "grad_norm": 0.8576564192771912, "learning_rate": 1e-07, "loss": 0.5893, "step": 139 }, { "epoch": 0.5572139303482587, "grad_norm": 0.9741348624229431, "learning_rate": 1e-07, "loss": 0.6059, "step": 140 }, { "epoch": 0.5611940298507463, "grad_norm": 1.1145509481430054, "learning_rate": 1e-07, "loss": 0.6597, "step": 141 }, { "epoch": 0.5651741293532339, "grad_norm": 0.8707802891731262, "learning_rate": 1e-07, "loss": 0.6206, "step": 142 }, { "epoch": 0.5691542288557214, "grad_norm": 1.1013362407684326, "learning_rate": 1e-07, "loss": 0.6066, "step": 143 }, { "epoch": 0.573134328358209, "grad_norm": 1.1516404151916504, "learning_rate": 1e-07, "loss": 0.6594, "step": 144 }, { "epoch": 0.5771144278606966, "grad_norm": 0.7887628078460693, "learning_rate": 1e-07, "loss": 0.5577, "step": 145 }, { "epoch": 0.5810945273631841, "grad_norm": 0.8240315318107605, "learning_rate": 1e-07, "loss": 0.5911, "step": 146 }, { "epoch": 0.5850746268656717, "grad_norm": 0.6422097682952881, "learning_rate": 1e-07, "loss": 0.5345, "step": 147 }, { "epoch": 0.5890547263681593, "grad_norm": 0.6938869953155518, "learning_rate": 1e-07, "loss": 0.5884, "step": 148 }, { "epoch": 0.5930348258706468, "grad_norm": 1.2094863653182983, "learning_rate": 1e-07, "loss": 0.6102, "step": 149 }, { "epoch": 0.5970149253731343, "grad_norm": 0.876981258392334, "learning_rate": 1e-07, "loss": 0.6256, "step": 150 }, { "epoch": 0.6009950248756218, "grad_norm": 0.6353152394294739, "learning_rate": 1e-07, "loss": 0.5946, "step": 151 }, { "epoch": 0.6049751243781094, "grad_norm": 0.7054216265678406, "learning_rate": 1e-07, "loss": 0.5915, "step": 152 }, { "epoch": 0.608955223880597, "grad_norm": 0.6976187229156494, "learning_rate": 1e-07, "loss": 0.5769, "step": 153 }, { "epoch": 0.6129353233830845, "grad_norm": 0.727631688117981, "learning_rate": 1e-07, "loss": 0.6159, "step": 154 }, { "epoch": 0.6169154228855721, "grad_norm": 0.5325585007667542, "learning_rate": 1e-07, "loss": 0.6424, "step": 155 }, { "epoch": 0.6208955223880597, "grad_norm": 0.7219927906990051, "learning_rate": 1e-07, "loss": 0.599, "step": 156 }, { "epoch": 0.6248756218905472, "grad_norm": 0.4903685748577118, "learning_rate": 1e-07, "loss": 0.5413, "step": 157 }, { "epoch": 0.6288557213930348, "grad_norm": 0.6661394238471985, "learning_rate": 1e-07, "loss": 0.5479, "step": 158 }, { "epoch": 0.6328358208955224, "grad_norm": 0.9018793702125549, "learning_rate": 1e-07, "loss": 0.5804, "step": 159 }, { "epoch": 0.6368159203980099, "grad_norm": 0.5995965600013733, "learning_rate": 1e-07, "loss": 0.596, "step": 160 }, { "epoch": 0.6407960199004975, "grad_norm": 0.5746481418609619, "learning_rate": 1e-07, "loss": 0.5606, "step": 161 }, { "epoch": 0.6447761194029851, "grad_norm": 0.9206658005714417, "learning_rate": 1e-07, "loss": 0.6063, "step": 162 }, { "epoch": 0.6487562189054726, "grad_norm": 0.47769197821617126, "learning_rate": 1e-07, "loss": 0.5873, "step": 163 }, { "epoch": 0.6527363184079602, "grad_norm": 0.5097990036010742, "learning_rate": 1e-07, "loss": 0.6004, "step": 164 }, { "epoch": 0.6567164179104478, "grad_norm": 0.5074031949043274, "learning_rate": 1e-07, "loss": 0.6, "step": 165 }, { "epoch": 0.6606965174129353, "grad_norm": 0.45879438519477844, "learning_rate": 1e-07, "loss": 0.5441, "step": 166 }, { "epoch": 0.6646766169154229, "grad_norm": 0.6278098821640015, "learning_rate": 1e-07, "loss": 0.5243, "step": 167 }, { "epoch": 0.6686567164179105, "grad_norm": 0.6123387217521667, "learning_rate": 1e-07, "loss": 0.5989, "step": 168 }, { "epoch": 0.672636815920398, "grad_norm": 0.6250973343849182, "learning_rate": 1e-07, "loss": 0.5789, "step": 169 }, { "epoch": 0.6766169154228856, "grad_norm": 0.8891845941543579, "learning_rate": 1e-07, "loss": 0.5517, "step": 170 }, { "epoch": 0.6805970149253732, "grad_norm": 0.6342730522155762, "learning_rate": 1e-07, "loss": 0.5602, "step": 171 }, { "epoch": 0.6845771144278607, "grad_norm": 0.7926033735275269, "learning_rate": 1e-07, "loss": 0.5956, "step": 172 }, { "epoch": 0.6885572139303483, "grad_norm": 0.45265182852745056, "learning_rate": 1e-07, "loss": 0.5617, "step": 173 }, { "epoch": 0.6925373134328359, "grad_norm": 0.5704881548881531, "learning_rate": 1e-07, "loss": 0.5962, "step": 174 }, { "epoch": 0.6965174129353234, "grad_norm": 0.5954543352127075, "learning_rate": 1e-07, "loss": 0.5436, "step": 175 }, { "epoch": 0.700497512437811, "grad_norm": 0.5004026293754578, "learning_rate": 1e-07, "loss": 0.5666, "step": 176 }, { "epoch": 0.7044776119402985, "grad_norm": 0.5121272206306458, "learning_rate": 1e-07, "loss": 0.5977, "step": 177 }, { "epoch": 0.708457711442786, "grad_norm": 0.5957807898521423, "learning_rate": 1e-07, "loss": 0.5512, "step": 178 }, { "epoch": 0.7124378109452736, "grad_norm": 0.6544281840324402, "learning_rate": 1e-07, "loss": 0.5805, "step": 179 }, { "epoch": 0.7164179104477612, "grad_norm": 0.4494013786315918, "learning_rate": 1e-07, "loss": 0.537, "step": 180 }, { "epoch": 0.7203980099502487, "grad_norm": 0.497487336397171, "learning_rate": 1e-07, "loss": 0.5499, "step": 181 }, { "epoch": 0.7243781094527363, "grad_norm": 0.7070022225379944, "learning_rate": 1e-07, "loss": 0.545, "step": 182 }, { "epoch": 0.7283582089552239, "grad_norm": 0.7148118615150452, "learning_rate": 1e-07, "loss": 0.5912, "step": 183 }, { "epoch": 0.7323383084577114, "grad_norm": 0.4213866889476776, "learning_rate": 1e-07, "loss": 0.5878, "step": 184 }, { "epoch": 0.736318407960199, "grad_norm": 0.4360056221485138, "learning_rate": 1e-07, "loss": 0.6289, "step": 185 }, { "epoch": 0.7402985074626866, "grad_norm": 0.5059033632278442, "learning_rate": 1e-07, "loss": 0.5813, "step": 186 }, { "epoch": 0.7442786069651741, "grad_norm": 0.6717720031738281, "learning_rate": 1e-07, "loss": 0.5539, "step": 187 }, { "epoch": 0.7482587064676617, "grad_norm": 0.736268162727356, "learning_rate": 1e-07, "loss": 0.5862, "step": 188 }, { "epoch": 0.7522388059701492, "grad_norm": 0.5463230609893799, "learning_rate": 1e-07, "loss": 0.5603, "step": 189 }, { "epoch": 0.7562189054726368, "grad_norm": 0.4267301857471466, "learning_rate": 1e-07, "loss": 0.5204, "step": 190 }, { "epoch": 0.7601990049751244, "grad_norm": 0.5766838192939758, "learning_rate": 1e-07, "loss": 0.5404, "step": 191 }, { "epoch": 0.764179104477612, "grad_norm": 0.47452178597450256, "learning_rate": 1e-07, "loss": 0.5719, "step": 192 }, { "epoch": 0.7681592039800995, "grad_norm": 0.496635377407074, "learning_rate": 1e-07, "loss": 0.5583, "step": 193 }, { "epoch": 0.7721393034825871, "grad_norm": 0.6146615147590637, "learning_rate": 1e-07, "loss": 0.5705, "step": 194 }, { "epoch": 0.7761194029850746, "grad_norm": 0.4717157781124115, "learning_rate": 1e-07, "loss": 0.5237, "step": 195 }, { "epoch": 0.7800995024875622, "grad_norm": 0.5160589218139648, "learning_rate": 1e-07, "loss": 0.5903, "step": 196 }, { "epoch": 0.7840796019900498, "grad_norm": 0.5136131644248962, "learning_rate": 1e-07, "loss": 0.5902, "step": 197 }, { "epoch": 0.7880597014925373, "grad_norm": 0.46506819128990173, "learning_rate": 1e-07, "loss": 0.5339, "step": 198 }, { "epoch": 0.7920398009950249, "grad_norm": 0.39230668544769287, "learning_rate": 1e-07, "loss": 0.5582, "step": 199 }, { "epoch": 0.7960199004975125, "grad_norm": 0.4060211777687073, "learning_rate": 1e-07, "loss": 0.5464, "step": 200 }, { "epoch": 0.8, "grad_norm": 0.4357733428478241, "learning_rate": 1e-07, "loss": 0.5187, "step": 201 }, { "epoch": 0.8039800995024876, "grad_norm": 0.5215465426445007, "learning_rate": 1e-07, "loss": 0.576, "step": 202 }, { "epoch": 0.8079601990049752, "grad_norm": 0.41915613412857056, "learning_rate": 1e-07, "loss": 0.5453, "step": 203 }, { "epoch": 0.8119402985074626, "grad_norm": 0.4252179265022278, "learning_rate": 1e-07, "loss": 0.6138, "step": 204 }, { "epoch": 0.8159203980099502, "grad_norm": 0.5336969494819641, "learning_rate": 1e-07, "loss": 0.6403, "step": 205 }, { "epoch": 0.8199004975124378, "grad_norm": 0.655704915523529, "learning_rate": 1e-07, "loss": 0.5871, "step": 206 }, { "epoch": 0.8238805970149253, "grad_norm": 0.5526642203330994, "learning_rate": 1e-07, "loss": 0.5848, "step": 207 }, { "epoch": 0.8278606965174129, "grad_norm": 0.3965601623058319, "learning_rate": 1e-07, "loss": 0.5439, "step": 208 }, { "epoch": 0.8318407960199005, "grad_norm": 0.3756254017353058, "learning_rate": 1e-07, "loss": 0.544, "step": 209 }, { "epoch": 0.835820895522388, "grad_norm": 0.5047779083251953, "learning_rate": 1e-07, "loss": 0.5752, "step": 210 }, { "epoch": 0.8398009950248756, "grad_norm": 0.389487624168396, "learning_rate": 1e-07, "loss": 0.5365, "step": 211 }, { "epoch": 0.8437810945273632, "grad_norm": 0.46802979707717896, "learning_rate": 1e-07, "loss": 0.6019, "step": 212 }, { "epoch": 0.8477611940298507, "grad_norm": 0.5024377703666687, "learning_rate": 1e-07, "loss": 0.5073, "step": 213 }, { "epoch": 0.8517412935323383, "grad_norm": 0.5403252243995667, "learning_rate": 1e-07, "loss": 0.5839, "step": 214 }, { "epoch": 0.8557213930348259, "grad_norm": 0.3976481556892395, "learning_rate": 1e-07, "loss": 0.5703, "step": 215 }, { "epoch": 0.8597014925373134, "grad_norm": 0.44842153787612915, "learning_rate": 1e-07, "loss": 0.539, "step": 216 }, { "epoch": 0.863681592039801, "grad_norm": 0.5411170721054077, "learning_rate": 1e-07, "loss": 0.572, "step": 217 }, { "epoch": 0.8676616915422886, "grad_norm": 0.7187299728393555, "learning_rate": 1e-07, "loss": 0.5798, "step": 218 }, { "epoch": 0.8716417910447761, "grad_norm": 0.5184516310691833, "learning_rate": 1e-07, "loss": 0.5325, "step": 219 }, { "epoch": 0.8756218905472637, "grad_norm": 0.3858243525028229, "learning_rate": 1e-07, "loss": 0.5687, "step": 220 }, { "epoch": 0.8796019900497513, "grad_norm": 0.3441164493560791, "learning_rate": 1e-07, "loss": 0.5146, "step": 221 }, { "epoch": 0.8835820895522388, "grad_norm": 0.4126099944114685, "learning_rate": 1e-07, "loss": 0.5639, "step": 222 }, { "epoch": 0.8875621890547264, "grad_norm": 0.6766132712364197, "learning_rate": 1e-07, "loss": 0.5536, "step": 223 }, { "epoch": 0.891542288557214, "grad_norm": 0.38082340359687805, "learning_rate": 1e-07, "loss": 0.5399, "step": 224 }, { "epoch": 0.8955223880597015, "grad_norm": 0.46175238490104675, "learning_rate": 1e-07, "loss": 0.5954, "step": 225 }, { "epoch": 0.8995024875621891, "grad_norm": 0.49846476316452026, "learning_rate": 1e-07, "loss": 0.5369, "step": 226 }, { "epoch": 0.9034825870646767, "grad_norm": 0.4804675877094269, "learning_rate": 1e-07, "loss": 0.5606, "step": 227 }, { "epoch": 0.9074626865671642, "grad_norm": 0.4761047959327698, "learning_rate": 1e-07, "loss": 0.5435, "step": 228 }, { "epoch": 0.9114427860696518, "grad_norm": 0.6701684594154358, "learning_rate": 1e-07, "loss": 0.5366, "step": 229 }, { "epoch": 0.9154228855721394, "grad_norm": 0.3914811909198761, "learning_rate": 1e-07, "loss": 0.5536, "step": 230 }, { "epoch": 0.9194029850746268, "grad_norm": 0.8211656212806702, "learning_rate": 1e-07, "loss": 0.5245, "step": 231 }, { "epoch": 0.9233830845771144, "grad_norm": 0.6509301066398621, "learning_rate": 1e-07, "loss": 0.5995, "step": 232 }, { "epoch": 0.9273631840796019, "grad_norm": 0.5072076916694641, "learning_rate": 1e-07, "loss": 0.5317, "step": 233 }, { "epoch": 0.9313432835820895, "grad_norm": 0.6173104047775269, "learning_rate": 1e-07, "loss": 0.548, "step": 234 }, { "epoch": 0.9353233830845771, "grad_norm": 0.45433592796325684, "learning_rate": 1e-07, "loss": 0.5083, "step": 235 }, { "epoch": 0.9393034825870646, "grad_norm": 0.40389034152030945, "learning_rate": 1e-07, "loss": 0.5363, "step": 236 }, { "epoch": 0.9432835820895522, "grad_norm": 0.3999645709991455, "learning_rate": 1e-07, "loss": 0.5291, "step": 237 }, { "epoch": 0.9472636815920398, "grad_norm": 0.4328477084636688, "learning_rate": 1e-07, "loss": 0.5549, "step": 238 }, { "epoch": 0.9512437810945273, "grad_norm": 0.529513955116272, "learning_rate": 1e-07, "loss": 0.5208, "step": 239 }, { "epoch": 0.9552238805970149, "grad_norm": 0.41321346163749695, "learning_rate": 1e-07, "loss": 0.5135, "step": 240 }, { "epoch": 0.9592039800995025, "grad_norm": 0.42416515946388245, "learning_rate": 1e-07, "loss": 0.4893, "step": 241 }, { "epoch": 0.96318407960199, "grad_norm": 0.8604540228843689, "learning_rate": 1e-07, "loss": 0.5321, "step": 242 }, { "epoch": 0.9671641791044776, "grad_norm": 0.4030352830886841, "learning_rate": 1e-07, "loss": 0.5037, "step": 243 }, { "epoch": 0.9711442786069652, "grad_norm": 0.45229312777519226, "learning_rate": 1e-07, "loss": 0.5289, "step": 244 }, { "epoch": 0.9751243781094527, "grad_norm": 0.4864250719547272, "learning_rate": 1e-07, "loss": 0.5453, "step": 245 }, { "epoch": 0.9791044776119403, "grad_norm": 0.3489474058151245, "learning_rate": 1e-07, "loss": 0.5357, "step": 246 }, { "epoch": 0.9830845771144279, "grad_norm": 0.4253968894481659, "learning_rate": 1e-07, "loss": 0.5415, "step": 247 }, { "epoch": 0.9870646766169154, "grad_norm": 0.4792375862598419, "learning_rate": 1e-07, "loss": 0.4918, "step": 248 }, { "epoch": 0.991044776119403, "grad_norm": 0.5180811285972595, "learning_rate": 1e-07, "loss": 0.4897, "step": 249 }, { "epoch": 0.9950248756218906, "grad_norm": 0.5927690863609314, "learning_rate": 1e-07, "loss": 0.5214, "step": 250 }, { "epoch": 0.9990049751243781, "grad_norm": 0.3848709166049957, "learning_rate": 1e-07, "loss": 0.5349, "step": 251 }, { "epoch": 0.9990049751243781, "step": 251, "total_flos": 2.785366857660498e+17, "train_loss": 0.7864806888350453, "train_runtime": 27405.4608, "train_samples_per_second": 0.44, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 251, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.785366857660498e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }