{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.24282982791587, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019120458891013384, "grad_norm": 9.746596336364746, "learning_rate": 0.0, "loss": 0.8644, "step": 1 }, { "epoch": 0.0003824091778202677, "grad_norm": 7.742455005645752, "learning_rate": 4.404279284341596e-07, "loss": 0.6036, "step": 2 }, { "epoch": 0.0005736137667304016, "grad_norm": 4.959498405456543, "learning_rate": 6.980617508384441e-07, "loss": 0.5138, "step": 3 }, { "epoch": 0.0007648183556405354, "grad_norm": 12.7059326171875, "learning_rate": 8.808558568683192e-07, "loss": 0.6537, "step": 4 }, { "epoch": 0.0009560229445506692, "grad_norm": 9.178160667419434, "learning_rate": 1.0226419808043158e-06, "loss": 0.5527, "step": 5 }, { "epoch": 0.001147227533460803, "grad_norm": 9.766277313232422, "learning_rate": 1.1384896792726035e-06, "loss": 0.4063, "step": 6 }, { "epoch": 0.0013384321223709368, "grad_norm": 5.554647922515869, "learning_rate": 1.236437512701272e-06, "loss": 0.7499, "step": 7 }, { "epoch": 0.0015296367112810707, "grad_norm": 8.689834594726562, "learning_rate": 1.3212837853024787e-06, "loss": 1.4177, "step": 8 }, { "epoch": 0.0017208413001912047, "grad_norm": 3.352787494659424, "learning_rate": 1.3961235016768883e-06, "loss": 0.5945, "step": 9 }, { "epoch": 0.0019120458891013384, "grad_norm": 8.300889015197754, "learning_rate": 1.4630699092384754e-06, "loss": 0.7862, "step": 10 }, { "epoch": 0.002103250478011472, "grad_norm": 4.62298583984375, "learning_rate": 1.5236303013560567e-06, "loss": 0.4052, "step": 11 }, { "epoch": 0.002294455066921606, "grad_norm": 4.900755405426025, "learning_rate": 1.578917607706763e-06, "loss": 0.2796, "step": 12 }, { "epoch": 0.00248565965583174, "grad_norm": 4.150928497314453, "learning_rate": 1.6297769993563666e-06, "loss": 0.3957, "step": 13 }, { "epoch": 0.0026768642447418736, "grad_norm": 3.4511361122131348, "learning_rate": 1.6768654411354316e-06, "loss": 0.5152, "step": 14 }, { "epoch": 0.0028680688336520078, "grad_norm": 3.906311511993408, "learning_rate": 1.7207037316427597e-06, "loss": 0.5973, "step": 15 }, { "epoch": 0.0030592734225621415, "grad_norm": 5.145625591278076, "learning_rate": 1.7617117137366383e-06, "loss": 0.5577, "step": 16 }, { "epoch": 0.003250478011472275, "grad_norm": 4.597131252288818, "learning_rate": 1.800232791723491e-06, "loss": 0.4065, "step": 17 }, { "epoch": 0.0034416826003824093, "grad_norm": 5.256167411804199, "learning_rate": 1.8365514301110474e-06, "loss": 0.4937, "step": 18 }, { "epoch": 0.003632887189292543, "grad_norm": 6.020605087280273, "learning_rate": 1.8709059148844288e-06, "loss": 0.2741, "step": 19 }, { "epoch": 0.0038240917782026767, "grad_norm": 6.1135969161987305, "learning_rate": 1.9034978376726347e-06, "loss": 0.6598, "step": 20 }, { "epoch": 0.00401529636711281, "grad_norm": 2.16821026802063, "learning_rate": 1.934499263539716e-06, "loss": 0.4142, "step": 21 }, { "epoch": 0.004206500956022944, "grad_norm": 4.964519023895264, "learning_rate": 1.964058229790216e-06, "loss": 0.5294, "step": 22 }, { "epoch": 0.004397705544933079, "grad_norm": 2.558614492416382, "learning_rate": 1.992303021449765e-06, "loss": 0.451, "step": 23 }, { "epoch": 0.004588910133843212, "grad_norm": 5.189733028411865, "learning_rate": 2.019345536140923e-06, "loss": 0.5024, "step": 24 }, { "epoch": 0.004780114722753346, "grad_norm": 5.467967510223389, "learning_rate": 2.0452839616086316e-06, "loss": 0.6787, "step": 25 }, { "epoch": 0.00497131931166348, "grad_norm": 4.121450424194336, "learning_rate": 2.070204927790526e-06, "loss": 0.6174, "step": 26 }, { "epoch": 0.0051625239005736135, "grad_norm": 2.465310573577881, "learning_rate": 2.094185252515332e-06, "loss": 0.3246, "step": 27 }, { "epoch": 0.005353728489483747, "grad_norm": 3.9040215015411377, "learning_rate": 2.117293369569591e-06, "loss": 0.3824, "step": 28 }, { "epoch": 0.005544933078393882, "grad_norm": 4.060995101928711, "learning_rate": 2.1395905060565537e-06, "loss": 0.6113, "step": 29 }, { "epoch": 0.0057361376673040155, "grad_norm": 3.5145015716552734, "learning_rate": 2.1611316600769195e-06, "loss": 0.25, "step": 30 }, { "epoch": 0.005927342256214149, "grad_norm": 4.692299842834473, "learning_rate": 2.1819664180398482e-06, "loss": 0.3016, "step": 31 }, { "epoch": 0.006118546845124283, "grad_norm": 4.2027788162231445, "learning_rate": 2.202139642170798e-06, "loss": 0.873, "step": 32 }, { "epoch": 0.006309751434034417, "grad_norm": 2.9636874198913574, "learning_rate": 2.2216920521945006e-06, "loss": 0.8008, "step": 33 }, { "epoch": 0.00650095602294455, "grad_norm": 1.739428162574768, "learning_rate": 2.240660720157651e-06, "loss": 0.1883, "step": 34 }, { "epoch": 0.006692160611854685, "grad_norm": 4.254518032073975, "learning_rate": 2.2590794935055878e-06, "loss": 0.5216, "step": 35 }, { "epoch": 0.006883365200764819, "grad_norm": 3.1227238178253174, "learning_rate": 2.276979358545207e-06, "loss": 0.4041, "step": 36 }, { "epoch": 0.007074569789674952, "grad_norm": 3.3138492107391357, "learning_rate": 2.2943887540983186e-06, "loss": 0.2946, "step": 37 }, { "epoch": 0.007265774378585086, "grad_norm": 4.404989719390869, "learning_rate": 2.3113338433185884e-06, "loss": 0.476, "step": 38 }, { "epoch": 0.00745697896749522, "grad_norm": 3.9010329246520996, "learning_rate": 2.3278387501948105e-06, "loss": 0.7358, "step": 39 }, { "epoch": 0.0076481835564053535, "grad_norm": 4.1798624992370605, "learning_rate": 2.3439257661067945e-06, "loss": 0.51, "step": 40 }, { "epoch": 0.007839388145315488, "grad_norm": 3.3263652324676514, "learning_rate": 2.3596155308722216e-06, "loss": 0.4974, "step": 41 }, { "epoch": 0.00803059273422562, "grad_norm": 5.814546585083008, "learning_rate": 2.3749271919738757e-06, "loss": 0.4005, "step": 42 }, { "epoch": 0.008221797323135755, "grad_norm": 2.7462821006774902, "learning_rate": 2.389878545048738e-06, "loss": 0.391, "step": 43 }, { "epoch": 0.008413001912045888, "grad_norm": 4.502755165100098, "learning_rate": 2.4044861582243756e-06, "loss": 0.2376, "step": 44 }, { "epoch": 0.008604206500956023, "grad_norm": 4.496071815490723, "learning_rate": 2.4187654824812036e-06, "loss": 0.8074, "step": 45 }, { "epoch": 0.008795411089866157, "grad_norm": 4.023655414581299, "learning_rate": 2.4327309498839246e-06, "loss": 0.5484, "step": 46 }, { "epoch": 0.00898661567877629, "grad_norm": 2.9175775051116943, "learning_rate": 2.446396061247859e-06, "loss": 0.3416, "step": 47 }, { "epoch": 0.009177820267686425, "grad_norm": 2.873981237411499, "learning_rate": 2.4597734645750825e-06, "loss": 0.5039, "step": 48 }, { "epoch": 0.009369024856596558, "grad_norm": 2.632161855697632, "learning_rate": 2.472875025402544e-06, "loss": 0.4038, "step": 49 }, { "epoch": 0.009560229445506692, "grad_norm": 3.235424041748047, "learning_rate": 2.4857118900427907e-06, "loss": 0.3458, "step": 50 }, { "epoch": 0.009751434034416827, "grad_norm": 2.7174906730651855, "learning_rate": 2.498294542561935e-06, "loss": 0.4972, "step": 51 }, { "epoch": 0.00994263862332696, "grad_norm": 3.14343523979187, "learning_rate": 2.5106328562246856e-06, "loss": 0.4773, "step": 52 }, { "epoch": 0.010133843212237094, "grad_norm": 3.828115224838257, "learning_rate": 2.5227361400389193e-06, "loss": 0.4097, "step": 53 }, { "epoch": 0.010325047801147227, "grad_norm": 3.4634366035461426, "learning_rate": 2.5346131809494915e-06, "loss": 0.582, "step": 54 }, { "epoch": 0.010516252390057362, "grad_norm": 3.1296660900115967, "learning_rate": 2.5462722821603727e-06, "loss": 0.3681, "step": 55 }, { "epoch": 0.010707456978967494, "grad_norm": 2.358661651611328, "learning_rate": 2.557721298003751e-06, "loss": 0.1711, "step": 56 }, { "epoch": 0.010898661567877629, "grad_norm": 4.519606113433838, "learning_rate": 2.568967665722873e-06, "loss": 0.7541, "step": 57 }, { "epoch": 0.011089866156787764, "grad_norm": 2.1992197036743164, "learning_rate": 2.580018434490713e-06, "loss": 0.2106, "step": 58 }, { "epoch": 0.011281070745697896, "grad_norm": 2.597233295440674, "learning_rate": 2.5908802919480436e-06, "loss": 0.4043, "step": 59 }, { "epoch": 0.011472275334608031, "grad_norm": 3.097158670425415, "learning_rate": 2.601559588511079e-06, "loss": 0.3287, "step": 60 }, { "epoch": 0.011663479923518164, "grad_norm": 2.8475654125213623, "learning_rate": 2.6120623596699453e-06, "loss": 0.3242, "step": 61 }, { "epoch": 0.011854684512428298, "grad_norm": 2.1070339679718018, "learning_rate": 2.622394346474008e-06, "loss": 0.1107, "step": 62 }, { "epoch": 0.012045889101338431, "grad_norm": 3.0335943698883057, "learning_rate": 2.63256101437816e-06, "loss": 0.5753, "step": 63 }, { "epoch": 0.012237093690248566, "grad_norm": 5.001894950866699, "learning_rate": 2.6425675706049575e-06, "loss": 0.6856, "step": 64 }, { "epoch": 0.0124282982791587, "grad_norm": 2.677110433578491, "learning_rate": 2.652418980160682e-06, "loss": 0.1709, "step": 65 }, { "epoch": 0.012619502868068833, "grad_norm": 2.887890100479126, "learning_rate": 2.6621199806286598e-06, "loss": 0.5874, "step": 66 }, { "epoch": 0.012810707456978968, "grad_norm": 2.3578383922576904, "learning_rate": 2.6716750958501646e-06, "loss": 0.1294, "step": 67 }, { "epoch": 0.0130019120458891, "grad_norm": 3.1397762298583984, "learning_rate": 2.6810886485918104e-06, "loss": 0.3335, "step": 68 }, { "epoch": 0.013193116634799235, "grad_norm": 2.663269281387329, "learning_rate": 2.690364772288209e-06, "loss": 0.2245, "step": 69 }, { "epoch": 0.01338432122370937, "grad_norm": 5.426806449890137, "learning_rate": 2.6995074219397478e-06, "loss": 0.5171, "step": 70 }, { "epoch": 0.013575525812619503, "grad_norm": 5.457904815673828, "learning_rate": 2.708520384237387e-06, "loss": 0.3069, "step": 71 }, { "epoch": 0.013766730401529637, "grad_norm": 2.8190979957580566, "learning_rate": 2.7174072869793666e-06, "loss": 0.4272, "step": 72 }, { "epoch": 0.01395793499043977, "grad_norm": 2.0407586097717285, "learning_rate": 2.7261716078384117e-06, "loss": 0.2898, "step": 73 }, { "epoch": 0.014149139579349905, "grad_norm": 2.2423548698425293, "learning_rate": 2.734816682532479e-06, "loss": 0.2225, "step": 74 }, { "epoch": 0.014340344168260038, "grad_norm": 2.801767587661743, "learning_rate": 2.7433457124470753e-06, "loss": 0.1297, "step": 75 }, { "epoch": 0.014531548757170172, "grad_norm": 3.7313790321350098, "learning_rate": 2.751761771752748e-06, "loss": 0.6037, "step": 76 }, { "epoch": 0.014722753346080307, "grad_norm": 2.962400436401367, "learning_rate": 2.760067814057329e-06, "loss": 0.4091, "step": 77 }, { "epoch": 0.01491395793499044, "grad_norm": 1.2434055805206299, "learning_rate": 2.76826667862897e-06, "loss": 0.3433, "step": 78 }, { "epoch": 0.015105162523900574, "grad_norm": 4.778526782989502, "learning_rate": 2.7763610962227783e-06, "loss": 0.4217, "step": 79 }, { "epoch": 0.015296367112810707, "grad_norm": 1.6460481882095337, "learning_rate": 2.7843536945409537e-06, "loss": 0.0981, "step": 80 }, { "epoch": 0.015487571701720841, "grad_norm": 3.009162187576294, "learning_rate": 2.7922470033537765e-06, "loss": 0.1744, "step": 81 }, { "epoch": 0.015678776290630976, "grad_norm": 4.354190826416016, "learning_rate": 2.8000434593063812e-06, "loss": 0.4956, "step": 82 }, { "epoch": 0.01586998087954111, "grad_norm": 3.888239860534668, "learning_rate": 2.807745410434209e-06, "loss": 0.4807, "step": 83 }, { "epoch": 0.01606118546845124, "grad_norm": 2.944679021835327, "learning_rate": 2.8153551204080353e-06, "loss": 0.4177, "step": 84 }, { "epoch": 0.016252390057361378, "grad_norm": 3.1436245441436768, "learning_rate": 2.822874772527807e-06, "loss": 0.2304, "step": 85 }, { "epoch": 0.01644359464627151, "grad_norm": 4.823106288909912, "learning_rate": 2.830306473482897e-06, "loss": 0.5006, "step": 86 }, { "epoch": 0.016634799235181644, "grad_norm": 2.8725435733795166, "learning_rate": 2.837652256894998e-06, "loss": 0.128, "step": 87 }, { "epoch": 0.016826003824091777, "grad_norm": 4.338479995727539, "learning_rate": 2.8449140866585356e-06, "loss": 0.2579, "step": 88 }, { "epoch": 0.017017208413001913, "grad_norm": 4.045529365539551, "learning_rate": 2.852093860092363e-06, "loss": 0.9362, "step": 89 }, { "epoch": 0.017208413001912046, "grad_norm": 3.180530309677124, "learning_rate": 2.8591934109153636e-06, "loss": 0.3104, "step": 90 }, { "epoch": 0.01739961759082218, "grad_norm": 3.4100022315979004, "learning_rate": 2.8662145120576384e-06, "loss": 0.3072, "step": 91 }, { "epoch": 0.017590822179732315, "grad_norm": 2.9597487449645996, "learning_rate": 2.873158878318084e-06, "loss": 0.2439, "step": 92 }, { "epoch": 0.017782026768642448, "grad_norm": 3.1329643726348877, "learning_rate": 2.8800281688782923e-06, "loss": 0.1225, "step": 93 }, { "epoch": 0.01797323135755258, "grad_norm": 3.316901445388794, "learning_rate": 2.8868239896820188e-06, "loss": 0.2914, "step": 94 }, { "epoch": 0.018164435946462717, "grad_norm": 2.207570791244507, "learning_rate": 2.8935478956887446e-06, "loss": 0.294, "step": 95 }, { "epoch": 0.01835564053537285, "grad_norm": 2.481140375137329, "learning_rate": 2.900201393009242e-06, "loss": 0.5148, "step": 96 }, { "epoch": 0.018546845124282983, "grad_norm": 2.634587526321411, "learning_rate": 2.906785940930483e-06, "loss": 0.5636, "step": 97 }, { "epoch": 0.018738049713193115, "grad_norm": 2.602517604827881, "learning_rate": 2.913302953836704e-06, "loss": 0.0733, "step": 98 }, { "epoch": 0.01892925430210325, "grad_norm": 2.664775848388672, "learning_rate": 2.9197538030329443e-06, "loss": 0.2034, "step": 99 }, { "epoch": 0.019120458891013385, "grad_norm": 4.784229278564453, "learning_rate": 2.9261398184769508e-06, "loss": 0.4161, "step": 100 }, { "epoch": 0.019311663479923517, "grad_norm": 20.801212310791016, "learning_rate": 2.9324622904249074e-06, "loss": 0.312, "step": 101 }, { "epoch": 0.019502868068833654, "grad_norm": 2.7974696159362793, "learning_rate": 2.938722470996094e-06, "loss": 0.436, "step": 102 }, { "epoch": 0.019694072657743786, "grad_norm": 2.719644784927368, "learning_rate": 2.944921575661221e-06, "loss": 0.2729, "step": 103 }, { "epoch": 0.01988527724665392, "grad_norm": 2.62326717376709, "learning_rate": 2.951060784658845e-06, "loss": 0.1857, "step": 104 }, { "epoch": 0.020076481835564052, "grad_norm": 3.2945799827575684, "learning_rate": 2.957141244344032e-06, "loss": 0.2618, "step": 105 }, { "epoch": 0.02026768642447419, "grad_norm": 3.1361422538757324, "learning_rate": 2.963164068473079e-06, "loss": 0.2186, "step": 106 }, { "epoch": 0.02045889101338432, "grad_norm": 3.901604175567627, "learning_rate": 2.9691303394279335e-06, "loss": 0.5334, "step": 107 }, { "epoch": 0.020650095602294454, "grad_norm": 4.767889499664307, "learning_rate": 2.975041109383651e-06, "loss": 0.4494, "step": 108 }, { "epoch": 0.02084130019120459, "grad_norm": 2.728724241256714, "learning_rate": 2.9808974014220527e-06, "loss": 0.3868, "step": 109 }, { "epoch": 0.021032504780114723, "grad_norm": 3.014492988586426, "learning_rate": 2.986700210594532e-06, "loss": 0.3775, "step": 110 }, { "epoch": 0.021223709369024856, "grad_norm": 2.4681789875030518, "learning_rate": 2.9924505049367623e-06, "loss": 0.3131, "step": 111 }, { "epoch": 0.02141491395793499, "grad_norm": 3.5321526527404785, "learning_rate": 2.9981492264379103e-06, "loss": 0.3168, "step": 112 }, { "epoch": 0.021606118546845125, "grad_norm": 2.0447781085968018, "learning_rate": 3.003797291966757e-06, "loss": 0.3725, "step": 113 }, { "epoch": 0.021797323135755258, "grad_norm": 3.804521322250366, "learning_rate": 3.0093955941570325e-06, "loss": 0.571, "step": 114 }, { "epoch": 0.02198852772466539, "grad_norm": 2.5446314811706543, "learning_rate": 3.014945002254081e-06, "loss": 0.3944, "step": 115 }, { "epoch": 0.022179732313575527, "grad_norm": 2.264204502105713, "learning_rate": 3.0204463629248725e-06, "loss": 0.2348, "step": 116 }, { "epoch": 0.02237093690248566, "grad_norm": 2.108393907546997, "learning_rate": 3.0259005010332546e-06, "loss": 0.259, "step": 117 }, { "epoch": 0.022562141491395793, "grad_norm": 4.435889720916748, "learning_rate": 3.031308220382203e-06, "loss": 0.2287, "step": 118 }, { "epoch": 0.022753346080305926, "grad_norm": 2.867352247238159, "learning_rate": 3.0366703044247632e-06, "loss": 0.1899, "step": 119 }, { "epoch": 0.022944550669216062, "grad_norm": 4.31091833114624, "learning_rate": 3.0419875169452382e-06, "loss": 0.771, "step": 120 }, { "epoch": 0.023135755258126195, "grad_norm": 2.2673258781433105, "learning_rate": 3.0472606027121134e-06, "loss": 0.3879, "step": 121 }, { "epoch": 0.023326959847036328, "grad_norm": 2.5156142711639404, "learning_rate": 3.0524902881041045e-06, "loss": 0.2798, "step": 122 }, { "epoch": 0.023518164435946464, "grad_norm": 1.8811085224151611, "learning_rate": 3.0576772817106653e-06, "loss": 0.1592, "step": 123 }, { "epoch": 0.023709369024856597, "grad_norm": 2.619985818862915, "learning_rate": 3.062822274908168e-06, "loss": 0.2492, "step": 124 }, { "epoch": 0.02390057361376673, "grad_norm": 2.3442344665527344, "learning_rate": 3.067925942412948e-06, "loss": 0.1004, "step": 125 }, { "epoch": 0.024091778202676863, "grad_norm": 3.19574236869812, "learning_rate": 3.0729889428123194e-06, "loss": 0.6887, "step": 126 }, { "epoch": 0.024282982791587, "grad_norm": 1.725019097328186, "learning_rate": 3.0780119190745983e-06, "loss": 0.2958, "step": 127 }, { "epoch": 0.024474187380497132, "grad_norm": 2.455963373184204, "learning_rate": 3.082995499039117e-06, "loss": 0.2027, "step": 128 }, { "epoch": 0.024665391969407265, "grad_norm": 6.7689619064331055, "learning_rate": 3.0879402958871817e-06, "loss": 0.0587, "step": 129 }, { "epoch": 0.0248565965583174, "grad_norm": 1.6544665098190308, "learning_rate": 3.0928469085948413e-06, "loss": 0.0546, "step": 130 }, { "epoch": 0.025047801147227534, "grad_norm": 3.8221983909606934, "learning_rate": 3.0977159223683077e-06, "loss": 0.1562, "step": 131 }, { "epoch": 0.025239005736137667, "grad_norm": 2.911043167114258, "learning_rate": 3.102547909062819e-06, "loss": 0.3307, "step": 132 }, { "epoch": 0.025430210325047803, "grad_norm": 3.8134329319000244, "learning_rate": 3.1073434275857012e-06, "loss": 0.6647, "step": 133 }, { "epoch": 0.025621414913957936, "grad_norm": 3.2995803356170654, "learning_rate": 3.112103024284324e-06, "loss": 0.1994, "step": 134 }, { "epoch": 0.02581261950286807, "grad_norm": 1.8985707759857178, "learning_rate": 3.1168272333196477e-06, "loss": 0.2707, "step": 135 }, { "epoch": 0.0260038240917782, "grad_norm": 2.5111820697784424, "learning_rate": 3.1215165770259696e-06, "loss": 0.2573, "step": 136 }, { "epoch": 0.026195028680688338, "grad_norm": 1.4703351259231567, "learning_rate": 3.1261715662575076e-06, "loss": 0.032, "step": 137 }, { "epoch": 0.02638623326959847, "grad_norm": 3.487067937850952, "learning_rate": 3.1307927007223687e-06, "loss": 0.3179, "step": 138 }, { "epoch": 0.026577437858508603, "grad_norm": 3.4684128761291504, "learning_rate": 3.135380469304468e-06, "loss": 0.4447, "step": 139 }, { "epoch": 0.02676864244741874, "grad_norm": 3.0979361534118652, "learning_rate": 3.1399353503739065e-06, "loss": 0.2441, "step": 140 }, { "epoch": 0.026959847036328873, "grad_norm": 3.7381913661956787, "learning_rate": 3.1444578120863033e-06, "loss": 0.3272, "step": 141 }, { "epoch": 0.027151051625239005, "grad_norm": 2.558560609817505, "learning_rate": 3.148948312671547e-06, "loss": 0.3229, "step": 142 }, { "epoch": 0.027342256214149138, "grad_norm": 2.115128755569458, "learning_rate": 3.153407300712423e-06, "loss": 0.1838, "step": 143 }, { "epoch": 0.027533460803059275, "grad_norm": 3.6304619312286377, "learning_rate": 3.157835215413526e-06, "loss": 0.3281, "step": 144 }, { "epoch": 0.027724665391969407, "grad_norm": 7.865360260009766, "learning_rate": 3.1622324868608695e-06, "loss": 0.8346, "step": 145 }, { "epoch": 0.02791586998087954, "grad_norm": 3.5878102779388428, "learning_rate": 3.166599536272571e-06, "loss": 0.5395, "step": 146 }, { "epoch": 0.028107074569789676, "grad_norm": 3.4643099308013916, "learning_rate": 3.170936776240988e-06, "loss": 0.3664, "step": 147 }, { "epoch": 0.02829827915869981, "grad_norm": 3.340297222137451, "learning_rate": 3.1752446109666377e-06, "loss": 0.3959, "step": 148 }, { "epoch": 0.028489483747609942, "grad_norm": 3.389150619506836, "learning_rate": 3.1795234364842463e-06, "loss": 0.0883, "step": 149 }, { "epoch": 0.028680688336520075, "grad_norm": 8.659844398498535, "learning_rate": 3.183773640881235e-06, "loss": 0.2891, "step": 150 }, { "epoch": 0.02887189292543021, "grad_norm": 2.904853582382202, "learning_rate": 3.1879956045089473e-06, "loss": 0.4851, "step": 151 }, { "epoch": 0.029063097514340344, "grad_norm": 2.432720184326172, "learning_rate": 3.192189700186908e-06, "loss": 0.4643, "step": 152 }, { "epoch": 0.029254302103250477, "grad_norm": 2.321363925933838, "learning_rate": 3.196356293400379e-06, "loss": 0.1767, "step": 153 }, { "epoch": 0.029445506692160613, "grad_norm": 1.9591343402862549, "learning_rate": 3.2004957424914884e-06, "loss": 0.1611, "step": 154 }, { "epoch": 0.029636711281070746, "grad_norm": 2.822035551071167, "learning_rate": 3.204608398844164e-06, "loss": 0.2053, "step": 155 }, { "epoch": 0.02982791586998088, "grad_norm": 2.73078989982605, "learning_rate": 3.2086946070631297e-06, "loss": 0.095, "step": 156 }, { "epoch": 0.030019120458891012, "grad_norm": 4.012321949005127, "learning_rate": 3.212754705147177e-06, "loss": 0.4317, "step": 157 }, { "epoch": 0.030210325047801148, "grad_norm": 2.332084894180298, "learning_rate": 3.216789024656938e-06, "loss": 0.4723, "step": 158 }, { "epoch": 0.03040152963671128, "grad_norm": 3.0491888523101807, "learning_rate": 3.2207978908773634e-06, "loss": 0.4482, "step": 159 }, { "epoch": 0.030592734225621414, "grad_norm": 1.8330838680267334, "learning_rate": 3.2247816229751133e-06, "loss": 0.1647, "step": 160 }, { "epoch": 0.03078393881453155, "grad_norm": 2.9033660888671875, "learning_rate": 3.228740534151037e-06, "loss": 0.3954, "step": 161 }, { "epoch": 0.030975143403441683, "grad_norm": 2.846273899078369, "learning_rate": 3.2326749317879352e-06, "loss": 0.1128, "step": 162 }, { "epoch": 0.031166347992351816, "grad_norm": 3.386489152908325, "learning_rate": 3.2365851175937783e-06, "loss": 0.4637, "step": 163 }, { "epoch": 0.03135755258126195, "grad_norm": 2.893277883529663, "learning_rate": 3.240471387740541e-06, "loss": 0.5229, "step": 164 }, { "epoch": 0.03154875717017208, "grad_norm": 2.0880420207977295, "learning_rate": 3.244334032998816e-06, "loss": 0.4664, "step": 165 }, { "epoch": 0.03173996175908222, "grad_norm": 4.319767475128174, "learning_rate": 3.2481733388683686e-06, "loss": 0.5711, "step": 166 }, { "epoch": 0.031931166347992354, "grad_norm": 3.115651845932007, "learning_rate": 3.251989585704759e-06, "loss": 0.1742, "step": 167 }, { "epoch": 0.03212237093690248, "grad_norm": 3.5364396572113037, "learning_rate": 3.255783048842195e-06, "loss": 0.1186, "step": 168 }, { "epoch": 0.03231357552581262, "grad_norm": 2.261878252029419, "learning_rate": 3.2595539987127332e-06, "loss": 0.1794, "step": 169 }, { "epoch": 0.032504780114722756, "grad_norm": 3.346421003341675, "learning_rate": 3.2633027009619666e-06, "loss": 0.5637, "step": 170 }, { "epoch": 0.032695984703632885, "grad_norm": 3.0582921504974365, "learning_rate": 3.267029416561317e-06, "loss": 0.3209, "step": 171 }, { "epoch": 0.03288718929254302, "grad_norm": 2.956470251083374, "learning_rate": 3.270734401917057e-06, "loss": 0.3218, "step": 172 }, { "epoch": 0.03307839388145316, "grad_norm": 1.4681670665740967, "learning_rate": 3.27441790897617e-06, "loss": 0.2129, "step": 173 }, { "epoch": 0.03326959847036329, "grad_norm": 4.20260763168335, "learning_rate": 3.278080185329157e-06, "loss": 0.3151, "step": 174 }, { "epoch": 0.033460803059273424, "grad_norm": 4.02779483795166, "learning_rate": 3.281721474309904e-06, "loss": 0.3233, "step": 175 }, { "epoch": 0.03365200764818355, "grad_norm": 3.5959632396698, "learning_rate": 3.2853420150926944e-06, "loss": 0.7, "step": 176 }, { "epoch": 0.03384321223709369, "grad_norm": 2.106358528137207, "learning_rate": 3.2889420427864873e-06, "loss": 0.6374, "step": 177 }, { "epoch": 0.034034416826003826, "grad_norm": 2.9504098892211914, "learning_rate": 3.2925217885265225e-06, "loss": 0.408, "step": 178 }, { "epoch": 0.034225621414913955, "grad_norm": 2.7030961513519287, "learning_rate": 3.296081479563376e-06, "loss": 0.3322, "step": 179 }, { "epoch": 0.03441682600382409, "grad_norm": 2.271064281463623, "learning_rate": 3.299621339349523e-06, "loss": 0.1597, "step": 180 }, { "epoch": 0.03460803059273423, "grad_norm": 3.3034112453460693, "learning_rate": 3.3031415876235085e-06, "loss": 0.1215, "step": 181 }, { "epoch": 0.03479923518164436, "grad_norm": 3.9516632556915283, "learning_rate": 3.306642440491798e-06, "loss": 0.6046, "step": 182 }, { "epoch": 0.03499043977055449, "grad_norm": 5.616006374359131, "learning_rate": 3.310124110508389e-06, "loss": 0.8098, "step": 183 }, { "epoch": 0.03518164435946463, "grad_norm": 1.1683584451675415, "learning_rate": 3.3135868067522437e-06, "loss": 0.2074, "step": 184 }, { "epoch": 0.03537284894837476, "grad_norm": 4.46233606338501, "learning_rate": 3.3170307349026344e-06, "loss": 0.2912, "step": 185 }, { "epoch": 0.035564053537284895, "grad_norm": 2.703484535217285, "learning_rate": 3.3204560973124523e-06, "loss": 0.1338, "step": 186 }, { "epoch": 0.03575525812619503, "grad_norm": 2.688272476196289, "learning_rate": 3.3238630930795473e-06, "loss": 0.1497, "step": 187 }, { "epoch": 0.03594646271510516, "grad_norm": 3.20407772064209, "learning_rate": 3.3272519181161784e-06, "loss": 0.3101, "step": 188 }, { "epoch": 0.0361376673040153, "grad_norm": 3.1082072257995605, "learning_rate": 3.330622765216604e-06, "loss": 0.2251, "step": 189 }, { "epoch": 0.036328871892925434, "grad_norm": 2.8750967979431152, "learning_rate": 3.333975824122904e-06, "loss": 0.475, "step": 190 }, { "epoch": 0.03652007648183556, "grad_norm": 3.8573219776153564, "learning_rate": 3.337311281589066e-06, "loss": 0.093, "step": 191 }, { "epoch": 0.0367112810707457, "grad_norm": 1.341035008430481, "learning_rate": 3.3406293214434016e-06, "loss": 0.053, "step": 192 }, { "epoch": 0.03690248565965583, "grad_norm": 2.519542694091797, "learning_rate": 3.343930124649337e-06, "loss": 0.1018, "step": 193 }, { "epoch": 0.037093690248565965, "grad_norm": 3.034273386001587, "learning_rate": 3.3472138693646427e-06, "loss": 0.3481, "step": 194 }, { "epoch": 0.0372848948374761, "grad_norm": 1.9574315547943115, "learning_rate": 3.350480730999126e-06, "loss": 0.2867, "step": 195 }, { "epoch": 0.03747609942638623, "grad_norm": 2.153280735015869, "learning_rate": 3.353730882270863e-06, "loss": 0.3375, "step": 196 }, { "epoch": 0.03766730401529637, "grad_norm": 3.426328420639038, "learning_rate": 3.3569644932609887e-06, "loss": 0.4611, "step": 197 }, { "epoch": 0.0378585086042065, "grad_norm": 2.072275161743164, "learning_rate": 3.360181731467104e-06, "loss": 0.2144, "step": 198 }, { "epoch": 0.03804971319311663, "grad_norm": 2.887531280517578, "learning_rate": 3.3633827618553393e-06, "loss": 0.2432, "step": 199 }, { "epoch": 0.03824091778202677, "grad_norm": 5.023746013641357, "learning_rate": 3.3665677469111103e-06, "loss": 0.2016, "step": 200 }, { "epoch": 0.038432122370936905, "grad_norm": 3.695518970489502, "learning_rate": 3.3697368466886087e-06, "loss": 0.5206, "step": 201 }, { "epoch": 0.038623326959847035, "grad_norm": 3.6168696880340576, "learning_rate": 3.3728902188590666e-06, "loss": 0.5451, "step": 202 }, { "epoch": 0.03881453154875717, "grad_norm": 4.44756555557251, "learning_rate": 3.376028018757826e-06, "loss": 0.4795, "step": 203 }, { "epoch": 0.03900573613766731, "grad_norm": 3.0557193756103516, "learning_rate": 3.3791503994302537e-06, "loss": 0.2987, "step": 204 }, { "epoch": 0.03919694072657744, "grad_norm": 3.2316536903381348, "learning_rate": 3.382257511676538e-06, "loss": 0.3986, "step": 205 }, { "epoch": 0.03938814531548757, "grad_norm": 3.7219014167785645, "learning_rate": 3.3853495040953806e-06, "loss": 0.3432, "step": 206 }, { "epoch": 0.0395793499043977, "grad_norm": 3.6749324798583984, "learning_rate": 3.388426523126653e-06, "loss": 0.5342, "step": 207 }, { "epoch": 0.03977055449330784, "grad_norm": 3.951026439666748, "learning_rate": 3.3914887130930047e-06, "loss": 0.6743, "step": 208 }, { "epoch": 0.039961759082217975, "grad_norm": 2.203550338745117, "learning_rate": 3.3945362162404853e-06, "loss": 0.272, "step": 209 }, { "epoch": 0.040152963671128104, "grad_norm": 3.5759997367858887, "learning_rate": 3.397569172778191e-06, "loss": 0.4403, "step": 210 }, { "epoch": 0.04034416826003824, "grad_norm": 2.468674421310425, "learning_rate": 3.400587720916976e-06, "loss": 0.2374, "step": 211 }, { "epoch": 0.04053537284894838, "grad_norm": 1.7186170816421509, "learning_rate": 3.4035919969072384e-06, "loss": 0.1339, "step": 212 }, { "epoch": 0.040726577437858506, "grad_norm": 3.243154525756836, "learning_rate": 3.4065821350758317e-06, "loss": 0.2984, "step": 213 }, { "epoch": 0.04091778202676864, "grad_norm": 3.2775168418884277, "learning_rate": 3.409558267862093e-06, "loss": 0.5302, "step": 214 }, { "epoch": 0.04110898661567878, "grad_norm": 1.8264061212539673, "learning_rate": 3.4125205258530534e-06, "loss": 0.5952, "step": 215 }, { "epoch": 0.04130019120458891, "grad_norm": 5.033473491668701, "learning_rate": 3.4154690378178107e-06, "loss": 0.2228, "step": 216 }, { "epoch": 0.041491395793499045, "grad_norm": 1.7373385429382324, "learning_rate": 3.4184039307411206e-06, "loss": 0.1134, "step": 217 }, { "epoch": 0.04168260038240918, "grad_norm": 2.2630789279937744, "learning_rate": 3.4213253298562123e-06, "loss": 0.1515, "step": 218 }, { "epoch": 0.04187380497131931, "grad_norm": 3.2471377849578857, "learning_rate": 3.4242333586768554e-06, "loss": 0.1851, "step": 219 }, { "epoch": 0.04206500956022945, "grad_norm": 4.196602821350098, "learning_rate": 3.4271281390286914e-06, "loss": 0.9757, "step": 220 }, { "epoch": 0.042256214149139576, "grad_norm": 3.0383758544921875, "learning_rate": 3.4300097910798572e-06, "loss": 0.426, "step": 221 }, { "epoch": 0.04244741873804971, "grad_norm": 2.5604920387268066, "learning_rate": 3.4328784333709227e-06, "loss": 0.2252, "step": 222 }, { "epoch": 0.04263862332695985, "grad_norm": 3.8304648399353027, "learning_rate": 3.4357341828441426e-06, "loss": 0.3957, "step": 223 }, { "epoch": 0.04282982791586998, "grad_norm": 2.3922934532165527, "learning_rate": 3.43857715487207e-06, "loss": 0.1992, "step": 224 }, { "epoch": 0.043021032504780114, "grad_norm": 3.5151939392089844, "learning_rate": 3.4414074632855194e-06, "loss": 0.2571, "step": 225 }, { "epoch": 0.04321223709369025, "grad_norm": 3.84830904006958, "learning_rate": 3.4442252204009168e-06, "loss": 0.7981, "step": 226 }, { "epoch": 0.04340344168260038, "grad_norm": 3.0907559394836426, "learning_rate": 3.447030537047043e-06, "loss": 0.4319, "step": 227 }, { "epoch": 0.043594646271510516, "grad_norm": 3.6448781490325928, "learning_rate": 3.4498235225911925e-06, "loss": 0.4267, "step": 228 }, { "epoch": 0.04378585086042065, "grad_norm": 1.7876298427581787, "learning_rate": 3.452604284964754e-06, "loss": 0.2185, "step": 229 }, { "epoch": 0.04397705544933078, "grad_norm": 1.8112496137619019, "learning_rate": 3.4553729306882404e-06, "loss": 0.1537, "step": 230 }, { "epoch": 0.04416826003824092, "grad_norm": 4.762514591217041, "learning_rate": 3.4581295648957726e-06, "loss": 0.4439, "step": 231 }, { "epoch": 0.044359464627151055, "grad_norm": 2.5081305503845215, "learning_rate": 3.4608742913590325e-06, "loss": 0.2384, "step": 232 }, { "epoch": 0.044550669216061184, "grad_norm": 4.172029495239258, "learning_rate": 3.4636072125107046e-06, "loss": 0.6397, "step": 233 }, { "epoch": 0.04474187380497132, "grad_norm": 2.4700138568878174, "learning_rate": 3.4663284294674142e-06, "loss": 0.5316, "step": 234 }, { "epoch": 0.044933078393881457, "grad_norm": 2.9252090454101562, "learning_rate": 3.4690380420521746e-06, "loss": 0.2955, "step": 235 }, { "epoch": 0.045124282982791586, "grad_norm": 2.3378536701202393, "learning_rate": 3.4717361488163627e-06, "loss": 0.3204, "step": 236 }, { "epoch": 0.04531548757170172, "grad_norm": 5.816604137420654, "learning_rate": 3.4744228470612224e-06, "loss": 0.1649, "step": 237 }, { "epoch": 0.04550669216061185, "grad_norm": 2.326578140258789, "learning_rate": 3.4770982328589232e-06, "loss": 0.201, "step": 238 }, { "epoch": 0.04569789674952199, "grad_norm": 4.211781024932861, "learning_rate": 3.4797624010731722e-06, "loss": 0.7306, "step": 239 }, { "epoch": 0.045889101338432124, "grad_norm": 1.4428257942199707, "learning_rate": 3.482415445379398e-06, "loss": 0.1414, "step": 240 }, { "epoch": 0.046080305927342254, "grad_norm": 3.0482757091522217, "learning_rate": 3.4850574582845136e-06, "loss": 0.5237, "step": 241 }, { "epoch": 0.04627151051625239, "grad_norm": 2.439641237258911, "learning_rate": 3.4876885311462725e-06, "loss": 0.2411, "step": 242 }, { "epoch": 0.046462715105162526, "grad_norm": 3.4746406078338623, "learning_rate": 3.4903087541922198e-06, "loss": 0.2901, "step": 243 }, { "epoch": 0.046653919694072656, "grad_norm": 2.937816619873047, "learning_rate": 3.492918216538264e-06, "loss": 0.1849, "step": 244 }, { "epoch": 0.04684512428298279, "grad_norm": 4.481417655944824, "learning_rate": 3.49551700620686e-06, "loss": 0.6734, "step": 245 }, { "epoch": 0.04703632887189293, "grad_norm": 3.3815813064575195, "learning_rate": 3.498105210144825e-06, "loss": 0.7956, "step": 246 }, { "epoch": 0.04722753346080306, "grad_norm": 1.8896292448043823, "learning_rate": 3.5006829142407957e-06, "loss": 0.1625, "step": 247 }, { "epoch": 0.047418738049713194, "grad_norm": 2.2965781688690186, "learning_rate": 3.503250203342327e-06, "loss": 0.1043, "step": 248 }, { "epoch": 0.04760994263862333, "grad_norm": 3.055568218231201, "learning_rate": 3.5058071612726523e-06, "loss": 0.2573, "step": 249 }, { "epoch": 0.04780114722753346, "grad_norm": 2.876594305038452, "learning_rate": 3.5083538708471065e-06, "loss": 0.1837, "step": 250 }, { "epoch": 0.047992351816443596, "grad_norm": 2.6785998344421387, "learning_rate": 3.5108904138892164e-06, "loss": 0.3405, "step": 251 }, { "epoch": 0.048183556405353725, "grad_norm": 3.1598598957061768, "learning_rate": 3.5134168712464794e-06, "loss": 0.3016, "step": 252 }, { "epoch": 0.04837476099426386, "grad_norm": 2.128307342529297, "learning_rate": 3.515933322805821e-06, "loss": 0.3255, "step": 253 }, { "epoch": 0.048565965583174, "grad_norm": 2.671571969985962, "learning_rate": 3.518439847508758e-06, "loss": 0.2336, "step": 254 }, { "epoch": 0.04875717017208413, "grad_norm": 5.808933734893799, "learning_rate": 3.5209365233662508e-06, "loss": 0.2801, "step": 255 }, { "epoch": 0.048948374760994263, "grad_norm": 2.248617649078369, "learning_rate": 3.5234234274732767e-06, "loss": 0.1844, "step": 256 }, { "epoch": 0.0491395793499044, "grad_norm": 2.858762264251709, "learning_rate": 3.5259006360231123e-06, "loss": 0.4732, "step": 257 }, { "epoch": 0.04933078393881453, "grad_norm": 2.7468910217285156, "learning_rate": 3.5283682243213417e-06, "loss": 0.2897, "step": 258 }, { "epoch": 0.049521988527724665, "grad_norm": 1.2300924062728882, "learning_rate": 3.5308262667995906e-06, "loss": 0.2216, "step": 259 }, { "epoch": 0.0497131931166348, "grad_norm": 3.6707677841186523, "learning_rate": 3.5332748370290014e-06, "loss": 0.5293, "step": 260 }, { "epoch": 0.04990439770554493, "grad_norm": 2.7596583366394043, "learning_rate": 3.5357140077334416e-06, "loss": 0.2129, "step": 261 }, { "epoch": 0.05009560229445507, "grad_norm": 5.763503074645996, "learning_rate": 3.5381438508024672e-06, "loss": 0.1661, "step": 262 }, { "epoch": 0.050286806883365204, "grad_norm": 3.31182599067688, "learning_rate": 3.5405644373040366e-06, "loss": 0.3727, "step": 263 }, { "epoch": 0.05047801147227533, "grad_norm": 2.9972293376922607, "learning_rate": 3.5429758374969793e-06, "loss": 0.3564, "step": 264 }, { "epoch": 0.05066921606118547, "grad_norm": 2.814368963241577, "learning_rate": 3.5453781208432355e-06, "loss": 0.4932, "step": 265 }, { "epoch": 0.050860420650095606, "grad_norm": 4.843621730804443, "learning_rate": 3.547771356019861e-06, "loss": 0.3832, "step": 266 }, { "epoch": 0.051051625239005735, "grad_norm": 3.9224517345428467, "learning_rate": 3.550155610930807e-06, "loss": 0.1663, "step": 267 }, { "epoch": 0.05124282982791587, "grad_norm": 2.4663190841674805, "learning_rate": 3.5525309527184838e-06, "loss": 0.0759, "step": 268 }, { "epoch": 0.051434034416826, "grad_norm": 2.4381537437438965, "learning_rate": 3.5548974477751014e-06, "loss": 0.1891, "step": 269 }, { "epoch": 0.05162523900573614, "grad_norm": 3.2417197227478027, "learning_rate": 3.5572551617538078e-06, "loss": 0.5146, "step": 270 }, { "epoch": 0.05181644359464627, "grad_norm": 2.6413793563842773, "learning_rate": 3.559604159579614e-06, "loss": 0.2155, "step": 271 }, { "epoch": 0.0520076481835564, "grad_norm": 2.974642753601074, "learning_rate": 3.56194450546013e-06, "loss": 0.3031, "step": 272 }, { "epoch": 0.05219885277246654, "grad_norm": 2.5996057987213135, "learning_rate": 3.5642762628960825e-06, "loss": 0.236, "step": 273 }, { "epoch": 0.052390057361376675, "grad_norm": 2.6430821418762207, "learning_rate": 3.566599494691667e-06, "loss": 0.1634, "step": 274 }, { "epoch": 0.052581261950286805, "grad_norm": 4.012979984283447, "learning_rate": 3.5689142629646885e-06, "loss": 0.1486, "step": 275 }, { "epoch": 0.05277246653919694, "grad_norm": 2.250370740890503, "learning_rate": 3.571220629156528e-06, "loss": 0.3647, "step": 276 }, { "epoch": 0.05296367112810708, "grad_norm": 2.271556854248047, "learning_rate": 3.5735186540419348e-06, "loss": 0.21, "step": 277 }, { "epoch": 0.05315487571701721, "grad_norm": 1.3703874349594116, "learning_rate": 3.5758083977386276e-06, "loss": 0.2091, "step": 278 }, { "epoch": 0.05334608030592734, "grad_norm": 2.649837017059326, "learning_rate": 3.5780899197167356e-06, "loss": 0.1355, "step": 279 }, { "epoch": 0.05353728489483748, "grad_norm": 3.166032552719116, "learning_rate": 3.580363278808066e-06, "loss": 0.4169, "step": 280 }, { "epoch": 0.05372848948374761, "grad_norm": 4.7268290519714355, "learning_rate": 3.582628533215206e-06, "loss": 0.2532, "step": 281 }, { "epoch": 0.053919694072657745, "grad_norm": 2.934070587158203, "learning_rate": 3.584885740520463e-06, "loss": 0.4672, "step": 282 }, { "epoch": 0.054110898661567874, "grad_norm": 2.0285329818725586, "learning_rate": 3.5871349576946483e-06, "loss": 0.3499, "step": 283 }, { "epoch": 0.05430210325047801, "grad_norm": 2.122689962387085, "learning_rate": 3.5893762411057063e-06, "loss": 0.2839, "step": 284 }, { "epoch": 0.05449330783938815, "grad_norm": 3.105699300765991, "learning_rate": 3.5916096465271888e-06, "loss": 0.4167, "step": 285 }, { "epoch": 0.054684512428298276, "grad_norm": 2.428645372390747, "learning_rate": 3.593835229146582e-06, "loss": 0.2457, "step": 286 }, { "epoch": 0.05487571701720841, "grad_norm": 2.716440439224243, "learning_rate": 3.5960530435734936e-06, "loss": 0.252, "step": 287 }, { "epoch": 0.05506692160611855, "grad_norm": 2.986215829849243, "learning_rate": 3.598263143847686e-06, "loss": 0.1737, "step": 288 }, { "epoch": 0.05525812619502868, "grad_norm": 1.8082088232040405, "learning_rate": 3.600465583446982e-06, "loss": 0.1929, "step": 289 }, { "epoch": 0.055449330783938815, "grad_norm": 2.961946964263916, "learning_rate": 3.602660415295029e-06, "loss": 0.2653, "step": 290 }, { "epoch": 0.05564053537284895, "grad_norm": 3.455840587615967, "learning_rate": 3.6048476917689273e-06, "loss": 0.3367, "step": 291 }, { "epoch": 0.05583173996175908, "grad_norm": 1.9973338842391968, "learning_rate": 3.607027464706731e-06, "loss": 0.1757, "step": 292 }, { "epoch": 0.05602294455066922, "grad_norm": 3.1876704692840576, "learning_rate": 3.609199785414821e-06, "loss": 0.3237, "step": 293 }, { "epoch": 0.05621414913957935, "grad_norm": 2.5493903160095215, "learning_rate": 3.6113647046751477e-06, "loss": 0.1988, "step": 294 }, { "epoch": 0.05640535372848948, "grad_norm": 3.4992916584014893, "learning_rate": 3.6135222727523593e-06, "loss": 0.3859, "step": 295 }, { "epoch": 0.05659655831739962, "grad_norm": 1.860656499862671, "learning_rate": 3.6156725394007973e-06, "loss": 0.2747, "step": 296 }, { "epoch": 0.05678776290630975, "grad_norm": 2.110995054244995, "learning_rate": 3.6178155538713884e-06, "loss": 0.2186, "step": 297 }, { "epoch": 0.056978967495219884, "grad_norm": 2.3457248210906982, "learning_rate": 3.6199513649184063e-06, "loss": 0.2764, "step": 298 }, { "epoch": 0.05717017208413002, "grad_norm": 2.250549554824829, "learning_rate": 3.622080020806132e-06, "loss": 0.1175, "step": 299 }, { "epoch": 0.05736137667304015, "grad_norm": 3.1696152687072754, "learning_rate": 3.6242015693153945e-06, "loss": 0.1506, "step": 300 }, { "epoch": 0.057552581261950286, "grad_norm": 2.9207370281219482, "learning_rate": 3.62631605775001e-06, "loss": 0.3109, "step": 301 }, { "epoch": 0.05774378585086042, "grad_norm": 2.2986581325531006, "learning_rate": 3.6284235329431073e-06, "loss": 0.2265, "step": 302 }, { "epoch": 0.05793499043977055, "grad_norm": 2.269641876220703, "learning_rate": 3.6305240412633507e-06, "loss": 0.1851, "step": 303 }, { "epoch": 0.05812619502868069, "grad_norm": 2.343125104904175, "learning_rate": 3.632617628621067e-06, "loss": 0.1895, "step": 304 }, { "epoch": 0.058317399617590825, "grad_norm": 3.1341187953948975, "learning_rate": 3.634704340474261e-06, "loss": 0.3916, "step": 305 }, { "epoch": 0.058508604206500954, "grad_norm": 1.92935311794281, "learning_rate": 3.6367842218345383e-06, "loss": 0.1707, "step": 306 }, { "epoch": 0.05869980879541109, "grad_norm": 2.106807231903076, "learning_rate": 3.6388573172729357e-06, "loss": 0.3863, "step": 307 }, { "epoch": 0.05889101338432123, "grad_norm": 0.9721649885177612, "learning_rate": 3.640923670925647e-06, "loss": 0.3013, "step": 308 }, { "epoch": 0.059082217973231356, "grad_norm": 2.205176830291748, "learning_rate": 3.642983326499665e-06, "loss": 0.2873, "step": 309 }, { "epoch": 0.05927342256214149, "grad_norm": 2.2065727710723877, "learning_rate": 3.6450363272783236e-06, "loss": 0.2406, "step": 310 }, { "epoch": 0.05946462715105163, "grad_norm": 5.1225361824035645, "learning_rate": 3.647082716126761e-06, "loss": 0.143, "step": 311 }, { "epoch": 0.05965583173996176, "grad_norm": 3.16314435005188, "learning_rate": 3.6491225354972893e-06, "loss": 0.1193, "step": 312 }, { "epoch": 0.059847036328871894, "grad_norm": 3.749077558517456, "learning_rate": 3.651155827434673e-06, "loss": 0.6098, "step": 313 }, { "epoch": 0.060038240917782024, "grad_norm": 3.1285479068756104, "learning_rate": 3.6531826335813365e-06, "loss": 0.6629, "step": 314 }, { "epoch": 0.06022944550669216, "grad_norm": 3.448866128921509, "learning_rate": 3.6552029951824756e-06, "loss": 0.3374, "step": 315 }, { "epoch": 0.060420650095602296, "grad_norm": 1.7784894704818726, "learning_rate": 3.6572169530910974e-06, "loss": 0.1474, "step": 316 }, { "epoch": 0.060611854684512426, "grad_norm": 2.4160869121551514, "learning_rate": 3.6592245477729737e-06, "loss": 0.3054, "step": 317 }, { "epoch": 0.06080305927342256, "grad_norm": 3.9507014751434326, "learning_rate": 3.661225819311523e-06, "loss": 0.2749, "step": 318 }, { "epoch": 0.0609942638623327, "grad_norm": 1.2071470022201538, "learning_rate": 3.66322080741261e-06, "loss": 0.1813, "step": 319 }, { "epoch": 0.06118546845124283, "grad_norm": 4.6793646812438965, "learning_rate": 3.665209551409273e-06, "loss": 0.3954, "step": 320 }, { "epoch": 0.061376673040152964, "grad_norm": 3.9796018600463867, "learning_rate": 3.6671920902663776e-06, "loss": 0.5827, "step": 321 }, { "epoch": 0.0615678776290631, "grad_norm": 3.5386040210723877, "learning_rate": 3.669168462585197e-06, "loss": 0.5768, "step": 322 }, { "epoch": 0.06175908221797323, "grad_norm": 1.5878996849060059, "learning_rate": 3.67113870660792e-06, "loss": 0.203, "step": 323 }, { "epoch": 0.061950286806883366, "grad_norm": 1.8842241764068604, "learning_rate": 3.673102860222095e-06, "loss": 0.1778, "step": 324 }, { "epoch": 0.0621414913957935, "grad_norm": 2.419157028198242, "learning_rate": 3.675060960964998e-06, "loss": 0.1822, "step": 325 }, { "epoch": 0.06233269598470363, "grad_norm": 3.7156035900115967, "learning_rate": 3.6770130460279383e-06, "loss": 0.6939, "step": 326 }, { "epoch": 0.06252390057361376, "grad_norm": 2.3214938640594482, "learning_rate": 3.678959152260497e-06, "loss": 0.3233, "step": 327 }, { "epoch": 0.0627151051625239, "grad_norm": 6.248021602630615, "learning_rate": 3.6808993161747004e-06, "loss": 0.291, "step": 328 }, { "epoch": 0.06290630975143403, "grad_norm": 3.342606782913208, "learning_rate": 3.682833573949131e-06, "loss": 0.1396, "step": 329 }, { "epoch": 0.06309751434034416, "grad_norm": 2.0580525398254395, "learning_rate": 3.6847619614329755e-06, "loss": 0.0865, "step": 330 }, { "epoch": 0.0632887189292543, "grad_norm": 2.6703569889068604, "learning_rate": 3.68668451415001e-06, "loss": 0.1062, "step": 331 }, { "epoch": 0.06347992351816444, "grad_norm": 3.680206775665283, "learning_rate": 3.6886012673025277e-06, "loss": 0.7988, "step": 332 }, { "epoch": 0.06367112810707456, "grad_norm": 2.1449387073516846, "learning_rate": 3.6905122557752073e-06, "loss": 0.2702, "step": 333 }, { "epoch": 0.06386233269598471, "grad_norm": 3.1063783168792725, "learning_rate": 3.6924175141389183e-06, "loss": 0.2693, "step": 334 }, { "epoch": 0.06405353728489484, "grad_norm": 2.218688488006592, "learning_rate": 3.6943170766544804e-06, "loss": 0.3532, "step": 335 }, { "epoch": 0.06424474187380497, "grad_norm": 2.191000461578369, "learning_rate": 3.6962109772763544e-06, "loss": 0.2497, "step": 336 }, { "epoch": 0.06443594646271511, "grad_norm": 2.405402898788452, "learning_rate": 3.6980992496562857e-06, "loss": 0.1078, "step": 337 }, { "epoch": 0.06462715105162524, "grad_norm": 2.5214316844940186, "learning_rate": 3.699981927146893e-06, "loss": 0.1989, "step": 338 }, { "epoch": 0.06481835564053537, "grad_norm": 3.761211633682251, "learning_rate": 3.701859042805201e-06, "loss": 0.4005, "step": 339 }, { "epoch": 0.06500956022944551, "grad_norm": 1.934031367301941, "learning_rate": 3.7037306293961262e-06, "loss": 0.4611, "step": 340 }, { "epoch": 0.06520076481835564, "grad_norm": 2.1387293338775635, "learning_rate": 3.7055967193959047e-06, "loss": 0.2271, "step": 341 }, { "epoch": 0.06539196940726577, "grad_norm": 2.621551990509033, "learning_rate": 3.7074573449954767e-06, "loss": 0.1125, "step": 342 }, { "epoch": 0.06558317399617591, "grad_norm": 1.7968058586120605, "learning_rate": 3.709312538103816e-06, "loss": 0.113, "step": 343 }, { "epoch": 0.06577437858508604, "grad_norm": 2.509631395339966, "learning_rate": 3.7111623303512168e-06, "loss": 0.2285, "step": 344 }, { "epoch": 0.06596558317399617, "grad_norm": 2.8840248584747314, "learning_rate": 3.7130067530925253e-06, "loss": 0.2937, "step": 345 }, { "epoch": 0.06615678776290632, "grad_norm": 3.3847973346710205, "learning_rate": 3.7148458374103296e-06, "loss": 0.1772, "step": 346 }, { "epoch": 0.06634799235181645, "grad_norm": 2.8074636459350586, "learning_rate": 3.716679614118107e-06, "loss": 0.3393, "step": 347 }, { "epoch": 0.06653919694072657, "grad_norm": 9.130067825317383, "learning_rate": 3.7185081137633166e-06, "loss": 1.0307, "step": 348 }, { "epoch": 0.06673040152963672, "grad_norm": 1.991129994392395, "learning_rate": 3.7203313666304545e-06, "loss": 0.1372, "step": 349 }, { "epoch": 0.06692160611854685, "grad_norm": 3.1855194568634033, "learning_rate": 3.722149402744063e-06, "loss": 0.1485, "step": 350 }, { "epoch": 0.06711281070745698, "grad_norm": 3.3848493099212646, "learning_rate": 3.7239622518716983e-06, "loss": 0.5177, "step": 351 }, { "epoch": 0.0673040152963671, "grad_norm": 3.134887933731079, "learning_rate": 3.725769943526854e-06, "loss": 0.6924, "step": 352 }, { "epoch": 0.06749521988527725, "grad_norm": 3.605255126953125, "learning_rate": 3.727572506971844e-06, "loss": 0.4911, "step": 353 }, { "epoch": 0.06768642447418738, "grad_norm": 2.0924346446990967, "learning_rate": 3.7293699712206464e-06, "loss": 0.2743, "step": 354 }, { "epoch": 0.06787762906309751, "grad_norm": 3.687945604324341, "learning_rate": 3.731162365041703e-06, "loss": 0.1613, "step": 355 }, { "epoch": 0.06806883365200765, "grad_norm": 5.654581546783447, "learning_rate": 3.7329497169606825e-06, "loss": 0.2055, "step": 356 }, { "epoch": 0.06826003824091778, "grad_norm": 1.5821623802185059, "learning_rate": 3.734732055263207e-06, "loss": 0.1343, "step": 357 }, { "epoch": 0.06845124282982791, "grad_norm": 3.3485209941864014, "learning_rate": 3.736509407997536e-06, "loss": 0.6085, "step": 358 }, { "epoch": 0.06864244741873805, "grad_norm": 3.3173418045043945, "learning_rate": 3.738281802977213e-06, "loss": 0.3427, "step": 359 }, { "epoch": 0.06883365200764818, "grad_norm": 2.4630320072174072, "learning_rate": 3.7400492677836824e-06, "loss": 0.1692, "step": 360 }, { "epoch": 0.06902485659655831, "grad_norm": 2.6465280055999756, "learning_rate": 3.7418118297688577e-06, "loss": 0.1822, "step": 361 }, { "epoch": 0.06921606118546846, "grad_norm": 2.0654947757720947, "learning_rate": 3.743569516057668e-06, "loss": 0.091, "step": 362 }, { "epoch": 0.06940726577437858, "grad_norm": 3.139086961746216, "learning_rate": 3.7453223535505566e-06, "loss": 0.2461, "step": 363 }, { "epoch": 0.06959847036328871, "grad_norm": 3.0427610874176025, "learning_rate": 3.747070368925958e-06, "loss": 0.3526, "step": 364 }, { "epoch": 0.06978967495219886, "grad_norm": 3.79207181930542, "learning_rate": 3.7488135886427275e-06, "loss": 0.4793, "step": 365 }, { "epoch": 0.06998087954110899, "grad_norm": 1.8372082710266113, "learning_rate": 3.750552038942548e-06, "loss": 0.1645, "step": 366 }, { "epoch": 0.07017208413001912, "grad_norm": 2.4285526275634766, "learning_rate": 3.7522857458523022e-06, "loss": 0.1195, "step": 367 }, { "epoch": 0.07036328871892926, "grad_norm": 2.658700942993164, "learning_rate": 3.7540147351864037e-06, "loss": 0.2826, "step": 368 }, { "epoch": 0.07055449330783939, "grad_norm": 3.1620702743530273, "learning_rate": 3.7557390325491095e-06, "loss": 0.2795, "step": 369 }, { "epoch": 0.07074569789674952, "grad_norm": 2.690842628479004, "learning_rate": 3.757458663336794e-06, "loss": 0.4693, "step": 370 }, { "epoch": 0.07093690248565966, "grad_norm": 2.008033275604248, "learning_rate": 3.7591736527401917e-06, "loss": 0.2736, "step": 371 }, { "epoch": 0.07112810707456979, "grad_norm": 3.240271806716919, "learning_rate": 3.760884025746611e-06, "loss": 0.3499, "step": 372 }, { "epoch": 0.07131931166347992, "grad_norm": 1.3944287300109863, "learning_rate": 3.7625898071421226e-06, "loss": 0.0905, "step": 373 }, { "epoch": 0.07151051625239006, "grad_norm": 2.2824788093566895, "learning_rate": 3.7642910215137073e-06, "loss": 0.2282, "step": 374 }, { "epoch": 0.07170172084130019, "grad_norm": 4.351092338562012, "learning_rate": 3.765987693251391e-06, "loss": 0.3069, "step": 375 }, { "epoch": 0.07189292543021032, "grad_norm": 4.460353851318359, "learning_rate": 3.767679846550338e-06, "loss": 0.6941, "step": 376 }, { "epoch": 0.07208413001912047, "grad_norm": 2.765394926071167, "learning_rate": 3.76936750541292e-06, "loss": 0.4267, "step": 377 }, { "epoch": 0.0722753346080306, "grad_norm": 2.731600284576416, "learning_rate": 3.771050693650764e-06, "loss": 0.3801, "step": 378 }, { "epoch": 0.07246653919694072, "grad_norm": 1.8360090255737305, "learning_rate": 3.772729434886761e-06, "loss": 0.2339, "step": 379 }, { "epoch": 0.07265774378585087, "grad_norm": 1.9759913682937622, "learning_rate": 3.774403752557064e-06, "loss": 0.3144, "step": 380 }, { "epoch": 0.072848948374761, "grad_norm": 1.7601401805877686, "learning_rate": 3.7760736699130425e-06, "loss": 0.1128, "step": 381 }, { "epoch": 0.07304015296367113, "grad_norm": 4.08651065826416, "learning_rate": 3.7777392100232265e-06, "loss": 0.4442, "step": 382 }, { "epoch": 0.07323135755258126, "grad_norm": 3.4187510013580322, "learning_rate": 3.7794003957752135e-06, "loss": 0.4448, "step": 383 }, { "epoch": 0.0734225621414914, "grad_norm": 1.9716267585754395, "learning_rate": 3.781057249877561e-06, "loss": 0.3026, "step": 384 }, { "epoch": 0.07361376673040153, "grad_norm": 1.3850023746490479, "learning_rate": 3.7827097948616442e-06, "loss": 0.2962, "step": 385 }, { "epoch": 0.07380497131931166, "grad_norm": 3.1978061199188232, "learning_rate": 3.784358053083497e-06, "loss": 0.4785, "step": 386 }, { "epoch": 0.0739961759082218, "grad_norm": 2.065589666366577, "learning_rate": 3.786002046725626e-06, "loss": 0.1068, "step": 387 }, { "epoch": 0.07418738049713193, "grad_norm": 2.613898277282715, "learning_rate": 3.7876417977988023e-06, "loss": 0.2658, "step": 388 }, { "epoch": 0.07437858508604206, "grad_norm": 2.723231077194214, "learning_rate": 3.7892773281438285e-06, "loss": 0.451, "step": 389 }, { "epoch": 0.0745697896749522, "grad_norm": 11.916027069091797, "learning_rate": 3.7909086594332863e-06, "loss": 0.4356, "step": 390 }, { "epoch": 0.07476099426386233, "grad_norm": 1.5050503015518188, "learning_rate": 3.792535813173256e-06, "loss": 0.1137, "step": 391 }, { "epoch": 0.07495219885277246, "grad_norm": 2.0423741340637207, "learning_rate": 3.7941588107050227e-06, "loss": 0.228, "step": 392 }, { "epoch": 0.0751434034416826, "grad_norm": 5.4225969314575195, "learning_rate": 3.795777673206752e-06, "loss": 0.3271, "step": 393 }, { "epoch": 0.07533460803059273, "grad_norm": 2.6373708248138428, "learning_rate": 3.7973924216951487e-06, "loss": 0.2295, "step": 394 }, { "epoch": 0.07552581261950286, "grad_norm": 7.188790798187256, "learning_rate": 3.7990030770270936e-06, "loss": 0.2779, "step": 395 }, { "epoch": 0.075717017208413, "grad_norm": 2.465902805328369, "learning_rate": 3.800609659901264e-06, "loss": 0.2997, "step": 396 }, { "epoch": 0.07590822179732314, "grad_norm": 2.4251866340637207, "learning_rate": 3.802212190859722e-06, "loss": 0.3933, "step": 397 }, { "epoch": 0.07609942638623327, "grad_norm": 2.4671387672424316, "learning_rate": 3.803810690289499e-06, "loss": 0.2564, "step": 398 }, { "epoch": 0.07629063097514341, "grad_norm": 2.4797089099884033, "learning_rate": 3.8054051784241454e-06, "loss": 0.1865, "step": 399 }, { "epoch": 0.07648183556405354, "grad_norm": 2.997403860092163, "learning_rate": 3.8069956753452695e-06, "loss": 0.3207, "step": 400 }, { "epoch": 0.07667304015296367, "grad_norm": 1.8929557800292969, "learning_rate": 3.808582200984058e-06, "loss": 0.1249, "step": 401 }, { "epoch": 0.07686424474187381, "grad_norm": 3.276320695877075, "learning_rate": 3.8101647751227683e-06, "loss": 0.4781, "step": 402 }, { "epoch": 0.07705544933078394, "grad_norm": 1.9976557493209839, "learning_rate": 3.811743417396214e-06, "loss": 0.184, "step": 403 }, { "epoch": 0.07724665391969407, "grad_norm": 3.980438232421875, "learning_rate": 3.813318147293226e-06, "loss": 0.4318, "step": 404 }, { "epoch": 0.07743785850860421, "grad_norm": 2.1951019763946533, "learning_rate": 3.8148889841580915e-06, "loss": 0.2133, "step": 405 }, { "epoch": 0.07762906309751434, "grad_norm": 4.515348434448242, "learning_rate": 3.816455947191985e-06, "loss": 0.5805, "step": 406 }, { "epoch": 0.07782026768642447, "grad_norm": 2.6555490493774414, "learning_rate": 3.818019055454375e-06, "loss": 0.3141, "step": 407 }, { "epoch": 0.07801147227533461, "grad_norm": 1.6512001752853394, "learning_rate": 3.819578327864414e-06, "loss": 0.2455, "step": 408 }, { "epoch": 0.07820267686424474, "grad_norm": 5.637848854064941, "learning_rate": 3.821133783202312e-06, "loss": 0.335, "step": 409 }, { "epoch": 0.07839388145315487, "grad_norm": 1.5848608016967773, "learning_rate": 3.8226854401106974e-06, "loss": 0.2007, "step": 410 }, { "epoch": 0.078585086042065, "grad_norm": 2.4538025856018066, "learning_rate": 3.824233317095951e-06, "loss": 0.309, "step": 411 }, { "epoch": 0.07877629063097515, "grad_norm": 3.901400089263916, "learning_rate": 3.82577743252954e-06, "loss": 0.1759, "step": 412 }, { "epoch": 0.07896749521988528, "grad_norm": 3.340503454208374, "learning_rate": 3.8273178046493155e-06, "loss": 0.4415, "step": 413 }, { "epoch": 0.0791586998087954, "grad_norm": 3.1490180492401123, "learning_rate": 3.828854451560812e-06, "loss": 0.3878, "step": 414 }, { "epoch": 0.07934990439770555, "grad_norm": 1.568618893623352, "learning_rate": 3.830387391238524e-06, "loss": 0.2918, "step": 415 }, { "epoch": 0.07954110898661568, "grad_norm": 2.177776336669922, "learning_rate": 3.831916641527165e-06, "loss": 0.1442, "step": 416 }, { "epoch": 0.0797323135755258, "grad_norm": 2.933709144592285, "learning_rate": 3.833442220142912e-06, "loss": 0.4187, "step": 417 }, { "epoch": 0.07992351816443595, "grad_norm": 1.9458264112472534, "learning_rate": 3.834964144674645e-06, "loss": 0.0881, "step": 418 }, { "epoch": 0.08011472275334608, "grad_norm": 2.3854639530181885, "learning_rate": 3.836482432585154e-06, "loss": 0.1411, "step": 419 }, { "epoch": 0.08030592734225621, "grad_norm": 2.503591299057007, "learning_rate": 3.837997101212351e-06, "loss": 0.2308, "step": 420 }, { "epoch": 0.08049713193116635, "grad_norm": 2.7284929752349854, "learning_rate": 3.839508167770447e-06, "loss": 0.5046, "step": 421 }, { "epoch": 0.08068833652007648, "grad_norm": 2.8158528804779053, "learning_rate": 3.8410156493511355e-06, "loss": 0.4343, "step": 422 }, { "epoch": 0.08087954110898661, "grad_norm": 2.605060577392578, "learning_rate": 3.842519562924747e-06, "loss": 0.242, "step": 423 }, { "epoch": 0.08107074569789675, "grad_norm": 2.482698440551758, "learning_rate": 3.8440199253413985e-06, "loss": 0.2665, "step": 424 }, { "epoch": 0.08126195028680688, "grad_norm": 2.105672836303711, "learning_rate": 3.8455167533321224e-06, "loss": 0.1729, "step": 425 }, { "epoch": 0.08145315487571701, "grad_norm": 4.377143383026123, "learning_rate": 3.847010063509991e-06, "loss": 0.5099, "step": 426 }, { "epoch": 0.08164435946462716, "grad_norm": 1.9579622745513916, "learning_rate": 3.848499872371217e-06, "loss": 0.1786, "step": 427 }, { "epoch": 0.08183556405353729, "grad_norm": 2.5516083240509033, "learning_rate": 3.849986196296252e-06, "loss": 0.1474, "step": 428 }, { "epoch": 0.08202676864244741, "grad_norm": 2.7220704555511475, "learning_rate": 3.851469051550867e-06, "loss": 0.3341, "step": 429 }, { "epoch": 0.08221797323135756, "grad_norm": 1.9412697553634644, "learning_rate": 3.852948454287213e-06, "loss": 0.163, "step": 430 }, { "epoch": 0.08240917782026769, "grad_norm": 2.0283761024475098, "learning_rate": 3.854424420544888e-06, "loss": 0.0943, "step": 431 }, { "epoch": 0.08260038240917782, "grad_norm": 2.609199285507202, "learning_rate": 3.855896966251971e-06, "loss": 0.246, "step": 432 }, { "epoch": 0.08279158699808796, "grad_norm": 1.5453547239303589, "learning_rate": 3.857366107226057e-06, "loss": 0.224, "step": 433 }, { "epoch": 0.08298279158699809, "grad_norm": 1.983446717262268, "learning_rate": 3.85883185917528e-06, "loss": 0.2313, "step": 434 }, { "epoch": 0.08317399617590822, "grad_norm": 2.101289749145508, "learning_rate": 3.860294237699313e-06, "loss": 0.1372, "step": 435 }, { "epoch": 0.08336520076481836, "grad_norm": 1.78105628490448, "learning_rate": 3.861753258290372e-06, "loss": 0.1718, "step": 436 }, { "epoch": 0.08355640535372849, "grad_norm": 2.88455867767334, "learning_rate": 3.863208936334194e-06, "loss": 0.1571, "step": 437 }, { "epoch": 0.08374760994263862, "grad_norm": 2.6141340732574463, "learning_rate": 3.864661287111015e-06, "loss": 0.2811, "step": 438 }, { "epoch": 0.08393881453154876, "grad_norm": 2.1665003299713135, "learning_rate": 3.866110325796531e-06, "loss": 0.1615, "step": 439 }, { "epoch": 0.0841300191204589, "grad_norm": 2.2355403900146484, "learning_rate": 3.86755606746285e-06, "loss": 0.2134, "step": 440 }, { "epoch": 0.08432122370936902, "grad_norm": 25.508962631225586, "learning_rate": 3.868998527079432e-06, "loss": 0.5128, "step": 441 }, { "epoch": 0.08451242829827915, "grad_norm": 3.2912681102752686, "learning_rate": 3.870437719514017e-06, "loss": 0.2624, "step": 442 }, { "epoch": 0.0847036328871893, "grad_norm": 2.363339900970459, "learning_rate": 3.871873659533549e-06, "loss": 0.1605, "step": 443 }, { "epoch": 0.08489483747609942, "grad_norm": 4.588367938995361, "learning_rate": 3.873306361805082e-06, "loss": 0.3243, "step": 444 }, { "epoch": 0.08508604206500955, "grad_norm": 4.233880519866943, "learning_rate": 3.874735840896679e-06, "loss": 0.6475, "step": 445 }, { "epoch": 0.0852772466539197, "grad_norm": 2.803955316543579, "learning_rate": 3.876162111278302e-06, "loss": 0.2044, "step": 446 }, { "epoch": 0.08546845124282983, "grad_norm": 2.126622438430786, "learning_rate": 3.877585187322691e-06, "loss": 0.1412, "step": 447 }, { "epoch": 0.08565965583173996, "grad_norm": 2.711604356765747, "learning_rate": 3.879005083306229e-06, "loss": 0.4514, "step": 448 }, { "epoch": 0.0858508604206501, "grad_norm": 1.7695578336715698, "learning_rate": 3.88042181340981e-06, "loss": 0.0854, "step": 449 }, { "epoch": 0.08604206500956023, "grad_norm": 3.0042665004730225, "learning_rate": 3.881835391719679e-06, "loss": 0.1316, "step": 450 }, { "epoch": 0.08623326959847036, "grad_norm": 4.021963119506836, "learning_rate": 3.883245832228278e-06, "loss": 0.5587, "step": 451 }, { "epoch": 0.0864244741873805, "grad_norm": 2.566257953643799, "learning_rate": 3.884653148835076e-06, "loss": 0.5083, "step": 452 }, { "epoch": 0.08661567877629063, "grad_norm": 2.0462639331817627, "learning_rate": 3.8860573553473914e-06, "loss": 0.161, "step": 453 }, { "epoch": 0.08680688336520076, "grad_norm": 2.4460551738739014, "learning_rate": 3.887458465481203e-06, "loss": 0.1692, "step": 454 }, { "epoch": 0.0869980879541109, "grad_norm": 2.9268269538879395, "learning_rate": 3.888856492861955e-06, "loss": 0.3161, "step": 455 }, { "epoch": 0.08718929254302103, "grad_norm": 2.3640105724334717, "learning_rate": 3.890251451025352e-06, "loss": 0.1184, "step": 456 }, { "epoch": 0.08738049713193116, "grad_norm": 3.2469921112060547, "learning_rate": 3.8916433534181465e-06, "loss": 0.3819, "step": 457 }, { "epoch": 0.0875717017208413, "grad_norm": 3.044752359390259, "learning_rate": 3.893032213398913e-06, "loss": 0.5221, "step": 458 }, { "epoch": 0.08776290630975143, "grad_norm": 1.492080807685852, "learning_rate": 3.894418044238823e-06, "loss": 0.1747, "step": 459 }, { "epoch": 0.08795411089866156, "grad_norm": 1.967621922492981, "learning_rate": 3.8958008591224e-06, "loss": 0.4279, "step": 460 }, { "epoch": 0.08814531548757171, "grad_norm": 4.419346809387207, "learning_rate": 3.897180671148275e-06, "loss": 0.222, "step": 461 }, { "epoch": 0.08833652007648184, "grad_norm": 3.055210590362549, "learning_rate": 3.898557493329932e-06, "loss": 0.203, "step": 462 }, { "epoch": 0.08852772466539197, "grad_norm": 2.82248854637146, "learning_rate": 3.899931338596441e-06, "loss": 0.2677, "step": 463 }, { "epoch": 0.08871892925430211, "grad_norm": 4.254226207733154, "learning_rate": 3.901302219793192e-06, "loss": 0.6093, "step": 464 }, { "epoch": 0.08891013384321224, "grad_norm": 3.696042537689209, "learning_rate": 3.902670149682608e-06, "loss": 0.6688, "step": 465 }, { "epoch": 0.08910133843212237, "grad_norm": 2.048828601837158, "learning_rate": 3.904035140944864e-06, "loss": 0.0932, "step": 466 }, { "epoch": 0.08929254302103251, "grad_norm": 1.4884488582611084, "learning_rate": 3.905397206178592e-06, "loss": 0.1926, "step": 467 }, { "epoch": 0.08948374760994264, "grad_norm": 4.860929012298584, "learning_rate": 3.906756357901574e-06, "loss": 0.2957, "step": 468 }, { "epoch": 0.08967495219885277, "grad_norm": 3.0578386783599854, "learning_rate": 3.908112608551437e-06, "loss": 0.2792, "step": 469 }, { "epoch": 0.08986615678776291, "grad_norm": 2.4391660690307617, "learning_rate": 3.9094659704863346e-06, "loss": 0.3934, "step": 470 }, { "epoch": 0.09005736137667304, "grad_norm": 3.1707816123962402, "learning_rate": 3.910816455985621e-06, "loss": 0.4196, "step": 471 }, { "epoch": 0.09024856596558317, "grad_norm": 3.7489700317382812, "learning_rate": 3.912164077250522e-06, "loss": 0.5584, "step": 472 }, { "epoch": 0.0904397705544933, "grad_norm": 3.4774014949798584, "learning_rate": 3.9135088464047945e-06, "loss": 0.3015, "step": 473 }, { "epoch": 0.09063097514340344, "grad_norm": 3.203939914703369, "learning_rate": 3.9148507754953815e-06, "loss": 0.1764, "step": 474 }, { "epoch": 0.09082217973231357, "grad_norm": 5.508978366851807, "learning_rate": 3.91618987649306e-06, "loss": 0.1797, "step": 475 }, { "epoch": 0.0910133843212237, "grad_norm": 3.0496325492858887, "learning_rate": 3.917526161293082e-06, "loss": 0.2904, "step": 476 }, { "epoch": 0.09120458891013385, "grad_norm": 2.569993257522583, "learning_rate": 3.9188596417158075e-06, "loss": 0.2979, "step": 477 }, { "epoch": 0.09139579349904398, "grad_norm": 2.830799102783203, "learning_rate": 3.920190329507332e-06, "loss": 0.1373, "step": 478 }, { "epoch": 0.0915869980879541, "grad_norm": 1.5657826662063599, "learning_rate": 3.921518236340108e-06, "loss": 0.1339, "step": 479 }, { "epoch": 0.09177820267686425, "grad_norm": 1.5757731199264526, "learning_rate": 3.922843373813557e-06, "loss": 0.1992, "step": 480 }, { "epoch": 0.09196940726577438, "grad_norm": 3.687089443206787, "learning_rate": 3.924165753454686e-06, "loss": 0.1859, "step": 481 }, { "epoch": 0.09216061185468451, "grad_norm": 4.091752052307129, "learning_rate": 3.925485386718674e-06, "loss": 0.2528, "step": 482 }, { "epoch": 0.09235181644359465, "grad_norm": 1.9640034437179565, "learning_rate": 3.926802284989481e-06, "loss": 0.3403, "step": 483 }, { "epoch": 0.09254302103250478, "grad_norm": 2.387704849243164, "learning_rate": 3.928116459580432e-06, "loss": 0.4675, "step": 484 }, { "epoch": 0.09273422562141491, "grad_norm": 1.9029995203018188, "learning_rate": 3.9294279217347985e-06, "loss": 0.2266, "step": 485 }, { "epoch": 0.09292543021032505, "grad_norm": 3.2065725326538086, "learning_rate": 3.930736682626379e-06, "loss": 0.25, "step": 486 }, { "epoch": 0.09311663479923518, "grad_norm": 2.709934711456299, "learning_rate": 3.932042753360069e-06, "loss": 0.2074, "step": 487 }, { "epoch": 0.09330783938814531, "grad_norm": 3.0203776359558105, "learning_rate": 3.933346144972424e-06, "loss": 0.3794, "step": 488 }, { "epoch": 0.09349904397705545, "grad_norm": 3.989088296890259, "learning_rate": 3.934646868432223e-06, "loss": 0.3628, "step": 489 }, { "epoch": 0.09369024856596558, "grad_norm": 1.333524465560913, "learning_rate": 3.93594493464102e-06, "loss": 0.2219, "step": 490 }, { "epoch": 0.09388145315487571, "grad_norm": 2.184206247329712, "learning_rate": 3.937240354433692e-06, "loss": 0.4398, "step": 491 }, { "epoch": 0.09407265774378586, "grad_norm": 2.465756893157959, "learning_rate": 3.9385331385789845e-06, "loss": 0.1526, "step": 492 }, { "epoch": 0.09426386233269599, "grad_norm": 2.6147758960723877, "learning_rate": 3.939823297780045e-06, "loss": 0.1571, "step": 493 }, { "epoch": 0.09445506692160612, "grad_norm": 2.757384777069092, "learning_rate": 3.941110842674955e-06, "loss": 0.178, "step": 494 }, { "epoch": 0.09464627151051626, "grad_norm": 3.065559148788452, "learning_rate": 3.94239578383726e-06, "loss": 0.5259, "step": 495 }, { "epoch": 0.09483747609942639, "grad_norm": 3.7982611656188965, "learning_rate": 3.9436781317764865e-06, "loss": 0.6654, "step": 496 }, { "epoch": 0.09502868068833652, "grad_norm": 1.8163230419158936, "learning_rate": 3.944957896938659e-06, "loss": 0.2204, "step": 497 }, { "epoch": 0.09521988527724666, "grad_norm": 2.6650335788726807, "learning_rate": 3.946235089706813e-06, "loss": 0.2265, "step": 498 }, { "epoch": 0.09541108986615679, "grad_norm": 1.7298262119293213, "learning_rate": 3.947509720401494e-06, "loss": 0.0825, "step": 499 }, { "epoch": 0.09560229445506692, "grad_norm": 2.199256181716919, "learning_rate": 3.948781799281266e-06, "loss": 0.0587, "step": 500 }, { "epoch": 0.09560229445506692, "eval_runtime": 758.2728, "eval_samples_per_second": 2.023, "eval_steps_per_second": 0.253, "step": 500 }, { "epoch": 0.09579349904397706, "grad_norm": 2.560211658477783, "learning_rate": 3.950051336543202e-06, "loss": 0.3781, "step": 501 }, { "epoch": 0.09598470363288719, "grad_norm": 2.047006845474243, "learning_rate": 3.951318342323376e-06, "loss": 0.3203, "step": 502 }, { "epoch": 0.09617590822179732, "grad_norm": 2.891329050064087, "learning_rate": 3.952582826697346e-06, "loss": 0.3811, "step": 503 }, { "epoch": 0.09636711281070745, "grad_norm": 2.8886239528656006, "learning_rate": 3.953844799680639e-06, "loss": 0.269, "step": 504 }, { "epoch": 0.0965583173996176, "grad_norm": 1.302022099494934, "learning_rate": 3.955104271229223e-06, "loss": 0.0563, "step": 505 }, { "epoch": 0.09674952198852772, "grad_norm": 2.516767740249634, "learning_rate": 3.9563612512399815e-06, "loss": 0.1752, "step": 506 }, { "epoch": 0.09694072657743785, "grad_norm": 5.090005874633789, "learning_rate": 3.957615749551177e-06, "loss": 0.3137, "step": 507 }, { "epoch": 0.097131931166348, "grad_norm": 2.3271141052246094, "learning_rate": 3.958867775942918e-06, "loss": 0.2331, "step": 508 }, { "epoch": 0.09732313575525812, "grad_norm": 5.051674842834473, "learning_rate": 3.960117340137608e-06, "loss": 0.2622, "step": 509 }, { "epoch": 0.09751434034416825, "grad_norm": 2.13322114944458, "learning_rate": 3.96136445180041e-06, "loss": 0.1548, "step": 510 }, { "epoch": 0.0977055449330784, "grad_norm": 2.8355138301849365, "learning_rate": 3.962609120539684e-06, "loss": 0.1487, "step": 511 }, { "epoch": 0.09789674952198853, "grad_norm": 1.6841158866882324, "learning_rate": 3.963851355907436e-06, "loss": 0.1633, "step": 512 }, { "epoch": 0.09808795411089866, "grad_norm": 3.454113483428955, "learning_rate": 3.965091167399761e-06, "loss": 0.2414, "step": 513 }, { "epoch": 0.0982791586998088, "grad_norm": 3.103166103363037, "learning_rate": 3.966328564457273e-06, "loss": 0.2673, "step": 514 }, { "epoch": 0.09847036328871893, "grad_norm": 3.4055354595184326, "learning_rate": 3.967563556465537e-06, "loss": 0.434, "step": 515 }, { "epoch": 0.09866156787762906, "grad_norm": 2.7894411087036133, "learning_rate": 3.968796152755501e-06, "loss": 0.2902, "step": 516 }, { "epoch": 0.0988527724665392, "grad_norm": 2.1280155181884766, "learning_rate": 3.970026362603916e-06, "loss": 0.2733, "step": 517 }, { "epoch": 0.09904397705544933, "grad_norm": 3.4343643188476562, "learning_rate": 3.97125419523375e-06, "loss": 0.5206, "step": 518 }, { "epoch": 0.09923518164435946, "grad_norm": 2.7457900047302246, "learning_rate": 3.972479659814613e-06, "loss": 0.2337, "step": 519 }, { "epoch": 0.0994263862332696, "grad_norm": 3.7752184867858887, "learning_rate": 3.973702765463161e-06, "loss": 0.5257, "step": 520 }, { "epoch": 0.09961759082217973, "grad_norm": 2.813133478164673, "learning_rate": 3.974923521243501e-06, "loss": 0.2109, "step": 521 }, { "epoch": 0.09980879541108986, "grad_norm": 2.6638405323028564, "learning_rate": 3.976141936167601e-06, "loss": 0.2409, "step": 522 }, { "epoch": 0.1, "grad_norm": 2.6184728145599365, "learning_rate": 3.9773580191956855e-06, "loss": 0.4386, "step": 523 }, { "epoch": 0.10019120458891013, "grad_norm": 2.8022634983062744, "learning_rate": 3.978571779236627e-06, "loss": 0.2676, "step": 524 }, { "epoch": 0.10038240917782026, "grad_norm": 3.0874242782592773, "learning_rate": 3.979783225148348e-06, "loss": 0.1031, "step": 525 }, { "epoch": 0.10057361376673041, "grad_norm": 2.6263234615325928, "learning_rate": 3.980992365738197e-06, "loss": 0.3226, "step": 526 }, { "epoch": 0.10076481835564054, "grad_norm": 3.469144821166992, "learning_rate": 3.982199209763339e-06, "loss": 0.1735, "step": 527 }, { "epoch": 0.10095602294455067, "grad_norm": 3.713228940963745, "learning_rate": 3.983403765931139e-06, "loss": 0.2317, "step": 528 }, { "epoch": 0.10114722753346081, "grad_norm": 4.205535411834717, "learning_rate": 3.98460604289953e-06, "loss": 0.1452, "step": 529 }, { "epoch": 0.10133843212237094, "grad_norm": 2.1234936714172363, "learning_rate": 3.985806049277395e-06, "loss": 0.1145, "step": 530 }, { "epoch": 0.10152963671128107, "grad_norm": 2.30922794342041, "learning_rate": 3.987003793624931e-06, "loss": 0.144, "step": 531 }, { "epoch": 0.10172084130019121, "grad_norm": 1.6173641681671143, "learning_rate": 3.98819928445402e-06, "loss": 0.3568, "step": 532 }, { "epoch": 0.10191204588910134, "grad_norm": 3.590470552444458, "learning_rate": 3.989392530228588e-06, "loss": 0.4316, "step": 533 }, { "epoch": 0.10210325047801147, "grad_norm": 3.209644079208374, "learning_rate": 3.990583539364967e-06, "loss": 0.4416, "step": 534 }, { "epoch": 0.1022944550669216, "grad_norm": 3.220745801925659, "learning_rate": 3.99177232023225e-06, "loss": 0.1935, "step": 535 }, { "epoch": 0.10248565965583174, "grad_norm": 1.88006591796875, "learning_rate": 3.992958881152644e-06, "loss": 0.2317, "step": 536 }, { "epoch": 0.10267686424474187, "grad_norm": 1.5669440031051636, "learning_rate": 3.9941432304018205e-06, "loss": 0.0725, "step": 537 }, { "epoch": 0.102868068833652, "grad_norm": 2.0758368968963623, "learning_rate": 3.995325376209261e-06, "loss": 0.2837, "step": 538 }, { "epoch": 0.10305927342256214, "grad_norm": 3.3589015007019043, "learning_rate": 3.996505326758601e-06, "loss": 0.5328, "step": 539 }, { "epoch": 0.10325047801147227, "grad_norm": 2.2015810012817383, "learning_rate": 3.997683090187967e-06, "loss": 0.3724, "step": 540 }, { "epoch": 0.1034416826003824, "grad_norm": 3.714031934738159, "learning_rate": 3.998858674590317e-06, "loss": 0.2856, "step": 541 }, { "epoch": 0.10363288718929255, "grad_norm": 2.511425018310547, "learning_rate": 4.000032088013775e-06, "loss": 0.2015, "step": 542 }, { "epoch": 0.10382409177820268, "grad_norm": 2.7374191284179688, "learning_rate": 4.001203338461952e-06, "loss": 0.1779, "step": 543 }, { "epoch": 0.1040152963671128, "grad_norm": 3.1923158168792725, "learning_rate": 4.002372433894288e-06, "loss": 0.2595, "step": 544 }, { "epoch": 0.10420650095602295, "grad_norm": 2.5342373847961426, "learning_rate": 4.0035393822263685e-06, "loss": 0.3994, "step": 545 }, { "epoch": 0.10439770554493308, "grad_norm": 2.794058084487915, "learning_rate": 4.0047041913302425e-06, "loss": 0.5341, "step": 546 }, { "epoch": 0.10458891013384321, "grad_norm": 2.975745439529419, "learning_rate": 4.005866869034748e-06, "loss": 0.3757, "step": 547 }, { "epoch": 0.10478011472275335, "grad_norm": 1.5141457319259644, "learning_rate": 4.0070274231258276e-06, "loss": 0.2261, "step": 548 }, { "epoch": 0.10497131931166348, "grad_norm": 1.4179555177688599, "learning_rate": 4.008185861346832e-06, "loss": 0.1121, "step": 549 }, { "epoch": 0.10516252390057361, "grad_norm": 1.3781744241714478, "learning_rate": 4.009342191398848e-06, "loss": 0.0929, "step": 550 }, { "epoch": 0.10535372848948375, "grad_norm": 3.2945396900177, "learning_rate": 4.010496420940983e-06, "loss": 0.5864, "step": 551 }, { "epoch": 0.10554493307839388, "grad_norm": 2.5260989665985107, "learning_rate": 4.0116485575906874e-06, "loss": 0.3214, "step": 552 }, { "epoch": 0.10573613766730401, "grad_norm": 2.2214319705963135, "learning_rate": 4.01279860892405e-06, "loss": 0.3037, "step": 553 }, { "epoch": 0.10592734225621415, "grad_norm": 2.315720796585083, "learning_rate": 4.013946582476095e-06, "loss": 0.3353, "step": 554 }, { "epoch": 0.10611854684512428, "grad_norm": 2.7136638164520264, "learning_rate": 4.015092485741078e-06, "loss": 0.2516, "step": 555 }, { "epoch": 0.10630975143403441, "grad_norm": 2.8678059577941895, "learning_rate": 4.0162363261727876e-06, "loss": 0.1223, "step": 556 }, { "epoch": 0.10650095602294456, "grad_norm": 3.997982978820801, "learning_rate": 4.017378111184824e-06, "loss": 0.2095, "step": 557 }, { "epoch": 0.10669216061185469, "grad_norm": 2.3306539058685303, "learning_rate": 4.018517848150896e-06, "loss": 0.2294, "step": 558 }, { "epoch": 0.10688336520076482, "grad_norm": 1.9543094635009766, "learning_rate": 4.019655544405104e-06, "loss": 0.1078, "step": 559 }, { "epoch": 0.10707456978967496, "grad_norm": 2.487516403198242, "learning_rate": 4.0207912072422265e-06, "loss": 0.2011, "step": 560 }, { "epoch": 0.10726577437858509, "grad_norm": 1.3109228610992432, "learning_rate": 4.0219248439179914e-06, "loss": 0.1198, "step": 561 }, { "epoch": 0.10745697896749522, "grad_norm": 1.7776157855987549, "learning_rate": 4.023056461649366e-06, "loss": 0.0965, "step": 562 }, { "epoch": 0.10764818355640535, "grad_norm": 2.4455111026763916, "learning_rate": 4.024186067614824e-06, "loss": 0.1997, "step": 563 }, { "epoch": 0.10783938814531549, "grad_norm": 2.7624261379241943, "learning_rate": 4.0253136689546225e-06, "loss": 0.6754, "step": 564 }, { "epoch": 0.10803059273422562, "grad_norm": 1.7886794805526733, "learning_rate": 4.026439272771073e-06, "loss": 0.292, "step": 565 }, { "epoch": 0.10822179732313575, "grad_norm": 1.396551489830017, "learning_rate": 4.027562886128808e-06, "loss": 0.095, "step": 566 }, { "epoch": 0.10841300191204589, "grad_norm": 2.8856143951416016, "learning_rate": 4.028684516055048e-06, "loss": 0.2296, "step": 567 }, { "epoch": 0.10860420650095602, "grad_norm": 2.7726597785949707, "learning_rate": 4.029804169539866e-06, "loss": 0.1276, "step": 568 }, { "epoch": 0.10879541108986615, "grad_norm": 2.4296844005584717, "learning_rate": 4.030921853536447e-06, "loss": 0.2504, "step": 569 }, { "epoch": 0.1089866156787763, "grad_norm": 2.3982651233673096, "learning_rate": 4.032037574961348e-06, "loss": 0.487, "step": 570 }, { "epoch": 0.10917782026768642, "grad_norm": 2.2958669662475586, "learning_rate": 4.033151340694757e-06, "loss": 0.352, "step": 571 }, { "epoch": 0.10936902485659655, "grad_norm": 2.792046546936035, "learning_rate": 4.0342631575807424e-06, "loss": 0.3026, "step": 572 }, { "epoch": 0.1095602294455067, "grad_norm": 2.0497055053710938, "learning_rate": 4.035373032427511e-06, "loss": 0.207, "step": 573 }, { "epoch": 0.10975143403441683, "grad_norm": 1.5417126417160034, "learning_rate": 4.036480972007654e-06, "loss": 0.0919, "step": 574 }, { "epoch": 0.10994263862332695, "grad_norm": 2.187443494796753, "learning_rate": 4.037586983058396e-06, "loss": 0.134, "step": 575 }, { "epoch": 0.1101338432122371, "grad_norm": 3.1193389892578125, "learning_rate": 4.038691072281846e-06, "loss": 0.5783, "step": 576 }, { "epoch": 0.11032504780114723, "grad_norm": 2.0621426105499268, "learning_rate": 4.0397932463452296e-06, "loss": 0.3393, "step": 577 }, { "epoch": 0.11051625239005736, "grad_norm": 3.7791013717651367, "learning_rate": 4.040893511881142e-06, "loss": 0.469, "step": 578 }, { "epoch": 0.1107074569789675, "grad_norm": 1.6285041570663452, "learning_rate": 4.0419918754877816e-06, "loss": 0.0855, "step": 579 }, { "epoch": 0.11089866156787763, "grad_norm": 2.735530138015747, "learning_rate": 4.043088343729189e-06, "loss": 0.2253, "step": 580 }, { "epoch": 0.11108986615678776, "grad_norm": 3.3225202560424805, "learning_rate": 4.0441829231354805e-06, "loss": 0.2688, "step": 581 }, { "epoch": 0.1112810707456979, "grad_norm": 2.049799680709839, "learning_rate": 4.0452756202030864e-06, "loss": 0.2561, "step": 582 }, { "epoch": 0.11147227533460803, "grad_norm": 2.2837252616882324, "learning_rate": 4.046366441394976e-06, "loss": 0.4603, "step": 583 }, { "epoch": 0.11166347992351816, "grad_norm": 1.8498018980026245, "learning_rate": 4.0474553931408905e-06, "loss": 0.3059, "step": 584 }, { "epoch": 0.1118546845124283, "grad_norm": 2.7567930221557617, "learning_rate": 4.0485424818375704e-06, "loss": 0.3465, "step": 585 }, { "epoch": 0.11204588910133843, "grad_norm": 2.339862585067749, "learning_rate": 4.04962771384898e-06, "loss": 0.1157, "step": 586 }, { "epoch": 0.11223709369024856, "grad_norm": 2.2230653762817383, "learning_rate": 4.050711095506532e-06, "loss": 0.1013, "step": 587 }, { "epoch": 0.1124282982791587, "grad_norm": 4.637607574462891, "learning_rate": 4.051792633109307e-06, "loss": 0.3786, "step": 588 }, { "epoch": 0.11261950286806884, "grad_norm": 2.5454115867614746, "learning_rate": 4.052872332924277e-06, "loss": 0.223, "step": 589 }, { "epoch": 0.11281070745697896, "grad_norm": 2.093069076538086, "learning_rate": 4.053950201186518e-06, "loss": 0.1641, "step": 590 }, { "epoch": 0.11300191204588911, "grad_norm": 2.5368142127990723, "learning_rate": 4.055026244099433e-06, "loss": 0.3767, "step": 591 }, { "epoch": 0.11319311663479924, "grad_norm": 1.601399302482605, "learning_rate": 4.056100467834957e-06, "loss": 0.1006, "step": 592 }, { "epoch": 0.11338432122370937, "grad_norm": 2.5372474193573, "learning_rate": 4.05717287853378e-06, "loss": 0.1315, "step": 593 }, { "epoch": 0.1135755258126195, "grad_norm": 2.6447410583496094, "learning_rate": 4.058243482305548e-06, "loss": 0.1529, "step": 594 }, { "epoch": 0.11376673040152964, "grad_norm": 3.427429676055908, "learning_rate": 4.059312285229079e-06, "loss": 0.6611, "step": 595 }, { "epoch": 0.11395793499043977, "grad_norm": 2.3087265491485596, "learning_rate": 4.060379293352566e-06, "loss": 0.226, "step": 596 }, { "epoch": 0.1141491395793499, "grad_norm": 2.3593530654907227, "learning_rate": 4.061444512693784e-06, "loss": 0.3704, "step": 597 }, { "epoch": 0.11434034416826004, "grad_norm": 2.03262996673584, "learning_rate": 4.062507949240291e-06, "loss": 0.1638, "step": 598 }, { "epoch": 0.11453154875717017, "grad_norm": 2.4928765296936035, "learning_rate": 4.063569608949637e-06, "loss": 0.1744, "step": 599 }, { "epoch": 0.1147227533460803, "grad_norm": 6.897516250610352, "learning_rate": 4.0646294977495545e-06, "loss": 0.2404, "step": 600 }, { "epoch": 0.11491395793499044, "grad_norm": 1.9926093816757202, "learning_rate": 4.065687621538164e-06, "loss": 0.2995, "step": 601 }, { "epoch": 0.11510516252390057, "grad_norm": 3.1814188957214355, "learning_rate": 4.066743986184169e-06, "loss": 0.4707, "step": 602 }, { "epoch": 0.1152963671128107, "grad_norm": 1.9630004167556763, "learning_rate": 4.067798597527053e-06, "loss": 0.2011, "step": 603 }, { "epoch": 0.11548757170172085, "grad_norm": 3.1169941425323486, "learning_rate": 4.068851461377267e-06, "loss": 0.1188, "step": 604 }, { "epoch": 0.11567877629063097, "grad_norm": 1.6685168743133545, "learning_rate": 4.069902583516429e-06, "loss": 0.0651, "step": 605 }, { "epoch": 0.1158699808795411, "grad_norm": 3.466280937194824, "learning_rate": 4.070951969697511e-06, "loss": 0.1449, "step": 606 }, { "epoch": 0.11606118546845125, "grad_norm": 2.8022851943969727, "learning_rate": 4.071999625645027e-06, "loss": 0.2323, "step": 607 }, { "epoch": 0.11625239005736138, "grad_norm": 2.7856459617614746, "learning_rate": 4.073045557055227e-06, "loss": 0.4914, "step": 608 }, { "epoch": 0.1164435946462715, "grad_norm": 2.7790656089782715, "learning_rate": 4.07408976959627e-06, "loss": 0.5342, "step": 609 }, { "epoch": 0.11663479923518165, "grad_norm": 2.095447063446045, "learning_rate": 4.075132268908421e-06, "loss": 0.2162, "step": 610 }, { "epoch": 0.11682600382409178, "grad_norm": 1.7271842956542969, "learning_rate": 4.076173060604226e-06, "loss": 0.1058, "step": 611 }, { "epoch": 0.11701720841300191, "grad_norm": 3.3794445991516113, "learning_rate": 4.077212150268698e-06, "loss": 0.1215, "step": 612 }, { "epoch": 0.11720841300191205, "grad_norm": 5.20923376083374, "learning_rate": 4.078249543459495e-06, "loss": 0.2695, "step": 613 }, { "epoch": 0.11739961759082218, "grad_norm": 3.670212984085083, "learning_rate": 4.079285245707096e-06, "loss": 0.9118, "step": 614 }, { "epoch": 0.11759082217973231, "grad_norm": 4.551514148712158, "learning_rate": 4.080319262514981e-06, "loss": 0.407, "step": 615 }, { "epoch": 0.11778202676864245, "grad_norm": 2.7722604274749756, "learning_rate": 4.081351599359807e-06, "loss": 0.3355, "step": 616 }, { "epoch": 0.11797323135755258, "grad_norm": 2.6713385581970215, "learning_rate": 4.0823822616915795e-06, "loss": 0.1726, "step": 617 }, { "epoch": 0.11816443594646271, "grad_norm": 2.3525755405426025, "learning_rate": 4.083411254933824e-06, "loss": 0.1748, "step": 618 }, { "epoch": 0.11835564053537286, "grad_norm": 2.2042713165283203, "learning_rate": 4.084438584483764e-06, "loss": 0.2662, "step": 619 }, { "epoch": 0.11854684512428298, "grad_norm": 2.0608386993408203, "learning_rate": 4.085464255712483e-06, "loss": 0.2041, "step": 620 }, { "epoch": 0.11873804971319311, "grad_norm": 2.0503997802734375, "learning_rate": 4.0864882739650965e-06, "loss": 0.2242, "step": 621 }, { "epoch": 0.11892925430210326, "grad_norm": 1.8372803926467896, "learning_rate": 4.087510644560921e-06, "loss": 0.1795, "step": 622 }, { "epoch": 0.11912045889101339, "grad_norm": 1.664029598236084, "learning_rate": 4.088531372793636e-06, "loss": 0.1379, "step": 623 }, { "epoch": 0.11931166347992352, "grad_norm": 1.8293601274490356, "learning_rate": 4.089550463931449e-06, "loss": 0.1597, "step": 624 }, { "epoch": 0.11950286806883365, "grad_norm": 3.36576771736145, "learning_rate": 4.090567923217263e-06, "loss": 0.1534, "step": 625 }, { "epoch": 0.11969407265774379, "grad_norm": 2.3628504276275635, "learning_rate": 4.091583755868833e-06, "loss": 0.348, "step": 626 }, { "epoch": 0.11988527724665392, "grad_norm": 2.525311231613159, "learning_rate": 4.0925979670789294e-06, "loss": 0.4548, "step": 627 }, { "epoch": 0.12007648183556405, "grad_norm": 2.786006450653076, "learning_rate": 4.093610562015496e-06, "loss": 0.3034, "step": 628 }, { "epoch": 0.12026768642447419, "grad_norm": 2.4381093978881836, "learning_rate": 4.09462154582181e-06, "loss": 0.3277, "step": 629 }, { "epoch": 0.12045889101338432, "grad_norm": 1.9374213218688965, "learning_rate": 4.095630923616636e-06, "loss": 0.1188, "step": 630 }, { "epoch": 0.12065009560229445, "grad_norm": 3.029554843902588, "learning_rate": 4.096638700494381e-06, "loss": 0.1719, "step": 631 }, { "epoch": 0.12084130019120459, "grad_norm": 2.0337328910827637, "learning_rate": 4.097644881525257e-06, "loss": 0.3093, "step": 632 }, { "epoch": 0.12103250478011472, "grad_norm": 2.4461922645568848, "learning_rate": 4.098649471755419e-06, "loss": 0.3777, "step": 633 }, { "epoch": 0.12122370936902485, "grad_norm": 2.149134635925293, "learning_rate": 4.099652476207133e-06, "loss": 0.1888, "step": 634 }, { "epoch": 0.121414913957935, "grad_norm": 3.174314022064209, "learning_rate": 4.1006538998789146e-06, "loss": 0.4976, "step": 635 }, { "epoch": 0.12160611854684512, "grad_norm": 2.157924175262451, "learning_rate": 4.101653747745683e-06, "loss": 0.2177, "step": 636 }, { "epoch": 0.12179732313575525, "grad_norm": 9.099021911621094, "learning_rate": 4.102652024758911e-06, "loss": 0.3181, "step": 637 }, { "epoch": 0.1219885277246654, "grad_norm": 2.279358386993408, "learning_rate": 4.10364873584677e-06, "loss": 0.1611, "step": 638 }, { "epoch": 0.12217973231357553, "grad_norm": 6.471635818481445, "learning_rate": 4.104643885914275e-06, "loss": 0.2495, "step": 639 }, { "epoch": 0.12237093690248566, "grad_norm": 1.9951330423355103, "learning_rate": 4.1056374798434325e-06, "loss": 0.1462, "step": 640 }, { "epoch": 0.1225621414913958, "grad_norm": 2.0457870960235596, "learning_rate": 4.106629522493381e-06, "loss": 0.2578, "step": 641 }, { "epoch": 0.12275334608030593, "grad_norm": 2.4925954341888428, "learning_rate": 4.107620018700538e-06, "loss": 0.2238, "step": 642 }, { "epoch": 0.12294455066921606, "grad_norm": 3.2228965759277344, "learning_rate": 4.1086089732787325e-06, "loss": 0.1979, "step": 643 }, { "epoch": 0.1231357552581262, "grad_norm": 1.5814366340637207, "learning_rate": 4.109596391019356e-06, "loss": 0.097, "step": 644 }, { "epoch": 0.12332695984703633, "grad_norm": 3.6417267322540283, "learning_rate": 4.110582276691498e-06, "loss": 0.354, "step": 645 }, { "epoch": 0.12351816443594646, "grad_norm": 2.278165102005005, "learning_rate": 4.11156663504208e-06, "loss": 0.1463, "step": 646 }, { "epoch": 0.1237093690248566, "grad_norm": 1.5671286582946777, "learning_rate": 4.112549470795998e-06, "loss": 0.1445, "step": 647 }, { "epoch": 0.12390057361376673, "grad_norm": 3.1425745487213135, "learning_rate": 4.113530788656255e-06, "loss": 0.2828, "step": 648 }, { "epoch": 0.12409177820267686, "grad_norm": 3.6428639888763428, "learning_rate": 4.1145105933041e-06, "loss": 0.1176, "step": 649 }, { "epoch": 0.124282982791587, "grad_norm": 1.9820659160614014, "learning_rate": 4.115488889399157e-06, "loss": 0.1025, "step": 650 }, { "epoch": 0.12447418738049713, "grad_norm": 2.4328808784484863, "learning_rate": 4.116465681579564e-06, "loss": 0.3629, "step": 651 }, { "epoch": 0.12466539196940726, "grad_norm": 2.9326987266540527, "learning_rate": 4.117440974462098e-06, "loss": 0.1771, "step": 652 }, { "epoch": 0.1248565965583174, "grad_norm": 2.000542640686035, "learning_rate": 4.11841477264231e-06, "loss": 0.1175, "step": 653 }, { "epoch": 0.12504780114722752, "grad_norm": 2.4583964347839355, "learning_rate": 4.119387080694656e-06, "loss": 0.2045, "step": 654 }, { "epoch": 0.12523900573613767, "grad_norm": 2.519341230392456, "learning_rate": 4.1203579031726235e-06, "loss": 0.2256, "step": 655 }, { "epoch": 0.1254302103250478, "grad_norm": 1.732568621635437, "learning_rate": 4.12132724460886e-06, "loss": 0.0891, "step": 656 }, { "epoch": 0.12562141491395792, "grad_norm": 3.676959276199341, "learning_rate": 4.122295109515299e-06, "loss": 0.5975, "step": 657 }, { "epoch": 0.12581261950286807, "grad_norm": 3.2852377891540527, "learning_rate": 4.123261502383291e-06, "loss": 0.6327, "step": 658 }, { "epoch": 0.1260038240917782, "grad_norm": 2.5524721145629883, "learning_rate": 4.12422642768372e-06, "loss": 0.2364, "step": 659 }, { "epoch": 0.12619502868068833, "grad_norm": 2.0922858715057373, "learning_rate": 4.125189889867135e-06, "loss": 0.147, "step": 660 }, { "epoch": 0.12638623326959847, "grad_norm": 2.2585864067077637, "learning_rate": 4.126151893363871e-06, "loss": 0.1199, "step": 661 }, { "epoch": 0.1265774378585086, "grad_norm": 1.344876766204834, "learning_rate": 4.12711244258417e-06, "loss": 0.0836, "step": 662 }, { "epoch": 0.12676864244741873, "grad_norm": 4.379039764404297, "learning_rate": 4.128071541918302e-06, "loss": 0.2658, "step": 663 }, { "epoch": 0.12695984703632887, "grad_norm": 2.2609031200408936, "learning_rate": 4.129029195736687e-06, "loss": 0.3937, "step": 664 }, { "epoch": 0.12715105162523901, "grad_norm": 2.849423408508301, "learning_rate": 4.129985408390017e-06, "loss": 0.1937, "step": 665 }, { "epoch": 0.12734225621414913, "grad_norm": 3.229069471359253, "learning_rate": 4.130940184209367e-06, "loss": 0.4889, "step": 666 }, { "epoch": 0.12753346080305927, "grad_norm": 3.168386459350586, "learning_rate": 4.131893527506318e-06, "loss": 0.3612, "step": 667 }, { "epoch": 0.12772466539196942, "grad_norm": 1.0114667415618896, "learning_rate": 4.132845442573078e-06, "loss": 0.0365, "step": 668 }, { "epoch": 0.12791586998087953, "grad_norm": 1.751471757888794, "learning_rate": 4.133795933682587e-06, "loss": 0.097, "step": 669 }, { "epoch": 0.12810707456978968, "grad_norm": 7.675336837768555, "learning_rate": 4.13474500508864e-06, "loss": 0.6252, "step": 670 }, { "epoch": 0.12829827915869982, "grad_norm": 2.195767402648926, "learning_rate": 4.135692661026002e-06, "loss": 0.2651, "step": 671 }, { "epoch": 0.12848948374760993, "grad_norm": 2.2194595336914062, "learning_rate": 4.136638905710514e-06, "loss": 0.1452, "step": 672 }, { "epoch": 0.12868068833652008, "grad_norm": 2.8059849739074707, "learning_rate": 4.137583743339215e-06, "loss": 0.1959, "step": 673 }, { "epoch": 0.12887189292543022, "grad_norm": 2.4581644535064697, "learning_rate": 4.138527178090445e-06, "loss": 0.1656, "step": 674 }, { "epoch": 0.12906309751434034, "grad_norm": 2.713122606277466, "learning_rate": 4.1394692141239635e-06, "loss": 0.172, "step": 675 }, { "epoch": 0.12925430210325048, "grad_norm": 3.6402595043182373, "learning_rate": 4.140409855581052e-06, "loss": 0.5809, "step": 676 }, { "epoch": 0.12944550669216062, "grad_norm": 3.1227595806121826, "learning_rate": 4.14134910658463e-06, "loss": 0.3983, "step": 677 }, { "epoch": 0.12963671128107074, "grad_norm": 2.864481210708618, "learning_rate": 4.14228697123936e-06, "loss": 0.3984, "step": 678 }, { "epoch": 0.12982791586998088, "grad_norm": 1.6978756189346313, "learning_rate": 4.143223453631755e-06, "loss": 0.074, "step": 679 }, { "epoch": 0.13001912045889102, "grad_norm": 1.8725084066390991, "learning_rate": 4.144158557830285e-06, "loss": 0.1087, "step": 680 }, { "epoch": 0.13021032504780114, "grad_norm": 3.0994725227355957, "learning_rate": 4.1450922878854865e-06, "loss": 0.3181, "step": 681 }, { "epoch": 0.13040152963671128, "grad_norm": 2.7692768573760986, "learning_rate": 4.146024647830064e-06, "loss": 0.3154, "step": 682 }, { "epoch": 0.13059273422562143, "grad_norm": 2.107736825942993, "learning_rate": 4.146955641678995e-06, "loss": 0.3758, "step": 683 }, { "epoch": 0.13078393881453154, "grad_norm": 1.6708070039749146, "learning_rate": 4.147885273429636e-06, "loss": 0.1446, "step": 684 }, { "epoch": 0.13097514340344169, "grad_norm": 2.343808174133301, "learning_rate": 4.148813547061823e-06, "loss": 0.2898, "step": 685 }, { "epoch": 0.13116634799235183, "grad_norm": 3.282604932785034, "learning_rate": 4.1497404665379755e-06, "loss": 0.3803, "step": 686 }, { "epoch": 0.13135755258126194, "grad_norm": 2.8634464740753174, "learning_rate": 4.150666035803198e-06, "loss": 0.1605, "step": 687 }, { "epoch": 0.1315487571701721, "grad_norm": 2.119781255722046, "learning_rate": 4.151590258785376e-06, "loss": 0.1974, "step": 688 }, { "epoch": 0.13173996175908223, "grad_norm": 2.3915913105010986, "learning_rate": 4.1525131393952865e-06, "loss": 0.2118, "step": 689 }, { "epoch": 0.13193116634799235, "grad_norm": 2.3511242866516113, "learning_rate": 4.153434681526684e-06, "loss": 0.3729, "step": 690 }, { "epoch": 0.1321223709369025, "grad_norm": 2.7544589042663574, "learning_rate": 4.154354889056412e-06, "loss": 0.3998, "step": 691 }, { "epoch": 0.13231357552581263, "grad_norm": 2.3922064304351807, "learning_rate": 4.155273765844489e-06, "loss": 0.1746, "step": 692 }, { "epoch": 0.13250478011472275, "grad_norm": 2.2704615592956543, "learning_rate": 4.156191315734217e-06, "loss": 0.1901, "step": 693 }, { "epoch": 0.1326959847036329, "grad_norm": 1.6022690534591675, "learning_rate": 4.157107542552267e-06, "loss": 0.1309, "step": 694 }, { "epoch": 0.13288718929254303, "grad_norm": 2.8731253147125244, "learning_rate": 4.158022450108784e-06, "loss": 0.3855, "step": 695 }, { "epoch": 0.13307839388145315, "grad_norm": 2.325465679168701, "learning_rate": 4.158936042197477e-06, "loss": 0.1592, "step": 696 }, { "epoch": 0.1332695984703633, "grad_norm": 3.571281909942627, "learning_rate": 4.1598483225957125e-06, "loss": 0.5375, "step": 697 }, { "epoch": 0.13346080305927344, "grad_norm": 2.267007350921631, "learning_rate": 4.1607592950646145e-06, "loss": 0.1196, "step": 698 }, { "epoch": 0.13365200764818355, "grad_norm": 2.3478941917419434, "learning_rate": 4.161668963349148e-06, "loss": 0.1641, "step": 699 }, { "epoch": 0.1338432122370937, "grad_norm": 2.05322265625, "learning_rate": 4.162577331178222e-06, "loss": 0.131, "step": 700 }, { "epoch": 0.1340344168260038, "grad_norm": 4.005086898803711, "learning_rate": 4.163484402264773e-06, "loss": 0.4016, "step": 701 }, { "epoch": 0.13422562141491395, "grad_norm": 3.990110397338867, "learning_rate": 4.1643901803058575e-06, "loss": 0.4845, "step": 702 }, { "epoch": 0.1344168260038241, "grad_norm": 2.7093546390533447, "learning_rate": 4.165294668982747e-06, "loss": 0.3717, "step": 703 }, { "epoch": 0.1346080305927342, "grad_norm": 2.439100980758667, "learning_rate": 4.1661978719610135e-06, "loss": 0.3189, "step": 704 }, { "epoch": 0.13479923518164436, "grad_norm": 2.0016961097717285, "learning_rate": 4.167099792890619e-06, "loss": 0.1995, "step": 705 }, { "epoch": 0.1349904397705545, "grad_norm": 2.2571163177490234, "learning_rate": 4.168000435406005e-06, "loss": 0.1032, "step": 706 }, { "epoch": 0.13518164435946461, "grad_norm": 2.6907873153686523, "learning_rate": 4.168899803126179e-06, "loss": 0.3268, "step": 707 }, { "epoch": 0.13537284894837476, "grad_norm": 2.1795923709869385, "learning_rate": 4.169797899654807e-06, "loss": 0.2979, "step": 708 }, { "epoch": 0.1355640535372849, "grad_norm": 1.801849126815796, "learning_rate": 4.1706947285802905e-06, "loss": 0.2202, "step": 709 }, { "epoch": 0.13575525812619502, "grad_norm": 3.593956232070923, "learning_rate": 4.1715902934758625e-06, "loss": 0.4659, "step": 710 }, { "epoch": 0.13594646271510516, "grad_norm": 2.4480674266815186, "learning_rate": 4.172484597899666e-06, "loss": 0.1247, "step": 711 }, { "epoch": 0.1361376673040153, "grad_norm": 2.3875763416290283, "learning_rate": 4.1733776453948425e-06, "loss": 0.064, "step": 712 }, { "epoch": 0.13632887189292542, "grad_norm": 2.754842758178711, "learning_rate": 4.174269439489614e-06, "loss": 0.1734, "step": 713 }, { "epoch": 0.13652007648183556, "grad_norm": 3.3177061080932617, "learning_rate": 4.175159983697367e-06, "loss": 0.452, "step": 714 }, { "epoch": 0.1367112810707457, "grad_norm": 1.6432844400405884, "learning_rate": 4.176049281516738e-06, "loss": 0.1835, "step": 715 }, { "epoch": 0.13690248565965582, "grad_norm": 3.1820502281188965, "learning_rate": 4.176937336431696e-06, "loss": 0.3835, "step": 716 }, { "epoch": 0.13709369024856596, "grad_norm": 2.664802312850952, "learning_rate": 4.177824151911616e-06, "loss": 0.1186, "step": 717 }, { "epoch": 0.1372848948374761, "grad_norm": 2.4787399768829346, "learning_rate": 4.178709731411373e-06, "loss": 0.3117, "step": 718 }, { "epoch": 0.13747609942638622, "grad_norm": 2.4012043476104736, "learning_rate": 4.179594078371414e-06, "loss": 0.0682, "step": 719 }, { "epoch": 0.13766730401529637, "grad_norm": 2.465858221054077, "learning_rate": 4.180477196217842e-06, "loss": 0.335, "step": 720 }, { "epoch": 0.1378585086042065, "grad_norm": 2.301172971725464, "learning_rate": 4.181359088362493e-06, "loss": 0.3335, "step": 721 }, { "epoch": 0.13804971319311662, "grad_norm": 1.929276466369629, "learning_rate": 4.182239758203017e-06, "loss": 0.0915, "step": 722 }, { "epoch": 0.13824091778202677, "grad_norm": 2.7580771446228027, "learning_rate": 4.183119209122958e-06, "loss": 0.3561, "step": 723 }, { "epoch": 0.1384321223709369, "grad_norm": 1.6526697874069214, "learning_rate": 4.183997444491827e-06, "loss": 0.2137, "step": 724 }, { "epoch": 0.13862332695984703, "grad_norm": 2.3815810680389404, "learning_rate": 4.184874467665185e-06, "loss": 0.1152, "step": 725 }, { "epoch": 0.13881453154875717, "grad_norm": 2.2507095336914062, "learning_rate": 4.185750281984717e-06, "loss": 0.3442, "step": 726 }, { "epoch": 0.1390057361376673, "grad_norm": 3.3361330032348633, "learning_rate": 4.1866248907783065e-06, "loss": 0.2196, "step": 727 }, { "epoch": 0.13919694072657743, "grad_norm": 2.5841755867004395, "learning_rate": 4.187498297360117e-06, "loss": 0.2489, "step": 728 }, { "epoch": 0.13938814531548757, "grad_norm": 2.5357816219329834, "learning_rate": 4.188370505030664e-06, "loss": 0.1614, "step": 729 }, { "epoch": 0.13957934990439771, "grad_norm": 2.3405823707580566, "learning_rate": 4.189241517076887e-06, "loss": 0.1311, "step": 730 }, { "epoch": 0.13977055449330783, "grad_norm": 2.4516336917877197, "learning_rate": 4.190111336772229e-06, "loss": 0.1509, "step": 731 }, { "epoch": 0.13996175908221797, "grad_norm": 3.084892511367798, "learning_rate": 4.190979967376708e-06, "loss": 0.2953, "step": 732 }, { "epoch": 0.14015296367112812, "grad_norm": 2.2274246215820312, "learning_rate": 4.19184741213699e-06, "loss": 0.4075, "step": 733 }, { "epoch": 0.14034416826003823, "grad_norm": 3.0207102298736572, "learning_rate": 4.192713674286461e-06, "loss": 0.2865, "step": 734 }, { "epoch": 0.14053537284894838, "grad_norm": 1.836266040802002, "learning_rate": 4.193578757045304e-06, "loss": 0.1552, "step": 735 }, { "epoch": 0.14072657743785852, "grad_norm": 3.116115093231201, "learning_rate": 4.194442663620563e-06, "loss": 0.2001, "step": 736 }, { "epoch": 0.14091778202676863, "grad_norm": 3.3889942169189453, "learning_rate": 4.1953053972062215e-06, "loss": 0.1891, "step": 737 }, { "epoch": 0.14110898661567878, "grad_norm": 2.072631359100342, "learning_rate": 4.196166960983269e-06, "loss": 0.1645, "step": 738 }, { "epoch": 0.14130019120458892, "grad_norm": 3.678781032562256, "learning_rate": 4.197027358119775e-06, "loss": 0.5332, "step": 739 }, { "epoch": 0.14149139579349904, "grad_norm": 2.225128650665283, "learning_rate": 4.1978865917709535e-06, "loss": 0.1687, "step": 740 }, { "epoch": 0.14168260038240918, "grad_norm": 5.0827131271362305, "learning_rate": 4.198744665079239e-06, "loss": 0.4454, "step": 741 }, { "epoch": 0.14187380497131932, "grad_norm": 3.1659598350524902, "learning_rate": 4.199601581174351e-06, "loss": 0.4498, "step": 742 }, { "epoch": 0.14206500956022944, "grad_norm": 2.8837954998016357, "learning_rate": 4.200457343173363e-06, "loss": 0.1854, "step": 743 }, { "epoch": 0.14225621414913958, "grad_norm": 2.6731128692626953, "learning_rate": 4.20131195418077e-06, "loss": 0.2493, "step": 744 }, { "epoch": 0.14244741873804972, "grad_norm": 2.6178650856018066, "learning_rate": 4.2021654172885625e-06, "loss": 0.4237, "step": 745 }, { "epoch": 0.14263862332695984, "grad_norm": 2.731067657470703, "learning_rate": 4.203017735576281e-06, "loss": 0.3222, "step": 746 }, { "epoch": 0.14282982791586998, "grad_norm": 2.4263575077056885, "learning_rate": 4.203868912111097e-06, "loss": 0.3628, "step": 747 }, { "epoch": 0.14302103250478013, "grad_norm": 1.9994317293167114, "learning_rate": 4.204718949947867e-06, "loss": 0.1756, "step": 748 }, { "epoch": 0.14321223709369024, "grad_norm": 3.0634548664093018, "learning_rate": 4.205567852129206e-06, "loss": 0.2029, "step": 749 }, { "epoch": 0.14340344168260039, "grad_norm": 2.2715470790863037, "learning_rate": 4.20641562168555e-06, "loss": 0.2621, "step": 750 }, { "epoch": 0.14359464627151053, "grad_norm": 4.5768842697143555, "learning_rate": 4.207262261635224e-06, "loss": 0.3184, "step": 751 }, { "epoch": 0.14378585086042064, "grad_norm": 2.8422701358795166, "learning_rate": 4.208107774984498e-06, "loss": 0.3711, "step": 752 }, { "epoch": 0.1439770554493308, "grad_norm": 1.645248293876648, "learning_rate": 4.20895216472766e-06, "loss": 0.1714, "step": 753 }, { "epoch": 0.14416826003824093, "grad_norm": 2.0097696781158447, "learning_rate": 4.20979543384708e-06, "loss": 0.2713, "step": 754 }, { "epoch": 0.14435946462715105, "grad_norm": 1.5780304670333862, "learning_rate": 4.210637585313263e-06, "loss": 0.1144, "step": 755 }, { "epoch": 0.1445506692160612, "grad_norm": 2.3878743648529053, "learning_rate": 4.2114786220849235e-06, "loss": 0.1314, "step": 756 }, { "epoch": 0.14474187380497133, "grad_norm": 2.5952939987182617, "learning_rate": 4.212318547109039e-06, "loss": 0.3168, "step": 757 }, { "epoch": 0.14493307839388145, "grad_norm": 1.6484967470169067, "learning_rate": 4.213157363320921e-06, "loss": 0.195, "step": 758 }, { "epoch": 0.1451242829827916, "grad_norm": 1.6973950862884521, "learning_rate": 4.213995073644266e-06, "loss": 0.0901, "step": 759 }, { "epoch": 0.14531548757170173, "grad_norm": 2.196223258972168, "learning_rate": 4.214831680991224e-06, "loss": 0.1255, "step": 760 }, { "epoch": 0.14550669216061185, "grad_norm": 1.6075400114059448, "learning_rate": 4.2156671882624575e-06, "loss": 0.2015, "step": 761 }, { "epoch": 0.145697896749522, "grad_norm": 1.5651001930236816, "learning_rate": 4.2165015983472025e-06, "loss": 0.0658, "step": 762 }, { "epoch": 0.1458891013384321, "grad_norm": 2.242047071456909, "learning_rate": 4.217334914123325e-06, "loss": 0.2173, "step": 763 }, { "epoch": 0.14608030592734225, "grad_norm": 4.271725654602051, "learning_rate": 4.218167138457386e-06, "loss": 0.3126, "step": 764 }, { "epoch": 0.1462715105162524, "grad_norm": 1.832981824874878, "learning_rate": 4.218998274204695e-06, "loss": 0.1925, "step": 765 }, { "epoch": 0.1464627151051625, "grad_norm": 2.5712363719940186, "learning_rate": 4.219828324209373e-06, "loss": 0.2876, "step": 766 }, { "epoch": 0.14665391969407265, "grad_norm": 1.7425990104675293, "learning_rate": 4.2206572913044095e-06, "loss": 0.1229, "step": 767 }, { "epoch": 0.1468451242829828, "grad_norm": 2.2643516063690186, "learning_rate": 4.22148517831172e-06, "loss": 0.1774, "step": 768 }, { "epoch": 0.1470363288718929, "grad_norm": 2.998650312423706, "learning_rate": 4.222311988042205e-06, "loss": 0.1941, "step": 769 }, { "epoch": 0.14722753346080306, "grad_norm": 3.894683361053467, "learning_rate": 4.223137723295803e-06, "loss": 0.6313, "step": 770 }, { "epoch": 0.1474187380497132, "grad_norm": 1.2565118074417114, "learning_rate": 4.2239623868615564e-06, "loss": 0.2454, "step": 771 }, { "epoch": 0.14760994263862331, "grad_norm": 3.5236380100250244, "learning_rate": 4.224785981517657e-06, "loss": 0.3753, "step": 772 }, { "epoch": 0.14780114722753346, "grad_norm": 2.124140501022339, "learning_rate": 4.225608510031509e-06, "loss": 0.241, "step": 773 }, { "epoch": 0.1479923518164436, "grad_norm": 1.7390496730804443, "learning_rate": 4.226429975159786e-06, "loss": 0.0968, "step": 774 }, { "epoch": 0.14818355640535372, "grad_norm": 3.544078826904297, "learning_rate": 4.22725037964848e-06, "loss": 0.1911, "step": 775 }, { "epoch": 0.14837476099426386, "grad_norm": 4.519067287445068, "learning_rate": 4.228069726232962e-06, "loss": 0.8982, "step": 776 }, { "epoch": 0.148565965583174, "grad_norm": 2.7855448722839355, "learning_rate": 4.228888017638035e-06, "loss": 0.3423, "step": 777 }, { "epoch": 0.14875717017208412, "grad_norm": 3.33547043800354, "learning_rate": 4.229705256577988e-06, "loss": 0.2157, "step": 778 }, { "epoch": 0.14894837476099426, "grad_norm": 2.3527631759643555, "learning_rate": 4.2305214457566505e-06, "loss": 0.1733, "step": 779 }, { "epoch": 0.1491395793499044, "grad_norm": 3.6699483394622803, "learning_rate": 4.231336587867446e-06, "loss": 0.2432, "step": 780 }, { "epoch": 0.14933078393881452, "grad_norm": 1.638396143913269, "learning_rate": 4.232150685593444e-06, "loss": 0.0568, "step": 781 }, { "epoch": 0.14952198852772466, "grad_norm": 3.646986961364746, "learning_rate": 4.232963741607416e-06, "loss": 0.4378, "step": 782 }, { "epoch": 0.1497131931166348, "grad_norm": 2.220862865447998, "learning_rate": 4.233775758571886e-06, "loss": 0.2993, "step": 783 }, { "epoch": 0.14990439770554492, "grad_norm": 3.7839043140411377, "learning_rate": 4.234586739139182e-06, "loss": 0.47, "step": 784 }, { "epoch": 0.15009560229445507, "grad_norm": 1.9454379081726074, "learning_rate": 4.235396685951493e-06, "loss": 0.1672, "step": 785 }, { "epoch": 0.1502868068833652, "grad_norm": 2.693408727645874, "learning_rate": 4.236205601640911e-06, "loss": 0.3557, "step": 786 }, { "epoch": 0.15047801147227532, "grad_norm": 2.1505634784698486, "learning_rate": 4.237013488829494e-06, "loss": 0.0909, "step": 787 }, { "epoch": 0.15066921606118547, "grad_norm": 1.8870456218719482, "learning_rate": 4.237820350129308e-06, "loss": 0.1915, "step": 788 }, { "epoch": 0.1508604206500956, "grad_norm": 3.362215518951416, "learning_rate": 4.23862618814248e-06, "loss": 0.549, "step": 789 }, { "epoch": 0.15105162523900573, "grad_norm": 3.406558036804199, "learning_rate": 4.239431005461254e-06, "loss": 0.3686, "step": 790 }, { "epoch": 0.15124282982791587, "grad_norm": 2.572495460510254, "learning_rate": 4.240234804668029e-06, "loss": 0.2132, "step": 791 }, { "epoch": 0.151434034416826, "grad_norm": 3.3531334400177, "learning_rate": 4.2410375883354235e-06, "loss": 0.2027, "step": 792 }, { "epoch": 0.15162523900573613, "grad_norm": 1.9929064512252808, "learning_rate": 4.241839359026311e-06, "loss": 0.0925, "step": 793 }, { "epoch": 0.15181644359464627, "grad_norm": 4.2206010818481445, "learning_rate": 4.242640119293882e-06, "loss": 0.3048, "step": 794 }, { "epoch": 0.15200764818355642, "grad_norm": 3.0227558612823486, "learning_rate": 4.24343987168168e-06, "loss": 0.372, "step": 795 }, { "epoch": 0.15219885277246653, "grad_norm": 2.1193037033081055, "learning_rate": 4.2442386187236584e-06, "loss": 0.2249, "step": 796 }, { "epoch": 0.15239005736137667, "grad_norm": 2.863941192626953, "learning_rate": 4.2450363629442295e-06, "loss": 0.3385, "step": 797 }, { "epoch": 0.15258126195028682, "grad_norm": 1.4282262325286865, "learning_rate": 4.245833106858305e-06, "loss": 0.1862, "step": 798 }, { "epoch": 0.15277246653919693, "grad_norm": 3.935479164123535, "learning_rate": 4.2466288529713505e-06, "loss": 0.2419, "step": 799 }, { "epoch": 0.15296367112810708, "grad_norm": 1.3507992029190063, "learning_rate": 4.247423603779429e-06, "loss": 0.1504, "step": 800 }, { "epoch": 0.15315487571701722, "grad_norm": 3.4311065673828125, "learning_rate": 4.248217361769252e-06, "loss": 0.4921, "step": 801 }, { "epoch": 0.15334608030592733, "grad_norm": 1.9401636123657227, "learning_rate": 4.2490101294182175e-06, "loss": 0.3157, "step": 802 }, { "epoch": 0.15353728489483748, "grad_norm": 2.6393685340881348, "learning_rate": 4.249801909194468e-06, "loss": 0.3513, "step": 803 }, { "epoch": 0.15372848948374762, "grad_norm": 2.4486782550811768, "learning_rate": 4.250592703556928e-06, "loss": 0.1468, "step": 804 }, { "epoch": 0.15391969407265774, "grad_norm": 1.5397764444351196, "learning_rate": 4.251382514955353e-06, "loss": 0.1181, "step": 805 }, { "epoch": 0.15411089866156788, "grad_norm": 2.7118899822235107, "learning_rate": 4.252171345830375e-06, "loss": 0.1423, "step": 806 }, { "epoch": 0.15430210325047802, "grad_norm": 2.3002309799194336, "learning_rate": 4.252959198613545e-06, "loss": 0.2643, "step": 807 }, { "epoch": 0.15449330783938814, "grad_norm": 4.120339393615723, "learning_rate": 4.253746075727386e-06, "loss": 0.3035, "step": 808 }, { "epoch": 0.15468451242829828, "grad_norm": 1.6706206798553467, "learning_rate": 4.254531979585426e-06, "loss": 0.1403, "step": 809 }, { "epoch": 0.15487571701720843, "grad_norm": 1.846777319908142, "learning_rate": 4.2553169125922515e-06, "loss": 0.1069, "step": 810 }, { "epoch": 0.15506692160611854, "grad_norm": 2.311530590057373, "learning_rate": 4.256100877143548e-06, "loss": 0.1826, "step": 811 }, { "epoch": 0.15525812619502868, "grad_norm": 1.510845422744751, "learning_rate": 4.256883875626145e-06, "loss": 0.0622, "step": 812 }, { "epoch": 0.15544933078393883, "grad_norm": 1.923916220664978, "learning_rate": 4.2576659104180575e-06, "loss": 0.2099, "step": 813 }, { "epoch": 0.15564053537284894, "grad_norm": 3.773207902908325, "learning_rate": 4.258446983888535e-06, "loss": 0.5412, "step": 814 }, { "epoch": 0.15583173996175909, "grad_norm": 2.4419755935668945, "learning_rate": 4.259227098398094e-06, "loss": 0.4255, "step": 815 }, { "epoch": 0.15602294455066923, "grad_norm": 4.022161960601807, "learning_rate": 4.260006256298574e-06, "loss": 0.4799, "step": 816 }, { "epoch": 0.15621414913957934, "grad_norm": 1.609773874282837, "learning_rate": 4.260784459933167e-06, "loss": 0.1932, "step": 817 }, { "epoch": 0.1564053537284895, "grad_norm": 1.0987634658813477, "learning_rate": 4.261561711636471e-06, "loss": 0.051, "step": 818 }, { "epoch": 0.15659655831739963, "grad_norm": 2.9759936332702637, "learning_rate": 4.262338013734527e-06, "loss": 0.178, "step": 819 }, { "epoch": 0.15678776290630975, "grad_norm": 1.6191765069961548, "learning_rate": 4.263113368544856e-06, "loss": 0.1116, "step": 820 }, { "epoch": 0.1569789674952199, "grad_norm": 2.261420965194702, "learning_rate": 4.2638877783765115e-06, "loss": 0.2843, "step": 821 }, { "epoch": 0.15717017208413, "grad_norm": 1.752107858657837, "learning_rate": 4.26466124553011e-06, "loss": 0.2237, "step": 822 }, { "epoch": 0.15736137667304015, "grad_norm": 2.615859270095825, "learning_rate": 4.265433772297882e-06, "loss": 0.1974, "step": 823 }, { "epoch": 0.1575525812619503, "grad_norm": 1.7172234058380127, "learning_rate": 4.2662053609637e-06, "loss": 0.1186, "step": 824 }, { "epoch": 0.1577437858508604, "grad_norm": 1.4844310283660889, "learning_rate": 4.266976013803132e-06, "loss": 0.0642, "step": 825 }, { "epoch": 0.15793499043977055, "grad_norm": 2.956435441970825, "learning_rate": 4.267745733083475e-06, "loss": 0.2152, "step": 826 }, { "epoch": 0.1581261950286807, "grad_norm": 3.139362335205078, "learning_rate": 4.268514521063796e-06, "loss": 0.3454, "step": 827 }, { "epoch": 0.1583173996175908, "grad_norm": 3.0984015464782715, "learning_rate": 4.269282379994972e-06, "loss": 0.3052, "step": 828 }, { "epoch": 0.15850860420650095, "grad_norm": 2.265077829360962, "learning_rate": 4.27004931211973e-06, "loss": 0.1137, "step": 829 }, { "epoch": 0.1586998087954111, "grad_norm": 3.9762401580810547, "learning_rate": 4.270815319672684e-06, "loss": 0.3162, "step": 830 }, { "epoch": 0.1588910133843212, "grad_norm": 2.093393325805664, "learning_rate": 4.2715804048803785e-06, "loss": 0.0985, "step": 831 }, { "epoch": 0.15908221797323135, "grad_norm": 1.6958585977554321, "learning_rate": 4.272344569961324e-06, "loss": 0.1414, "step": 832 }, { "epoch": 0.1592734225621415, "grad_norm": 1.754672884941101, "learning_rate": 4.273107817126036e-06, "loss": 0.1115, "step": 833 }, { "epoch": 0.1594646271510516, "grad_norm": 2.3991124629974365, "learning_rate": 4.273870148577072e-06, "loss": 0.3149, "step": 834 }, { "epoch": 0.15965583173996176, "grad_norm": 1.6448285579681396, "learning_rate": 4.2746315665090745e-06, "loss": 0.0904, "step": 835 }, { "epoch": 0.1598470363288719, "grad_norm": 2.705641984939575, "learning_rate": 4.275392073108804e-06, "loss": 0.2193, "step": 836 }, { "epoch": 0.16003824091778202, "grad_norm": 3.0307109355926514, "learning_rate": 4.27615167055518e-06, "loss": 0.1033, "step": 837 }, { "epoch": 0.16022944550669216, "grad_norm": 3.0683274269104004, "learning_rate": 4.276910361019314e-06, "loss": 0.3433, "step": 838 }, { "epoch": 0.1604206500956023, "grad_norm": 2.956444501876831, "learning_rate": 4.277668146664553e-06, "loss": 0.3323, "step": 839 }, { "epoch": 0.16061185468451242, "grad_norm": 1.917574167251587, "learning_rate": 4.278425029646511e-06, "loss": 0.1704, "step": 840 }, { "epoch": 0.16080305927342256, "grad_norm": 2.8657712936401367, "learning_rate": 4.2791810121131075e-06, "loss": 0.217, "step": 841 }, { "epoch": 0.1609942638623327, "grad_norm": 2.1672983169555664, "learning_rate": 4.279936096204607e-06, "loss": 0.1102, "step": 842 }, { "epoch": 0.16118546845124282, "grad_norm": 2.7191829681396484, "learning_rate": 4.2806902840536505e-06, "loss": 0.2781, "step": 843 }, { "epoch": 0.16137667304015296, "grad_norm": 1.849647045135498, "learning_rate": 4.2814435777852955e-06, "loss": 0.2105, "step": 844 }, { "epoch": 0.1615678776290631, "grad_norm": 2.904238700866699, "learning_rate": 4.2821959795170494e-06, "loss": 0.5104, "step": 845 }, { "epoch": 0.16175908221797322, "grad_norm": 1.7526648044586182, "learning_rate": 4.282947491358906e-06, "loss": 0.1464, "step": 846 }, { "epoch": 0.16195028680688336, "grad_norm": 3.317267417907715, "learning_rate": 4.283698115413385e-06, "loss": 0.4144, "step": 847 }, { "epoch": 0.1621414913957935, "grad_norm": 1.7838001251220703, "learning_rate": 4.284447853775558e-06, "loss": 0.0672, "step": 848 }, { "epoch": 0.16233269598470362, "grad_norm": 2.0435261726379395, "learning_rate": 4.2851967085330925e-06, "loss": 0.1293, "step": 849 }, { "epoch": 0.16252390057361377, "grad_norm": 2.0593795776367188, "learning_rate": 4.2859446817662824e-06, "loss": 0.0951, "step": 850 }, { "epoch": 0.1627151051625239, "grad_norm": 2.386403799057007, "learning_rate": 4.286691775548084e-06, "loss": 0.3164, "step": 851 }, { "epoch": 0.16290630975143403, "grad_norm": 1.7959471940994263, "learning_rate": 4.28743799194415e-06, "loss": 0.1443, "step": 852 }, { "epoch": 0.16309751434034417, "grad_norm": 4.4852681159973145, "learning_rate": 4.288183333012865e-06, "loss": 0.6776, "step": 853 }, { "epoch": 0.1632887189292543, "grad_norm": 1.6934727430343628, "learning_rate": 4.288927800805377e-06, "loss": 0.1368, "step": 854 }, { "epoch": 0.16347992351816443, "grad_norm": 3.038947582244873, "learning_rate": 4.289671397365632e-06, "loss": 0.3026, "step": 855 }, { "epoch": 0.16367112810707457, "grad_norm": 2.0727596282958984, "learning_rate": 4.290414124730413e-06, "loss": 0.1034, "step": 856 }, { "epoch": 0.1638623326959847, "grad_norm": 3.337085008621216, "learning_rate": 4.291155984929362e-06, "loss": 0.4055, "step": 857 }, { "epoch": 0.16405353728489483, "grad_norm": 2.652127504348755, "learning_rate": 4.291896979985027e-06, "loss": 0.3585, "step": 858 }, { "epoch": 0.16424474187380497, "grad_norm": 3.1562952995300293, "learning_rate": 4.292637111912882e-06, "loss": 0.446, "step": 859 }, { "epoch": 0.16443594646271512, "grad_norm": 2.3302462100982666, "learning_rate": 4.293376382721373e-06, "loss": 0.2846, "step": 860 }, { "epoch": 0.16462715105162523, "grad_norm": 1.904043197631836, "learning_rate": 4.294114794411938e-06, "loss": 0.1925, "step": 861 }, { "epoch": 0.16481835564053537, "grad_norm": 2.2849550247192383, "learning_rate": 4.294852348979047e-06, "loss": 0.1391, "step": 862 }, { "epoch": 0.16500956022944552, "grad_norm": 2.6952767372131348, "learning_rate": 4.295589048410235e-06, "loss": 0.302, "step": 863 }, { "epoch": 0.16520076481835563, "grad_norm": 3.0628738403320312, "learning_rate": 4.2963248946861294e-06, "loss": 0.4776, "step": 864 }, { "epoch": 0.16539196940726578, "grad_norm": 2.940101385116577, "learning_rate": 4.297059889780485e-06, "loss": 0.2961, "step": 865 }, { "epoch": 0.16558317399617592, "grad_norm": 2.9635932445526123, "learning_rate": 4.297794035660217e-06, "loss": 0.4084, "step": 866 }, { "epoch": 0.16577437858508604, "grad_norm": 1.4240761995315552, "learning_rate": 4.298527334285426e-06, "loss": 0.0877, "step": 867 }, { "epoch": 0.16596558317399618, "grad_norm": 1.576123595237732, "learning_rate": 4.29925978760944e-06, "loss": 0.1101, "step": 868 }, { "epoch": 0.16615678776290632, "grad_norm": 2.8661000728607178, "learning_rate": 4.299991397578835e-06, "loss": 0.1854, "step": 869 }, { "epoch": 0.16634799235181644, "grad_norm": 2.202101469039917, "learning_rate": 4.300722166133473e-06, "loss": 0.1717, "step": 870 }, { "epoch": 0.16653919694072658, "grad_norm": 1.9804801940917969, "learning_rate": 4.301452095206531e-06, "loss": 0.419, "step": 871 }, { "epoch": 0.16673040152963672, "grad_norm": 1.402471899986267, "learning_rate": 4.302181186724532e-06, "loss": 0.1489, "step": 872 }, { "epoch": 0.16692160611854684, "grad_norm": 2.836812973022461, "learning_rate": 4.302909442607371e-06, "loss": 0.2623, "step": 873 }, { "epoch": 0.16711281070745698, "grad_norm": 1.8831483125686646, "learning_rate": 4.303636864768353e-06, "loss": 0.1072, "step": 874 }, { "epoch": 0.16730401529636713, "grad_norm": 1.8472790718078613, "learning_rate": 4.304363455114219e-06, "loss": 0.1246, "step": 875 }, { "epoch": 0.16749521988527724, "grad_norm": 2.792137622833252, "learning_rate": 4.305089215545175e-06, "loss": 0.3111, "step": 876 }, { "epoch": 0.16768642447418738, "grad_norm": 2.943768262863159, "learning_rate": 4.305814147954922e-06, "loss": 0.6068, "step": 877 }, { "epoch": 0.16787762906309753, "grad_norm": 2.1078765392303467, "learning_rate": 4.306538254230691e-06, "loss": 0.1484, "step": 878 }, { "epoch": 0.16806883365200764, "grad_norm": 3.882357597351074, "learning_rate": 4.307261536253264e-06, "loss": 0.2091, "step": 879 }, { "epoch": 0.1682600382409178, "grad_norm": 1.381016492843628, "learning_rate": 4.30798399589701e-06, "loss": 0.0775, "step": 880 }, { "epoch": 0.16845124282982793, "grad_norm": 2.2446751594543457, "learning_rate": 4.308705635029911e-06, "loss": 0.0931, "step": 881 }, { "epoch": 0.16864244741873805, "grad_norm": 3.000154733657837, "learning_rate": 4.309426455513591e-06, "loss": 0.3437, "step": 882 }, { "epoch": 0.1688336520076482, "grad_norm": 2.4180312156677246, "learning_rate": 4.310146459203349e-06, "loss": 0.1738, "step": 883 }, { "epoch": 0.1690248565965583, "grad_norm": 1.9331579208374023, "learning_rate": 4.310865647948177e-06, "loss": 0.2825, "step": 884 }, { "epoch": 0.16921606118546845, "grad_norm": 3.176043748855591, "learning_rate": 4.311584023590803e-06, "loss": 0.435, "step": 885 }, { "epoch": 0.1694072657743786, "grad_norm": 2.9748902320861816, "learning_rate": 4.312301587967709e-06, "loss": 0.442, "step": 886 }, { "epoch": 0.1695984703632887, "grad_norm": 1.8928931951522827, "learning_rate": 4.313018342909162e-06, "loss": 0.0825, "step": 887 }, { "epoch": 0.16978967495219885, "grad_norm": 3.570660352706909, "learning_rate": 4.3137342902392415e-06, "loss": 0.4021, "step": 888 }, { "epoch": 0.169980879541109, "grad_norm": 1.958633542060852, "learning_rate": 4.31444943177587e-06, "loss": 0.2978, "step": 889 }, { "epoch": 0.1701720841300191, "grad_norm": 2.055731773376465, "learning_rate": 4.315163769330838e-06, "loss": 0.2606, "step": 890 }, { "epoch": 0.17036328871892925, "grad_norm": 2.733567714691162, "learning_rate": 4.3158773047098325e-06, "loss": 0.3208, "step": 891 }, { "epoch": 0.1705544933078394, "grad_norm": 2.0781280994415283, "learning_rate": 4.3165900397124626e-06, "loss": 0.2838, "step": 892 }, { "epoch": 0.1707456978967495, "grad_norm": 4.172160625457764, "learning_rate": 4.317301976132288e-06, "loss": 0.1485, "step": 893 }, { "epoch": 0.17093690248565965, "grad_norm": 2.199040174484253, "learning_rate": 4.31801311575685e-06, "loss": 0.1481, "step": 894 }, { "epoch": 0.1711281070745698, "grad_norm": 3.143498420715332, "learning_rate": 4.318723460367692e-06, "loss": 0.5856, "step": 895 }, { "epoch": 0.1713193116634799, "grad_norm": 2.4403462409973145, "learning_rate": 4.319433011740389e-06, "loss": 0.4743, "step": 896 }, { "epoch": 0.17151051625239006, "grad_norm": 1.9097824096679688, "learning_rate": 4.3201417716445755e-06, "loss": 0.1086, "step": 897 }, { "epoch": 0.1717017208413002, "grad_norm": 2.7453420162200928, "learning_rate": 4.32084974184397e-06, "loss": 0.1981, "step": 898 }, { "epoch": 0.1718929254302103, "grad_norm": 1.3835936784744263, "learning_rate": 4.321556924096402e-06, "loss": 0.1109, "step": 899 }, { "epoch": 0.17208413001912046, "grad_norm": 1.855629563331604, "learning_rate": 4.322263320153839e-06, "loss": 0.0757, "step": 900 }, { "epoch": 0.1722753346080306, "grad_norm": 2.9253125190734863, "learning_rate": 4.322968931762411e-06, "loss": 0.2818, "step": 901 }, { "epoch": 0.17246653919694072, "grad_norm": 2.60461163520813, "learning_rate": 4.323673760662438e-06, "loss": 0.2558, "step": 902 }, { "epoch": 0.17265774378585086, "grad_norm": 1.764817476272583, "learning_rate": 4.324377808588454e-06, "loss": 0.1228, "step": 903 }, { "epoch": 0.172848948374761, "grad_norm": 2.949141502380371, "learning_rate": 4.3250810772692355e-06, "loss": 0.3251, "step": 904 }, { "epoch": 0.17304015296367112, "grad_norm": 2.435056686401367, "learning_rate": 4.3257835684278235e-06, "loss": 0.3511, "step": 905 }, { "epoch": 0.17323135755258126, "grad_norm": 2.2405662536621094, "learning_rate": 4.3264852837815515e-06, "loss": 0.0803, "step": 906 }, { "epoch": 0.1734225621414914, "grad_norm": 2.5514614582061768, "learning_rate": 4.327186225042066e-06, "loss": 0.3172, "step": 907 }, { "epoch": 0.17361376673040152, "grad_norm": 2.14652419090271, "learning_rate": 4.327886393915363e-06, "loss": 0.1575, "step": 908 }, { "epoch": 0.17380497131931166, "grad_norm": 1.3867746591567993, "learning_rate": 4.328585792101795e-06, "loss": 0.1291, "step": 909 }, { "epoch": 0.1739961759082218, "grad_norm": 3.6421027183532715, "learning_rate": 4.329284421296114e-06, "loss": 0.5589, "step": 910 }, { "epoch": 0.17418738049713192, "grad_norm": 2.516690731048584, "learning_rate": 4.329982283187484e-06, "loss": 0.1707, "step": 911 }, { "epoch": 0.17437858508604206, "grad_norm": 3.2310264110565186, "learning_rate": 4.330679379459511e-06, "loss": 0.2891, "step": 912 }, { "epoch": 0.1745697896749522, "grad_norm": 1.8623366355895996, "learning_rate": 4.331375711790265e-06, "loss": 0.1513, "step": 913 }, { "epoch": 0.17476099426386232, "grad_norm": 3.0351459980010986, "learning_rate": 4.332071281852306e-06, "loss": 0.3003, "step": 914 }, { "epoch": 0.17495219885277247, "grad_norm": 3.0663366317749023, "learning_rate": 4.332766091312704e-06, "loss": 0.3235, "step": 915 }, { "epoch": 0.1751434034416826, "grad_norm": 1.8554519414901733, "learning_rate": 4.333460141833072e-06, "loss": 0.1387, "step": 916 }, { "epoch": 0.17533460803059273, "grad_norm": 3.4480252265930176, "learning_rate": 4.33415343506958e-06, "loss": 0.2172, "step": 917 }, { "epoch": 0.17552581261950287, "grad_norm": 2.396263837814331, "learning_rate": 4.334845972672983e-06, "loss": 0.2369, "step": 918 }, { "epoch": 0.175717017208413, "grad_norm": 2.6219048500061035, "learning_rate": 4.335537756288644e-06, "loss": 0.2771, "step": 919 }, { "epoch": 0.17590822179732313, "grad_norm": 2.162111759185791, "learning_rate": 4.3362287875565595e-06, "loss": 0.3263, "step": 920 }, { "epoch": 0.17609942638623327, "grad_norm": 2.3528635501861572, "learning_rate": 4.33691906811138e-06, "loss": 0.1726, "step": 921 }, { "epoch": 0.17629063097514341, "grad_norm": 3.782550096511841, "learning_rate": 4.337608599582434e-06, "loss": 0.3164, "step": 922 }, { "epoch": 0.17648183556405353, "grad_norm": 2.536262035369873, "learning_rate": 4.338297383593754e-06, "loss": 0.3199, "step": 923 }, { "epoch": 0.17667304015296367, "grad_norm": 4.701323986053467, "learning_rate": 4.338985421764091e-06, "loss": 0.258, "step": 924 }, { "epoch": 0.17686424474187382, "grad_norm": 2.707467555999756, "learning_rate": 4.33967271570695e-06, "loss": 0.105, "step": 925 }, { "epoch": 0.17705544933078393, "grad_norm": 3.847241163253784, "learning_rate": 4.340359267030601e-06, "loss": 0.6552, "step": 926 }, { "epoch": 0.17724665391969407, "grad_norm": 1.8708347082138062, "learning_rate": 4.341045077338109e-06, "loss": 0.2187, "step": 927 }, { "epoch": 0.17743785850860422, "grad_norm": 2.500690460205078, "learning_rate": 4.341730148227351e-06, "loss": 0.1601, "step": 928 }, { "epoch": 0.17762906309751433, "grad_norm": 1.6813768148422241, "learning_rate": 4.342414481291046e-06, "loss": 0.0899, "step": 929 }, { "epoch": 0.17782026768642448, "grad_norm": 1.971083641052246, "learning_rate": 4.343098078116767e-06, "loss": 0.1689, "step": 930 }, { "epoch": 0.17801147227533462, "grad_norm": 1.656010389328003, "learning_rate": 4.343780940286974e-06, "loss": 0.0773, "step": 931 }, { "epoch": 0.17820267686424474, "grad_norm": 3.547978639602661, "learning_rate": 4.344463069379024e-06, "loss": 0.4317, "step": 932 }, { "epoch": 0.17839388145315488, "grad_norm": 1.7893317937850952, "learning_rate": 4.345144466965205e-06, "loss": 0.1723, "step": 933 }, { "epoch": 0.17858508604206502, "grad_norm": 1.7867648601531982, "learning_rate": 4.345825134612752e-06, "loss": 0.1345, "step": 934 }, { "epoch": 0.17877629063097514, "grad_norm": 3.299664258956909, "learning_rate": 4.3465050738838635e-06, "loss": 0.4693, "step": 935 }, { "epoch": 0.17896749521988528, "grad_norm": 2.6684377193450928, "learning_rate": 4.347184286335733e-06, "loss": 0.3825, "step": 936 }, { "epoch": 0.17915869980879542, "grad_norm": 1.9024335145950317, "learning_rate": 4.347862773520565e-06, "loss": 0.1348, "step": 937 }, { "epoch": 0.17934990439770554, "grad_norm": 2.407212972640991, "learning_rate": 4.348540536985596e-06, "loss": 0.3481, "step": 938 }, { "epoch": 0.17954110898661568, "grad_norm": 2.431903600692749, "learning_rate": 4.349217578273117e-06, "loss": 0.326, "step": 939 }, { "epoch": 0.17973231357552583, "grad_norm": 2.8086395263671875, "learning_rate": 4.349893898920495e-06, "loss": 0.3743, "step": 940 }, { "epoch": 0.17992351816443594, "grad_norm": 1.9264482259750366, "learning_rate": 4.350569500460188e-06, "loss": 0.2072, "step": 941 }, { "epoch": 0.18011472275334608, "grad_norm": 1.8798692226409912, "learning_rate": 4.35124438441978e-06, "loss": 0.0674, "step": 942 }, { "epoch": 0.1803059273422562, "grad_norm": 1.4653009176254272, "learning_rate": 4.351918552321986e-06, "loss": 0.0763, "step": 943 }, { "epoch": 0.18049713193116634, "grad_norm": 2.255692958831787, "learning_rate": 4.3525920056846815e-06, "loss": 0.124, "step": 944 }, { "epoch": 0.1806883365200765, "grad_norm": 2.478980779647827, "learning_rate": 4.35326474602092e-06, "loss": 0.3314, "step": 945 }, { "epoch": 0.1808795411089866, "grad_norm": 1.5225646495819092, "learning_rate": 4.353936774838954e-06, "loss": 0.353, "step": 946 }, { "epoch": 0.18107074569789675, "grad_norm": 1.214327096939087, "learning_rate": 4.354608093642257e-06, "loss": 0.0809, "step": 947 }, { "epoch": 0.1812619502868069, "grad_norm": 2.2034029960632324, "learning_rate": 4.355278703929541e-06, "loss": 0.2837, "step": 948 }, { "epoch": 0.181453154875717, "grad_norm": 3.38645076751709, "learning_rate": 4.3559486071947785e-06, "loss": 0.378, "step": 949 }, { "epoch": 0.18164435946462715, "grad_norm": 1.7553558349609375, "learning_rate": 4.3566178049272204e-06, "loss": 0.1021, "step": 950 }, { "epoch": 0.1818355640535373, "grad_norm": 2.1390013694763184, "learning_rate": 4.357286298611418e-06, "loss": 0.2978, "step": 951 }, { "epoch": 0.1820267686424474, "grad_norm": 2.9334323406219482, "learning_rate": 4.3579540897272416e-06, "loss": 0.5175, "step": 952 }, { "epoch": 0.18221797323135755, "grad_norm": 1.2445647716522217, "learning_rate": 4.358621179749902e-06, "loss": 0.108, "step": 953 }, { "epoch": 0.1824091778202677, "grad_norm": 1.3846619129180908, "learning_rate": 4.359287570149967e-06, "loss": 0.1046, "step": 954 }, { "epoch": 0.1826003824091778, "grad_norm": 2.8698277473449707, "learning_rate": 4.359953262393382e-06, "loss": 0.1461, "step": 955 }, { "epoch": 0.18279158699808795, "grad_norm": 1.6638492345809937, "learning_rate": 4.3606182579414905e-06, "loss": 0.1026, "step": 956 }, { "epoch": 0.1829827915869981, "grad_norm": 2.5066792964935303, "learning_rate": 4.361282558251054e-06, "loss": 0.2583, "step": 957 }, { "epoch": 0.1831739961759082, "grad_norm": 3.5391042232513428, "learning_rate": 4.361946164774267e-06, "loss": 0.5604, "step": 958 }, { "epoch": 0.18336520076481835, "grad_norm": 2.347865104675293, "learning_rate": 4.36260907895878e-06, "loss": 0.2277, "step": 959 }, { "epoch": 0.1835564053537285, "grad_norm": 2.174129009246826, "learning_rate": 4.363271302247718e-06, "loss": 0.1745, "step": 960 }, { "epoch": 0.1837476099426386, "grad_norm": 1.4797195196151733, "learning_rate": 4.3639328360796964e-06, "loss": 0.1074, "step": 961 }, { "epoch": 0.18393881453154876, "grad_norm": 2.326012134552002, "learning_rate": 4.364593681888845e-06, "loss": 0.1232, "step": 962 }, { "epoch": 0.1841300191204589, "grad_norm": 2.7982571125030518, "learning_rate": 4.365253841104822e-06, "loss": 0.5077, "step": 963 }, { "epoch": 0.18432122370936901, "grad_norm": 3.0020434856414795, "learning_rate": 4.365913315152833e-06, "loss": 0.5043, "step": 964 }, { "epoch": 0.18451242829827916, "grad_norm": 4.000023365020752, "learning_rate": 4.366572105453653e-06, "loss": 0.3364, "step": 965 }, { "epoch": 0.1847036328871893, "grad_norm": 1.9444904327392578, "learning_rate": 4.367230213423641e-06, "loss": 0.1445, "step": 966 }, { "epoch": 0.18489483747609942, "grad_norm": 1.875847578048706, "learning_rate": 4.367887640474758e-06, "loss": 0.2279, "step": 967 }, { "epoch": 0.18508604206500956, "grad_norm": 1.9915648698806763, "learning_rate": 4.368544388014591e-06, "loss": 0.1222, "step": 968 }, { "epoch": 0.1852772466539197, "grad_norm": 2.044407844543457, "learning_rate": 4.369200457446364e-06, "loss": 0.1119, "step": 969 }, { "epoch": 0.18546845124282982, "grad_norm": 1.7450714111328125, "learning_rate": 4.3698558501689585e-06, "loss": 0.1761, "step": 970 }, { "epoch": 0.18565965583173996, "grad_norm": 1.9846551418304443, "learning_rate": 4.370510567576933e-06, "loss": 0.2268, "step": 971 }, { "epoch": 0.1858508604206501, "grad_norm": 2.735543727874756, "learning_rate": 4.3711646110605385e-06, "loss": 0.2611, "step": 972 }, { "epoch": 0.18604206500956022, "grad_norm": 2.1292803287506104, "learning_rate": 4.371817982005741e-06, "loss": 0.1536, "step": 973 }, { "epoch": 0.18623326959847036, "grad_norm": 2.738206386566162, "learning_rate": 4.372470681794229e-06, "loss": 0.1698, "step": 974 }, { "epoch": 0.1864244741873805, "grad_norm": 3.367114543914795, "learning_rate": 4.373122711803442e-06, "loss": 0.1996, "step": 975 }, { "epoch": 0.18661567877629062, "grad_norm": 3.1926040649414062, "learning_rate": 4.373774073406583e-06, "loss": 0.3569, "step": 976 }, { "epoch": 0.18680688336520077, "grad_norm": 3.1637473106384277, "learning_rate": 4.374424767972636e-06, "loss": 0.228, "step": 977 }, { "epoch": 0.1869980879541109, "grad_norm": 3.3024322986602783, "learning_rate": 4.375074796866383e-06, "loss": 0.5118, "step": 978 }, { "epoch": 0.18718929254302102, "grad_norm": 3.079582929611206, "learning_rate": 4.375724161448419e-06, "loss": 0.1612, "step": 979 }, { "epoch": 0.18738049713193117, "grad_norm": 1.2228554487228394, "learning_rate": 4.376372863075179e-06, "loss": 0.0437, "step": 980 }, { "epoch": 0.1875717017208413, "grad_norm": 3.335177183151245, "learning_rate": 4.377020903098941e-06, "loss": 0.169, "step": 981 }, { "epoch": 0.18776290630975143, "grad_norm": 2.6680805683135986, "learning_rate": 4.377668282867852e-06, "loss": 0.3572, "step": 982 }, { "epoch": 0.18795411089866157, "grad_norm": 2.7457680702209473, "learning_rate": 4.378315003725942e-06, "loss": 0.6047, "step": 983 }, { "epoch": 0.1881453154875717, "grad_norm": 2.021188974380493, "learning_rate": 4.3789610670131445e-06, "loss": 0.2667, "step": 984 }, { "epoch": 0.18833652007648183, "grad_norm": 2.8314571380615234, "learning_rate": 4.3796064740653045e-06, "loss": 0.4146, "step": 985 }, { "epoch": 0.18852772466539197, "grad_norm": 1.4372998476028442, "learning_rate": 4.380251226214205e-06, "loss": 0.1061, "step": 986 }, { "epoch": 0.18871892925430211, "grad_norm": 3.8322417736053467, "learning_rate": 4.380895324787575e-06, "loss": 0.2334, "step": 987 }, { "epoch": 0.18891013384321223, "grad_norm": 1.867491602897644, "learning_rate": 4.381538771109115e-06, "loss": 0.1069, "step": 988 }, { "epoch": 0.18910133843212237, "grad_norm": 2.0068299770355225, "learning_rate": 4.382181566498504e-06, "loss": 0.1687, "step": 989 }, { "epoch": 0.18929254302103252, "grad_norm": 2.4695441722869873, "learning_rate": 4.382823712271419e-06, "loss": 0.2867, "step": 990 }, { "epoch": 0.18948374760994263, "grad_norm": 3.3172388076782227, "learning_rate": 4.383465209739559e-06, "loss": 0.2976, "step": 991 }, { "epoch": 0.18967495219885278, "grad_norm": 1.749539852142334, "learning_rate": 4.384106060210646e-06, "loss": 0.1326, "step": 992 }, { "epoch": 0.18986615678776292, "grad_norm": 2.1479973793029785, "learning_rate": 4.384746264988454e-06, "loss": 0.1488, "step": 993 }, { "epoch": 0.19005736137667303, "grad_norm": 2.0028390884399414, "learning_rate": 4.385385825372819e-06, "loss": 0.1353, "step": 994 }, { "epoch": 0.19024856596558318, "grad_norm": 2.371243953704834, "learning_rate": 4.386024742659655e-06, "loss": 0.2165, "step": 995 }, { "epoch": 0.19043977055449332, "grad_norm": 1.8113404512405396, "learning_rate": 4.386663018140972e-06, "loss": 0.2138, "step": 996 }, { "epoch": 0.19063097514340344, "grad_norm": 2.1039295196533203, "learning_rate": 4.38730065310489e-06, "loss": 0.1077, "step": 997 }, { "epoch": 0.19082217973231358, "grad_norm": 1.7272989749908447, "learning_rate": 4.387937648835653e-06, "loss": 0.1083, "step": 998 }, { "epoch": 0.19101338432122372, "grad_norm": 3.76637601852417, "learning_rate": 4.388574006613651e-06, "loss": 0.2184, "step": 999 }, { "epoch": 0.19120458891013384, "grad_norm": 2.589611053466797, "learning_rate": 4.389209727715426e-06, "loss": 0.111, "step": 1000 }, { "epoch": 0.19120458891013384, "eval_runtime": 801.9588, "eval_samples_per_second": 1.913, "eval_steps_per_second": 0.239, "step": 1000 }, { "epoch": 0.19139579349904398, "grad_norm": 2.5449726581573486, "learning_rate": 4.389844813413695e-06, "loss": 0.2094, "step": 1001 }, { "epoch": 0.19158699808795412, "grad_norm": 2.380741834640503, "learning_rate": 4.390479264977362e-06, "loss": 0.2957, "step": 1002 }, { "epoch": 0.19177820267686424, "grad_norm": 2.056061029434204, "learning_rate": 4.391113083671535e-06, "loss": 0.3615, "step": 1003 }, { "epoch": 0.19196940726577438, "grad_norm": 2.0398001670837402, "learning_rate": 4.391746270757536e-06, "loss": 0.1292, "step": 1004 }, { "epoch": 0.1921606118546845, "grad_norm": 4.531376838684082, "learning_rate": 4.392378827492925e-06, "loss": 0.1914, "step": 1005 }, { "epoch": 0.19235181644359464, "grad_norm": 2.163278102874756, "learning_rate": 4.393010755131506e-06, "loss": 0.0862, "step": 1006 }, { "epoch": 0.19254302103250479, "grad_norm": 2.507943868637085, "learning_rate": 4.393642054923349e-06, "loss": 0.4044, "step": 1007 }, { "epoch": 0.1927342256214149, "grad_norm": 3.3545026779174805, "learning_rate": 4.394272728114798e-06, "loss": 0.4105, "step": 1008 }, { "epoch": 0.19292543021032504, "grad_norm": 3.325700044631958, "learning_rate": 4.3949027759484944e-06, "loss": 0.3183, "step": 1009 }, { "epoch": 0.1931166347992352, "grad_norm": 1.7217702865600586, "learning_rate": 4.395532199663382e-06, "loss": 0.1855, "step": 1010 }, { "epoch": 0.1933078393881453, "grad_norm": 1.5247466564178467, "learning_rate": 4.396161000494729e-06, "loss": 0.0998, "step": 1011 }, { "epoch": 0.19349904397705545, "grad_norm": 2.8211965560913086, "learning_rate": 4.396789179674141e-06, "loss": 0.0913, "step": 1012 }, { "epoch": 0.1936902485659656, "grad_norm": 3.427171468734741, "learning_rate": 4.397416738429569e-06, "loss": 0.1638, "step": 1013 }, { "epoch": 0.1938814531548757, "grad_norm": 2.2865030765533447, "learning_rate": 4.398043677985337e-06, "loss": 0.1951, "step": 1014 }, { "epoch": 0.19407265774378585, "grad_norm": 1.854646921157837, "learning_rate": 4.398669999562141e-06, "loss": 0.1315, "step": 1015 }, { "epoch": 0.194263862332696, "grad_norm": 2.7197680473327637, "learning_rate": 4.399295704377077e-06, "loss": 0.3981, "step": 1016 }, { "epoch": 0.1944550669216061, "grad_norm": 1.8270578384399414, "learning_rate": 4.399920793643645e-06, "loss": 0.1379, "step": 1017 }, { "epoch": 0.19464627151051625, "grad_norm": 3.02530574798584, "learning_rate": 4.400545268571768e-06, "loss": 0.2511, "step": 1018 }, { "epoch": 0.1948374760994264, "grad_norm": 2.5796334743499756, "learning_rate": 4.401169130367807e-06, "loss": 0.1854, "step": 1019 }, { "epoch": 0.1950286806883365, "grad_norm": 3.3476648330688477, "learning_rate": 4.4017923802345695e-06, "loss": 0.3143, "step": 1020 }, { "epoch": 0.19521988527724665, "grad_norm": 2.5378341674804688, "learning_rate": 4.4024150193713316e-06, "loss": 0.4362, "step": 1021 }, { "epoch": 0.1954110898661568, "grad_norm": 1.8505767583847046, "learning_rate": 4.403037048973844e-06, "loss": 0.2285, "step": 1022 }, { "epoch": 0.1956022944550669, "grad_norm": 2.6891872882843018, "learning_rate": 4.403658470234348e-06, "loss": 0.2091, "step": 1023 }, { "epoch": 0.19579349904397705, "grad_norm": 2.9312164783477783, "learning_rate": 4.404279284341596e-06, "loss": 0.2019, "step": 1024 }, { "epoch": 0.1959847036328872, "grad_norm": 2.8806815147399902, "learning_rate": 4.404899492480853e-06, "loss": 0.1688, "step": 1025 }, { "epoch": 0.1961759082217973, "grad_norm": 3.487785577774048, "learning_rate": 4.405519095833921e-06, "loss": 0.502, "step": 1026 }, { "epoch": 0.19636711281070746, "grad_norm": 2.5800065994262695, "learning_rate": 4.406138095579144e-06, "loss": 0.3615, "step": 1027 }, { "epoch": 0.1965583173996176, "grad_norm": 2.3235068321228027, "learning_rate": 4.406756492891431e-06, "loss": 0.2984, "step": 1028 }, { "epoch": 0.19674952198852771, "grad_norm": 4.1836628913879395, "learning_rate": 4.407374288942261e-06, "loss": 0.3625, "step": 1029 }, { "epoch": 0.19694072657743786, "grad_norm": 3.1203243732452393, "learning_rate": 4.407991484899696e-06, "loss": 0.1182, "step": 1030 }, { "epoch": 0.197131931166348, "grad_norm": 1.4133257865905762, "learning_rate": 4.408608081928404e-06, "loss": 0.0475, "step": 1031 }, { "epoch": 0.19732313575525812, "grad_norm": 2.494713068008423, "learning_rate": 4.4092240811896605e-06, "loss": 0.2211, "step": 1032 }, { "epoch": 0.19751434034416826, "grad_norm": 1.6299430131912231, "learning_rate": 4.40983948384137e-06, "loss": 0.1825, "step": 1033 }, { "epoch": 0.1977055449330784, "grad_norm": 3.060887575149536, "learning_rate": 4.410454291038076e-06, "loss": 0.307, "step": 1034 }, { "epoch": 0.19789674952198852, "grad_norm": 2.6344666481018066, "learning_rate": 4.411068503930969e-06, "loss": 0.2975, "step": 1035 }, { "epoch": 0.19808795411089866, "grad_norm": 1.8494610786437988, "learning_rate": 4.41168212366791e-06, "loss": 0.0681, "step": 1036 }, { "epoch": 0.1982791586998088, "grad_norm": 2.327357530593872, "learning_rate": 4.412295151393437e-06, "loss": 0.1178, "step": 1037 }, { "epoch": 0.19847036328871892, "grad_norm": 1.8669428825378418, "learning_rate": 4.412907588248774e-06, "loss": 0.2461, "step": 1038 }, { "epoch": 0.19866156787762906, "grad_norm": 3.062044143676758, "learning_rate": 4.413519435371853e-06, "loss": 0.468, "step": 1039 }, { "epoch": 0.1988527724665392, "grad_norm": 3.0303285121917725, "learning_rate": 4.4141306938973205e-06, "loss": 0.3054, "step": 1040 }, { "epoch": 0.19904397705544932, "grad_norm": 2.5444071292877197, "learning_rate": 4.414741364956551e-06, "loss": 0.1862, "step": 1041 }, { "epoch": 0.19923518164435947, "grad_norm": 2.047647476196289, "learning_rate": 4.415351449677661e-06, "loss": 0.2525, "step": 1042 }, { "epoch": 0.1994263862332696, "grad_norm": 3.2048020362854004, "learning_rate": 4.415960949185519e-06, "loss": 0.1087, "step": 1043 }, { "epoch": 0.19961759082217972, "grad_norm": 1.5147563219070435, "learning_rate": 4.416569864601761e-06, "loss": 0.0919, "step": 1044 }, { "epoch": 0.19980879541108987, "grad_norm": 2.4613304138183594, "learning_rate": 4.4171781970448015e-06, "loss": 0.4064, "step": 1045 }, { "epoch": 0.2, "grad_norm": 2.334059953689575, "learning_rate": 4.417785947629845e-06, "loss": 0.2635, "step": 1046 }, { "epoch": 0.20019120458891013, "grad_norm": 1.446562647819519, "learning_rate": 4.418393117468899e-06, "loss": 0.1417, "step": 1047 }, { "epoch": 0.20038240917782027, "grad_norm": 3.4641342163085938, "learning_rate": 4.418999707670787e-06, "loss": 0.3766, "step": 1048 }, { "epoch": 0.2005736137667304, "grad_norm": 1.2733123302459717, "learning_rate": 4.4196057193411596e-06, "loss": 0.0523, "step": 1049 }, { "epoch": 0.20076481835564053, "grad_norm": 1.9554495811462402, "learning_rate": 4.420211153582507e-06, "loss": 0.1459, "step": 1050 }, { "epoch": 0.20095602294455067, "grad_norm": 3.5012214183807373, "learning_rate": 4.4208160114941716e-06, "loss": 0.4968, "step": 1051 }, { "epoch": 0.20114722753346082, "grad_norm": 2.0732827186584473, "learning_rate": 4.421420294172356e-06, "loss": 0.318, "step": 1052 }, { "epoch": 0.20133843212237093, "grad_norm": 2.6593754291534424, "learning_rate": 4.422024002710142e-06, "loss": 0.2944, "step": 1053 }, { "epoch": 0.20152963671128107, "grad_norm": 2.268272876739502, "learning_rate": 4.422627138197499e-06, "loss": 0.2813, "step": 1054 }, { "epoch": 0.20172084130019122, "grad_norm": 2.346163034439087, "learning_rate": 4.423229701721291e-06, "loss": 0.1823, "step": 1055 }, { "epoch": 0.20191204588910133, "grad_norm": 1.8060640096664429, "learning_rate": 4.423831694365298e-06, "loss": 0.1057, "step": 1056 }, { "epoch": 0.20210325047801148, "grad_norm": 2.9678351879119873, "learning_rate": 4.424433117210219e-06, "loss": 0.4058, "step": 1057 }, { "epoch": 0.20229445506692162, "grad_norm": 2.440826416015625, "learning_rate": 4.42503397133369e-06, "loss": 0.314, "step": 1058 }, { "epoch": 0.20248565965583173, "grad_norm": 2.8080196380615234, "learning_rate": 4.425634257810289e-06, "loss": 0.6267, "step": 1059 }, { "epoch": 0.20267686424474188, "grad_norm": 3.3547277450561523, "learning_rate": 4.426233977711554e-06, "loss": 0.2995, "step": 1060 }, { "epoch": 0.20286806883365202, "grad_norm": 1.7534793615341187, "learning_rate": 4.426833132105993e-06, "loss": 0.2257, "step": 1061 }, { "epoch": 0.20305927342256214, "grad_norm": 1.749898910522461, "learning_rate": 4.4274317220590905e-06, "loss": 0.1218, "step": 1062 }, { "epoch": 0.20325047801147228, "grad_norm": 1.7936410903930664, "learning_rate": 4.428029748633326e-06, "loss": 0.1347, "step": 1063 }, { "epoch": 0.20344168260038242, "grad_norm": 3.712153196334839, "learning_rate": 4.42862721288818e-06, "loss": 0.4845, "step": 1064 }, { "epoch": 0.20363288718929254, "grad_norm": 3.04533052444458, "learning_rate": 4.429224115880146e-06, "loss": 0.3209, "step": 1065 }, { "epoch": 0.20382409177820268, "grad_norm": 1.8066027164459229, "learning_rate": 4.429820458662747e-06, "loss": 0.1314, "step": 1066 }, { "epoch": 0.2040152963671128, "grad_norm": 1.466841697692871, "learning_rate": 4.43041624228654e-06, "loss": 0.0976, "step": 1067 }, { "epoch": 0.20420650095602294, "grad_norm": 2.905015468597412, "learning_rate": 4.431011467799127e-06, "loss": 0.273, "step": 1068 }, { "epoch": 0.20439770554493308, "grad_norm": 2.144706964492798, "learning_rate": 4.4316061362451714e-06, "loss": 0.1751, "step": 1069 }, { "epoch": 0.2045889101338432, "grad_norm": 2.6781883239746094, "learning_rate": 4.432200248666409e-06, "loss": 0.535, "step": 1070 }, { "epoch": 0.20478011472275334, "grad_norm": 2.1494102478027344, "learning_rate": 4.4327938061016515e-06, "loss": 0.1759, "step": 1071 }, { "epoch": 0.20497131931166349, "grad_norm": 2.51594877243042, "learning_rate": 4.433386809586803e-06, "loss": 0.1144, "step": 1072 }, { "epoch": 0.2051625239005736, "grad_norm": 1.9181592464447021, "learning_rate": 4.433979260154872e-06, "loss": 0.1752, "step": 1073 }, { "epoch": 0.20535372848948374, "grad_norm": 2.254838466644287, "learning_rate": 4.4345711588359805e-06, "loss": 0.0968, "step": 1074 }, { "epoch": 0.2055449330783939, "grad_norm": 2.161107063293457, "learning_rate": 4.435162506657369e-06, "loss": 0.0813, "step": 1075 }, { "epoch": 0.205736137667304, "grad_norm": 2.8448431491851807, "learning_rate": 4.435753304643421e-06, "loss": 0.6405, "step": 1076 }, { "epoch": 0.20592734225621415, "grad_norm": 1.856491208076477, "learning_rate": 4.436343553815657e-06, "loss": 0.1512, "step": 1077 }, { "epoch": 0.2061185468451243, "grad_norm": 2.017235517501831, "learning_rate": 4.43693325519276e-06, "loss": 0.1791, "step": 1078 }, { "epoch": 0.2063097514340344, "grad_norm": 2.606391429901123, "learning_rate": 4.437522409790575e-06, "loss": 0.2601, "step": 1079 }, { "epoch": 0.20650095602294455, "grad_norm": 2.5404584407806396, "learning_rate": 4.438111018622127e-06, "loss": 0.2392, "step": 1080 }, { "epoch": 0.2066921606118547, "grad_norm": 2.630312919616699, "learning_rate": 4.438699082697624e-06, "loss": 0.1322, "step": 1081 }, { "epoch": 0.2068833652007648, "grad_norm": 2.1289303302764893, "learning_rate": 4.4392866030244765e-06, "loss": 0.2648, "step": 1082 }, { "epoch": 0.20707456978967495, "grad_norm": 3.3554677963256836, "learning_rate": 4.439873580607301e-06, "loss": 0.5565, "step": 1083 }, { "epoch": 0.2072657743785851, "grad_norm": 2.1717491149902344, "learning_rate": 4.440460016447934e-06, "loss": 0.1442, "step": 1084 }, { "epoch": 0.2074569789674952, "grad_norm": 2.714172601699829, "learning_rate": 4.441045911545436e-06, "loss": 0.3317, "step": 1085 }, { "epoch": 0.20764818355640535, "grad_norm": 2.02815318107605, "learning_rate": 4.441631266896111e-06, "loss": 0.1441, "step": 1086 }, { "epoch": 0.2078393881453155, "grad_norm": 1.8019261360168457, "learning_rate": 4.442216083493512e-06, "loss": 0.0926, "step": 1087 }, { "epoch": 0.2080305927342256, "grad_norm": 1.846985101699829, "learning_rate": 4.442800362328448e-06, "loss": 0.1832, "step": 1088 }, { "epoch": 0.20822179732313575, "grad_norm": 2.089106321334839, "learning_rate": 4.443384104389001e-06, "loss": 0.1436, "step": 1089 }, { "epoch": 0.2084130019120459, "grad_norm": 2.037452459335327, "learning_rate": 4.443967310660528e-06, "loss": 0.1578, "step": 1090 }, { "epoch": 0.208604206500956, "grad_norm": 2.5428967475891113, "learning_rate": 4.444549982125679e-06, "loss": 0.5048, "step": 1091 }, { "epoch": 0.20879541108986616, "grad_norm": 1.7422195672988892, "learning_rate": 4.445132119764402e-06, "loss": 0.1375, "step": 1092 }, { "epoch": 0.2089866156787763, "grad_norm": 1.4205963611602783, "learning_rate": 4.445713724553953e-06, "loss": 0.0716, "step": 1093 }, { "epoch": 0.20917782026768642, "grad_norm": 2.83103346824646, "learning_rate": 4.446294797468908e-06, "loss": 0.1791, "step": 1094 }, { "epoch": 0.20936902485659656, "grad_norm": 2.2832252979278564, "learning_rate": 4.446875339481172e-06, "loss": 0.2271, "step": 1095 }, { "epoch": 0.2095602294455067, "grad_norm": 2.6716253757476807, "learning_rate": 4.447455351559987e-06, "loss": 0.4096, "step": 1096 }, { "epoch": 0.20975143403441682, "grad_norm": 1.877040147781372, "learning_rate": 4.448034834671944e-06, "loss": 0.27, "step": 1097 }, { "epoch": 0.20994263862332696, "grad_norm": 1.4391602277755737, "learning_rate": 4.448613789780993e-06, "loss": 0.076, "step": 1098 }, { "epoch": 0.2101338432122371, "grad_norm": 1.844596266746521, "learning_rate": 4.449192217848449e-06, "loss": 0.1119, "step": 1099 }, { "epoch": 0.21032504780114722, "grad_norm": 4.228497505187988, "learning_rate": 4.449770119833007e-06, "loss": 0.1945, "step": 1100 }, { "epoch": 0.21051625239005736, "grad_norm": 2.4177355766296387, "learning_rate": 4.450347496690746e-06, "loss": 0.3176, "step": 1101 }, { "epoch": 0.2107074569789675, "grad_norm": 3.1131842136383057, "learning_rate": 4.450924349375142e-06, "loss": 0.3662, "step": 1102 }, { "epoch": 0.21089866156787762, "grad_norm": 1.61544668674469, "learning_rate": 4.4515006788370775e-06, "loss": 0.2676, "step": 1103 }, { "epoch": 0.21108986615678776, "grad_norm": 4.033684253692627, "learning_rate": 4.4520764860248474e-06, "loss": 0.4563, "step": 1104 }, { "epoch": 0.2112810707456979, "grad_norm": 2.592637062072754, "learning_rate": 4.452651771884173e-06, "loss": 0.3416, "step": 1105 }, { "epoch": 0.21147227533460802, "grad_norm": 1.7672158479690552, "learning_rate": 4.45322653735821e-06, "loss": 0.1088, "step": 1106 }, { "epoch": 0.21166347992351817, "grad_norm": 1.9779208898544312, "learning_rate": 4.453800783387554e-06, "loss": 0.2377, "step": 1107 }, { "epoch": 0.2118546845124283, "grad_norm": 2.543431043624878, "learning_rate": 4.454374510910254e-06, "loss": 0.2436, "step": 1108 }, { "epoch": 0.21204588910133843, "grad_norm": 4.803256988525391, "learning_rate": 4.454947720861822e-06, "loss": 0.3721, "step": 1109 }, { "epoch": 0.21223709369024857, "grad_norm": 2.2504935264587402, "learning_rate": 4.455520414175238e-06, "loss": 0.2385, "step": 1110 }, { "epoch": 0.2124282982791587, "grad_norm": 2.656966209411621, "learning_rate": 4.456092591780964e-06, "loss": 0.1749, "step": 1111 }, { "epoch": 0.21261950286806883, "grad_norm": 2.746769666671753, "learning_rate": 4.4566642546069476e-06, "loss": 0.109, "step": 1112 }, { "epoch": 0.21281070745697897, "grad_norm": 3.2161478996276855, "learning_rate": 4.457235403578636e-06, "loss": 0.2271, "step": 1113 }, { "epoch": 0.2130019120458891, "grad_norm": 4.615819931030273, "learning_rate": 4.457806039618983e-06, "loss": 0.5054, "step": 1114 }, { "epoch": 0.21319311663479923, "grad_norm": 2.4721786975860596, "learning_rate": 4.458376163648458e-06, "loss": 0.2637, "step": 1115 }, { "epoch": 0.21338432122370937, "grad_norm": 1.3068219423294067, "learning_rate": 4.458945776585056e-06, "loss": 0.1306, "step": 1116 }, { "epoch": 0.21357552581261952, "grad_norm": 1.7614368200302124, "learning_rate": 4.459514879344301e-06, "loss": 0.0854, "step": 1117 }, { "epoch": 0.21376673040152963, "grad_norm": 2.025942325592041, "learning_rate": 4.460083472839265e-06, "loss": 0.1001, "step": 1118 }, { "epoch": 0.21395793499043977, "grad_norm": 1.3237824440002441, "learning_rate": 4.460651557980566e-06, "loss": 0.091, "step": 1119 }, { "epoch": 0.21414913957934992, "grad_norm": 3.109684705734253, "learning_rate": 4.461219135676386e-06, "loss": 0.6804, "step": 1120 }, { "epoch": 0.21434034416826003, "grad_norm": 3.659759044647217, "learning_rate": 4.461786206832473e-06, "loss": 0.4264, "step": 1121 }, { "epoch": 0.21453154875717018, "grad_norm": 3.0191221237182617, "learning_rate": 4.462352772352151e-06, "loss": 0.2061, "step": 1122 }, { "epoch": 0.21472275334608032, "grad_norm": 0.91303551197052, "learning_rate": 4.4629188331363334e-06, "loss": 0.1592, "step": 1123 }, { "epoch": 0.21491395793499043, "grad_norm": 2.4983134269714355, "learning_rate": 4.463484390083525e-06, "loss": 0.1401, "step": 1124 }, { "epoch": 0.21510516252390058, "grad_norm": 2.4315645694732666, "learning_rate": 4.464049444089835e-06, "loss": 0.1035, "step": 1125 }, { "epoch": 0.2152963671128107, "grad_norm": 2.098078489303589, "learning_rate": 4.464613996048983e-06, "loss": 0.2432, "step": 1126 }, { "epoch": 0.21548757170172084, "grad_norm": 2.7613978385925293, "learning_rate": 4.465178046852309e-06, "loss": 0.401, "step": 1127 }, { "epoch": 0.21567877629063098, "grad_norm": 2.0903496742248535, "learning_rate": 4.465741597388782e-06, "loss": 0.1743, "step": 1128 }, { "epoch": 0.2158699808795411, "grad_norm": 2.099884271621704, "learning_rate": 4.466304648545006e-06, "loss": 0.1594, "step": 1129 }, { "epoch": 0.21606118546845124, "grad_norm": 1.7775846719741821, "learning_rate": 4.466867201205232e-06, "loss": 0.0927, "step": 1130 }, { "epoch": 0.21625239005736138, "grad_norm": 1.9474855661392212, "learning_rate": 4.467429256251365e-06, "loss": 0.1128, "step": 1131 }, { "epoch": 0.2164435946462715, "grad_norm": 2.1535003185272217, "learning_rate": 4.467990814562967e-06, "loss": 0.2644, "step": 1132 }, { "epoch": 0.21663479923518164, "grad_norm": 3.28275203704834, "learning_rate": 4.4685518770172775e-06, "loss": 0.4298, "step": 1133 }, { "epoch": 0.21682600382409178, "grad_norm": 2.098271608352661, "learning_rate": 4.469112444489207e-06, "loss": 0.1012, "step": 1134 }, { "epoch": 0.2170172084130019, "grad_norm": 2.1434457302093506, "learning_rate": 4.469672517851359e-06, "loss": 0.2505, "step": 1135 }, { "epoch": 0.21720841300191204, "grad_norm": 2.248441219329834, "learning_rate": 4.470232097974025e-06, "loss": 0.2486, "step": 1136 }, { "epoch": 0.2173996175908222, "grad_norm": 1.4229638576507568, "learning_rate": 4.470791185725206e-06, "loss": 0.0635, "step": 1137 }, { "epoch": 0.2175908221797323, "grad_norm": 2.0013930797576904, "learning_rate": 4.4713497819706065e-06, "loss": 0.1122, "step": 1138 }, { "epoch": 0.21778202676864244, "grad_norm": 2.527571678161621, "learning_rate": 4.471907887573656e-06, "loss": 0.3839, "step": 1139 }, { "epoch": 0.2179732313575526, "grad_norm": 3.140901565551758, "learning_rate": 4.472465503395508e-06, "loss": 0.5159, "step": 1140 }, { "epoch": 0.2181644359464627, "grad_norm": 3.4777331352233887, "learning_rate": 4.473022630295051e-06, "loss": 0.2159, "step": 1141 }, { "epoch": 0.21835564053537285, "grad_norm": 1.6637810468673706, "learning_rate": 4.473579269128917e-06, "loss": 0.2454, "step": 1142 }, { "epoch": 0.218546845124283, "grad_norm": 3.4433083534240723, "learning_rate": 4.474135420751486e-06, "loss": 0.2348, "step": 1143 }, { "epoch": 0.2187380497131931, "grad_norm": 2.1617848873138428, "learning_rate": 4.474691086014902e-06, "loss": 0.2981, "step": 1144 }, { "epoch": 0.21892925430210325, "grad_norm": 2.460859775543213, "learning_rate": 4.475246265769069e-06, "loss": 0.4682, "step": 1145 }, { "epoch": 0.2191204588910134, "grad_norm": 1.9391944408416748, "learning_rate": 4.47580096086167e-06, "loss": 0.2622, "step": 1146 }, { "epoch": 0.2193116634799235, "grad_norm": 2.875788927078247, "learning_rate": 4.476355172138167e-06, "loss": 0.4374, "step": 1147 }, { "epoch": 0.21950286806883365, "grad_norm": 2.8496835231781006, "learning_rate": 4.476908900441812e-06, "loss": 0.297, "step": 1148 }, { "epoch": 0.2196940726577438, "grad_norm": 2.6283702850341797, "learning_rate": 4.477462146613657e-06, "loss": 0.1253, "step": 1149 }, { "epoch": 0.2198852772466539, "grad_norm": 1.8835588693618774, "learning_rate": 4.478014911492556e-06, "loss": 0.1859, "step": 1150 }, { "epoch": 0.22007648183556405, "grad_norm": 13.009642601013184, "learning_rate": 4.478567195915176e-06, "loss": 0.3384, "step": 1151 }, { "epoch": 0.2202676864244742, "grad_norm": 0.9838836193084717, "learning_rate": 4.479119000716005e-06, "loss": 0.0698, "step": 1152 }, { "epoch": 0.2204588910133843, "grad_norm": 2.835535764694214, "learning_rate": 4.479670326727359e-06, "loss": 0.394, "step": 1153 }, { "epoch": 0.22065009560229445, "grad_norm": 2.8879711627960205, "learning_rate": 4.480221174779389e-06, "loss": 0.2163, "step": 1154 }, { "epoch": 0.2208413001912046, "grad_norm": 2.8582956790924072, "learning_rate": 4.480771545700088e-06, "loss": 0.0787, "step": 1155 }, { "epoch": 0.2210325047801147, "grad_norm": 2.854708194732666, "learning_rate": 4.481321440315302e-06, "loss": 0.2055, "step": 1156 }, { "epoch": 0.22122370936902486, "grad_norm": 1.8422484397888184, "learning_rate": 4.481870859448731e-06, "loss": 0.1446, "step": 1157 }, { "epoch": 0.221414913957935, "grad_norm": 2.160968542098999, "learning_rate": 4.482419803921941e-06, "loss": 0.2785, "step": 1158 }, { "epoch": 0.22160611854684512, "grad_norm": 2.0334866046905518, "learning_rate": 4.482968274554374e-06, "loss": 0.1943, "step": 1159 }, { "epoch": 0.22179732313575526, "grad_norm": 1.5216292142868042, "learning_rate": 4.483516272163348e-06, "loss": 0.0838, "step": 1160 }, { "epoch": 0.2219885277246654, "grad_norm": 3.173959970474243, "learning_rate": 4.4840637975640696e-06, "loss": 0.272, "step": 1161 }, { "epoch": 0.22217973231357552, "grad_norm": 1.9395912885665894, "learning_rate": 4.4846108515696406e-06, "loss": 0.2258, "step": 1162 }, { "epoch": 0.22237093690248566, "grad_norm": 1.6804591417312622, "learning_rate": 4.485157434991062e-06, "loss": 0.0927, "step": 1163 }, { "epoch": 0.2225621414913958, "grad_norm": 2.5178074836730957, "learning_rate": 4.485703548637246e-06, "loss": 0.2079, "step": 1164 }, { "epoch": 0.22275334608030592, "grad_norm": 2.004671096801758, "learning_rate": 4.48624919331502e-06, "loss": 0.232, "step": 1165 }, { "epoch": 0.22294455066921606, "grad_norm": 1.9772891998291016, "learning_rate": 4.486794369829135e-06, "loss": 0.1463, "step": 1166 }, { "epoch": 0.2231357552581262, "grad_norm": 2.044581174850464, "learning_rate": 4.487339078982273e-06, "loss": 0.1012, "step": 1167 }, { "epoch": 0.22332695984703632, "grad_norm": 2.186039924621582, "learning_rate": 4.48788332157505e-06, "loss": 0.1192, "step": 1168 }, { "epoch": 0.22351816443594646, "grad_norm": 1.783949851989746, "learning_rate": 4.488427098406031e-06, "loss": 0.0964, "step": 1169 }, { "epoch": 0.2237093690248566, "grad_norm": 2.8822665214538574, "learning_rate": 4.4889704102717304e-06, "loss": 0.402, "step": 1170 }, { "epoch": 0.22390057361376672, "grad_norm": 2.142519235610962, "learning_rate": 4.48951325796662e-06, "loss": 0.302, "step": 1171 }, { "epoch": 0.22409177820267687, "grad_norm": 1.857020616531372, "learning_rate": 4.49005564228314e-06, "loss": 0.3136, "step": 1172 }, { "epoch": 0.224282982791587, "grad_norm": 2.0371313095092773, "learning_rate": 4.490597564011701e-06, "loss": 0.2399, "step": 1173 }, { "epoch": 0.22447418738049713, "grad_norm": 2.0099854469299316, "learning_rate": 4.491139023940692e-06, "loss": 0.0889, "step": 1174 }, { "epoch": 0.22466539196940727, "grad_norm": 2.0622718334198, "learning_rate": 4.49168002285649e-06, "loss": 0.1362, "step": 1175 }, { "epoch": 0.2248565965583174, "grad_norm": 3.3179798126220703, "learning_rate": 4.492220561543467e-06, "loss": 0.4877, "step": 1176 }, { "epoch": 0.22504780114722753, "grad_norm": 2.5217320919036865, "learning_rate": 4.49276064078399e-06, "loss": 0.5047, "step": 1177 }, { "epoch": 0.22523900573613767, "grad_norm": 3.4361255168914795, "learning_rate": 4.493300261358436e-06, "loss": 0.5802, "step": 1178 }, { "epoch": 0.22543021032504781, "grad_norm": 2.50624942779541, "learning_rate": 4.493839424045196e-06, "loss": 0.2938, "step": 1179 }, { "epoch": 0.22562141491395793, "grad_norm": 2.448352575302124, "learning_rate": 4.494378129620678e-06, "loss": 0.1441, "step": 1180 }, { "epoch": 0.22581261950286807, "grad_norm": 1.622539758682251, "learning_rate": 4.494916378859321e-06, "loss": 0.092, "step": 1181 }, { "epoch": 0.22600382409177822, "grad_norm": 2.642896890640259, "learning_rate": 4.495454172533592e-06, "loss": 0.3514, "step": 1182 }, { "epoch": 0.22619502868068833, "grad_norm": 2.202483892440796, "learning_rate": 4.495991511414005e-06, "loss": 0.2871, "step": 1183 }, { "epoch": 0.22638623326959847, "grad_norm": 1.2011011838912964, "learning_rate": 4.496528396269117e-06, "loss": 0.0971, "step": 1184 }, { "epoch": 0.22657743785850862, "grad_norm": 2.026979923248291, "learning_rate": 4.497064827865537e-06, "loss": 0.1101, "step": 1185 }, { "epoch": 0.22676864244741873, "grad_norm": 1.4967538118362427, "learning_rate": 4.49760080696794e-06, "loss": 0.2007, "step": 1186 }, { "epoch": 0.22695984703632888, "grad_norm": 2.381471872329712, "learning_rate": 4.498136334339059e-06, "loss": 0.0943, "step": 1187 }, { "epoch": 0.227151051625239, "grad_norm": 2.089556932449341, "learning_rate": 4.498671410739708e-06, "loss": 0.1357, "step": 1188 }, { "epoch": 0.22734225621414914, "grad_norm": 3.158884048461914, "learning_rate": 4.4992060369287745e-06, "loss": 0.4473, "step": 1189 }, { "epoch": 0.22753346080305928, "grad_norm": 1.9269723892211914, "learning_rate": 4.499740213663238e-06, "loss": 0.2718, "step": 1190 }, { "epoch": 0.2277246653919694, "grad_norm": 3.3276264667510986, "learning_rate": 4.500273941698166e-06, "loss": 0.4174, "step": 1191 }, { "epoch": 0.22791586998087954, "grad_norm": 1.854248285293579, "learning_rate": 4.500807221786725e-06, "loss": 0.1561, "step": 1192 }, { "epoch": 0.22810707456978968, "grad_norm": 5.148835182189941, "learning_rate": 4.50134005468019e-06, "loss": 0.3131, "step": 1193 }, { "epoch": 0.2282982791586998, "grad_norm": 2.1514906883239746, "learning_rate": 4.501872441127943e-06, "loss": 0.1113, "step": 1194 }, { "epoch": 0.22848948374760994, "grad_norm": 2.2153143882751465, "learning_rate": 4.502404381877488e-06, "loss": 0.2921, "step": 1195 }, { "epoch": 0.22868068833652008, "grad_norm": 1.5667790174484253, "learning_rate": 4.502935877674451e-06, "loss": 0.1318, "step": 1196 }, { "epoch": 0.2288718929254302, "grad_norm": 3.2153244018554688, "learning_rate": 4.503466929262589e-06, "loss": 0.5028, "step": 1197 }, { "epoch": 0.22906309751434034, "grad_norm": 1.3133012056350708, "learning_rate": 4.503997537383796e-06, "loss": 0.1566, "step": 1198 }, { "epoch": 0.22925430210325048, "grad_norm": 1.9280524253845215, "learning_rate": 4.50452770277811e-06, "loss": 0.2353, "step": 1199 }, { "epoch": 0.2294455066921606, "grad_norm": 2.680346965789795, "learning_rate": 4.505057426183714e-06, "loss": 0.1869, "step": 1200 }, { "epoch": 0.22963671128107074, "grad_norm": 3.4344375133514404, "learning_rate": 4.505586708336951e-06, "loss": 0.4891, "step": 1201 }, { "epoch": 0.2298279158699809, "grad_norm": 1.4899710416793823, "learning_rate": 4.506115549972324e-06, "loss": 0.1304, "step": 1202 }, { "epoch": 0.230019120458891, "grad_norm": 2.121846914291382, "learning_rate": 4.506643951822501e-06, "loss": 0.1658, "step": 1203 }, { "epoch": 0.23021032504780115, "grad_norm": 2.3687522411346436, "learning_rate": 4.507171914618329e-06, "loss": 0.3938, "step": 1204 }, { "epoch": 0.2304015296367113, "grad_norm": 1.6581428050994873, "learning_rate": 4.5076994390888294e-06, "loss": 0.1058, "step": 1205 }, { "epoch": 0.2305927342256214, "grad_norm": 1.87697434425354, "learning_rate": 4.508226525961212e-06, "loss": 0.1238, "step": 1206 }, { "epoch": 0.23078393881453155, "grad_norm": 2.034977912902832, "learning_rate": 4.508753175960878e-06, "loss": 0.183, "step": 1207 }, { "epoch": 0.2309751434034417, "grad_norm": 2.1185624599456787, "learning_rate": 4.509279389811426e-06, "loss": 0.2888, "step": 1208 }, { "epoch": 0.2311663479923518, "grad_norm": 2.6012215614318848, "learning_rate": 4.509805168234659e-06, "loss": 0.3785, "step": 1209 }, { "epoch": 0.23135755258126195, "grad_norm": 1.3720933198928833, "learning_rate": 4.510330511950588e-06, "loss": 0.1433, "step": 1210 }, { "epoch": 0.2315487571701721, "grad_norm": 1.6616960763931274, "learning_rate": 4.510855421677442e-06, "loss": 0.1277, "step": 1211 }, { "epoch": 0.2317399617590822, "grad_norm": 1.7231650352478027, "learning_rate": 4.511379898131671e-06, "loss": 0.1076, "step": 1212 }, { "epoch": 0.23193116634799235, "grad_norm": 2.2084357738494873, "learning_rate": 4.511903942027949e-06, "loss": 0.2608, "step": 1213 }, { "epoch": 0.2321223709369025, "grad_norm": 2.0652458667755127, "learning_rate": 4.512427554079188e-06, "loss": 0.2228, "step": 1214 }, { "epoch": 0.2323135755258126, "grad_norm": 1.695842981338501, "learning_rate": 4.512950734996536e-06, "loss": 0.1546, "step": 1215 }, { "epoch": 0.23250478011472275, "grad_norm": 3.044466972351074, "learning_rate": 4.513473485489387e-06, "loss": 0.3004, "step": 1216 }, { "epoch": 0.2326959847036329, "grad_norm": 2.2261133193969727, "learning_rate": 4.513995806265384e-06, "loss": 0.2103, "step": 1217 }, { "epoch": 0.232887189292543, "grad_norm": 2.5913712978363037, "learning_rate": 4.51451769803043e-06, "loss": 0.3076, "step": 1218 }, { "epoch": 0.23307839388145316, "grad_norm": 2.4403324127197266, "learning_rate": 4.515039161488684e-06, "loss": 0.1843, "step": 1219 }, { "epoch": 0.2332695984703633, "grad_norm": 3.694639205932617, "learning_rate": 4.51556019734258e-06, "loss": 0.3922, "step": 1220 }, { "epoch": 0.23346080305927341, "grad_norm": 3.163925886154175, "learning_rate": 4.516080806292819e-06, "loss": 0.3171, "step": 1221 }, { "epoch": 0.23365200764818356, "grad_norm": 2.774327039718628, "learning_rate": 4.516600989038385e-06, "loss": 0.3307, "step": 1222 }, { "epoch": 0.2338432122370937, "grad_norm": 1.897996425628662, "learning_rate": 4.517120746276545e-06, "loss": 0.1318, "step": 1223 }, { "epoch": 0.23403441682600382, "grad_norm": 1.8080636262893677, "learning_rate": 4.517640078702858e-06, "loss": 0.089, "step": 1224 }, { "epoch": 0.23422562141491396, "grad_norm": 1.8752037286758423, "learning_rate": 4.5181589870111755e-06, "loss": 0.1162, "step": 1225 }, { "epoch": 0.2344168260038241, "grad_norm": 1.7227205038070679, "learning_rate": 4.518677471893654e-06, "loss": 0.316, "step": 1226 }, { "epoch": 0.23460803059273422, "grad_norm": 2.1087758541107178, "learning_rate": 4.519195534040756e-06, "loss": 0.1585, "step": 1227 }, { "epoch": 0.23479923518164436, "grad_norm": 2.5245354175567627, "learning_rate": 4.519713174141255e-06, "loss": 0.2401, "step": 1228 }, { "epoch": 0.2349904397705545, "grad_norm": 2.966695547103882, "learning_rate": 4.520230392882245e-06, "loss": 0.0818, "step": 1229 }, { "epoch": 0.23518164435946462, "grad_norm": 1.3945024013519287, "learning_rate": 4.520747190949141e-06, "loss": 0.1178, "step": 1230 }, { "epoch": 0.23537284894837476, "grad_norm": 2.4835667610168457, "learning_rate": 4.5212635690256885e-06, "loss": 0.0745, "step": 1231 }, { "epoch": 0.2355640535372849, "grad_norm": 3.491046667098999, "learning_rate": 4.521779527793967e-06, "loss": 0.2478, "step": 1232 }, { "epoch": 0.23575525812619502, "grad_norm": 1.6101934909820557, "learning_rate": 4.522295067934395e-06, "loss": 0.1821, "step": 1233 }, { "epoch": 0.23594646271510517, "grad_norm": 1.2053178548812866, "learning_rate": 4.522810190125739e-06, "loss": 0.1249, "step": 1234 }, { "epoch": 0.2361376673040153, "grad_norm": 2.9630486965179443, "learning_rate": 4.523324895045111e-06, "loss": 0.2589, "step": 1235 }, { "epoch": 0.23632887189292542, "grad_norm": 1.9277501106262207, "learning_rate": 4.523839183367984e-06, "loss": 0.2001, "step": 1236 }, { "epoch": 0.23652007648183557, "grad_norm": 3.066760540008545, "learning_rate": 4.5243530557681885e-06, "loss": 0.4899, "step": 1237 }, { "epoch": 0.2367112810707457, "grad_norm": 1.5749143362045288, "learning_rate": 4.524866512917924e-06, "loss": 0.0899, "step": 1238 }, { "epoch": 0.23690248565965583, "grad_norm": 2.5981080532073975, "learning_rate": 4.525379555487759e-06, "loss": 0.4254, "step": 1239 }, { "epoch": 0.23709369024856597, "grad_norm": 2.3466320037841797, "learning_rate": 4.525892184146643e-06, "loss": 0.3603, "step": 1240 }, { "epoch": 0.2372848948374761, "grad_norm": 1.4082063436508179, "learning_rate": 4.526404399561903e-06, "loss": 0.1948, "step": 1241 }, { "epoch": 0.23747609942638623, "grad_norm": 2.5637047290802, "learning_rate": 4.5269162023992565e-06, "loss": 0.2129, "step": 1242 }, { "epoch": 0.23766730401529637, "grad_norm": 6.297000885009766, "learning_rate": 4.527427593322813e-06, "loss": 0.1063, "step": 1243 }, { "epoch": 0.23785850860420651, "grad_norm": 1.5783936977386475, "learning_rate": 4.527938572995081e-06, "loss": 0.0618, "step": 1244 }, { "epoch": 0.23804971319311663, "grad_norm": 2.314439535140991, "learning_rate": 4.5284491420769685e-06, "loss": 0.2352, "step": 1245 }, { "epoch": 0.23824091778202677, "grad_norm": 2.707080841064453, "learning_rate": 4.528959301227795e-06, "loss": 0.5317, "step": 1246 }, { "epoch": 0.2384321223709369, "grad_norm": 3.3706955909729004, "learning_rate": 4.529469051105292e-06, "loss": 0.2381, "step": 1247 }, { "epoch": 0.23862332695984703, "grad_norm": 2.027712345123291, "learning_rate": 4.5299783923656084e-06, "loss": 0.2689, "step": 1248 }, { "epoch": 0.23881453154875718, "grad_norm": 2.501119375228882, "learning_rate": 4.530487325663318e-06, "loss": 0.1832, "step": 1249 }, { "epoch": 0.2390057361376673, "grad_norm": 1.212647795677185, "learning_rate": 4.530995851651422e-06, "loss": 0.0478, "step": 1250 }, { "epoch": 0.23919694072657743, "grad_norm": 3.2780611515045166, "learning_rate": 4.531503970981357e-06, "loss": 0.5961, "step": 1251 }, { "epoch": 0.23938814531548758, "grad_norm": 2.3319296836853027, "learning_rate": 4.532011684302993e-06, "loss": 0.1756, "step": 1252 }, { "epoch": 0.2395793499043977, "grad_norm": 1.8689370155334473, "learning_rate": 4.532518992264648e-06, "loss": 0.0948, "step": 1253 }, { "epoch": 0.23977055449330784, "grad_norm": 2.4608871936798096, "learning_rate": 4.5330258955130894e-06, "loss": 0.1658, "step": 1254 }, { "epoch": 0.23996175908221798, "grad_norm": 3.9478836059570312, "learning_rate": 4.533532394693532e-06, "loss": 0.3201, "step": 1255 }, { "epoch": 0.2401529636711281, "grad_norm": 1.9749811887741089, "learning_rate": 4.534038490449656e-06, "loss": 0.0873, "step": 1256 }, { "epoch": 0.24034416826003824, "grad_norm": 1.7174595594406128, "learning_rate": 4.534544183423599e-06, "loss": 0.1389, "step": 1257 }, { "epoch": 0.24053537284894838, "grad_norm": 1.479738712310791, "learning_rate": 4.5350494742559694e-06, "loss": 0.2478, "step": 1258 }, { "epoch": 0.2407265774378585, "grad_norm": 1.9513295888900757, "learning_rate": 4.535554363585849e-06, "loss": 0.2366, "step": 1259 }, { "epoch": 0.24091778202676864, "grad_norm": 1.9584410190582275, "learning_rate": 4.536058852050796e-06, "loss": 0.1392, "step": 1260 }, { "epoch": 0.24110898661567878, "grad_norm": 1.891797423362732, "learning_rate": 4.53656294028685e-06, "loss": 0.0977, "step": 1261 }, { "epoch": 0.2413001912045889, "grad_norm": 1.9594978094100952, "learning_rate": 4.537066628928541e-06, "loss": 0.1411, "step": 1262 }, { "epoch": 0.24149139579349904, "grad_norm": 2.9643545150756836, "learning_rate": 4.537569918608891e-06, "loss": 0.2389, "step": 1263 }, { "epoch": 0.24168260038240919, "grad_norm": 1.6669223308563232, "learning_rate": 4.538072809959417e-06, "loss": 0.1562, "step": 1264 }, { "epoch": 0.2418738049713193, "grad_norm": 2.446153402328491, "learning_rate": 4.538575303610137e-06, "loss": 0.1492, "step": 1265 }, { "epoch": 0.24206500956022944, "grad_norm": 1.535170555114746, "learning_rate": 4.539077400189579e-06, "loss": 0.0918, "step": 1266 }, { "epoch": 0.2422562141491396, "grad_norm": 1.8623381853103638, "learning_rate": 4.53957910032478e-06, "loss": 0.107, "step": 1267 }, { "epoch": 0.2424474187380497, "grad_norm": 2.7198643684387207, "learning_rate": 4.5400804046412925e-06, "loss": 0.256, "step": 1268 }, { "epoch": 0.24263862332695985, "grad_norm": 2.15179443359375, "learning_rate": 4.540581313763191e-06, "loss": 0.1535, "step": 1269 }, { "epoch": 0.24282982791587, "grad_norm": 2.4457833766937256, "learning_rate": 4.541081828313074e-06, "loss": 0.4226, "step": 1270 }, { "epoch": 0.2430210325047801, "grad_norm": 2.3401358127593994, "learning_rate": 4.54158194891207e-06, "loss": 0.1621, "step": 1271 }, { "epoch": 0.24321223709369025, "grad_norm": 2.9228503704071045, "learning_rate": 4.542081676179842e-06, "loss": 0.1591, "step": 1272 }, { "epoch": 0.2434034416826004, "grad_norm": 1.8721615076065063, "learning_rate": 4.542581010734594e-06, "loss": 0.2154, "step": 1273 }, { "epoch": 0.2435946462715105, "grad_norm": 1.5572518110275269, "learning_rate": 4.54307995319307e-06, "loss": 0.084, "step": 1274 }, { "epoch": 0.24378585086042065, "grad_norm": 2.8869805335998535, "learning_rate": 4.543578504170567e-06, "loss": 0.1584, "step": 1275 }, { "epoch": 0.2439770554493308, "grad_norm": 2.540313243865967, "learning_rate": 4.544076664280929e-06, "loss": 0.2337, "step": 1276 }, { "epoch": 0.2441682600382409, "grad_norm": 2.9700610637664795, "learning_rate": 4.544574434136564e-06, "loss": 0.2049, "step": 1277 }, { "epoch": 0.24435946462715105, "grad_norm": 2.179842710494995, "learning_rate": 4.545071814348435e-06, "loss": 0.3845, "step": 1278 }, { "epoch": 0.2445506692160612, "grad_norm": 2.47133731842041, "learning_rate": 4.5455688055260765e-06, "loss": 0.2824, "step": 1279 }, { "epoch": 0.2447418738049713, "grad_norm": 1.5787922143936157, "learning_rate": 4.5460654082775925e-06, "loss": 0.079, "step": 1280 }, { "epoch": 0.24493307839388145, "grad_norm": 2.6020150184631348, "learning_rate": 4.546561623209661e-06, "loss": 0.1166, "step": 1281 }, { "epoch": 0.2451242829827916, "grad_norm": 5.912609100341797, "learning_rate": 4.547057450927541e-06, "loss": 0.6601, "step": 1282 }, { "epoch": 0.2453154875717017, "grad_norm": 3.048389434814453, "learning_rate": 4.547552892035077e-06, "loss": 0.5557, "step": 1283 }, { "epoch": 0.24550669216061186, "grad_norm": 2.1050589084625244, "learning_rate": 4.548047947134698e-06, "loss": 0.1083, "step": 1284 }, { "epoch": 0.245697896749522, "grad_norm": 2.0243983268737793, "learning_rate": 4.5485426168274285e-06, "loss": 0.1569, "step": 1285 }, { "epoch": 0.24588910133843211, "grad_norm": 1.9739599227905273, "learning_rate": 4.549036901712892e-06, "loss": 0.1624, "step": 1286 }, { "epoch": 0.24608030592734226, "grad_norm": 1.8565188646316528, "learning_rate": 4.549530802389311e-06, "loss": 0.0998, "step": 1287 }, { "epoch": 0.2462715105162524, "grad_norm": 2.806781053543091, "learning_rate": 4.550024319453516e-06, "loss": 0.3977, "step": 1288 }, { "epoch": 0.24646271510516252, "grad_norm": 2.98370623588562, "learning_rate": 4.550517453500946e-06, "loss": 0.4309, "step": 1289 }, { "epoch": 0.24665391969407266, "grad_norm": 2.2158684730529785, "learning_rate": 4.551010205125657e-06, "loss": 0.3051, "step": 1290 }, { "epoch": 0.2468451242829828, "grad_norm": 2.2170090675354004, "learning_rate": 4.551502574920322e-06, "loss": 0.2447, "step": 1291 }, { "epoch": 0.24703632887189292, "grad_norm": 2.093726634979248, "learning_rate": 4.551994563476239e-06, "loss": 0.1615, "step": 1292 }, { "epoch": 0.24722753346080306, "grad_norm": 0.953401505947113, "learning_rate": 4.5524861713833315e-06, "loss": 0.0517, "step": 1293 }, { "epoch": 0.2474187380497132, "grad_norm": 2.464068651199341, "learning_rate": 4.552977399230156e-06, "loss": 0.1319, "step": 1294 }, { "epoch": 0.24760994263862332, "grad_norm": 1.8242707252502441, "learning_rate": 4.553468247603907e-06, "loss": 0.1472, "step": 1295 }, { "epoch": 0.24780114722753346, "grad_norm": 1.8389755487442017, "learning_rate": 4.553958717090414e-06, "loss": 0.2597, "step": 1296 }, { "epoch": 0.2479923518164436, "grad_norm": 2.3550262451171875, "learning_rate": 4.554448808274157e-06, "loss": 0.1745, "step": 1297 }, { "epoch": 0.24818355640535372, "grad_norm": 1.627742886543274, "learning_rate": 4.554938521738259e-06, "loss": 0.0905, "step": 1298 }, { "epoch": 0.24837476099426387, "grad_norm": 3.0560617446899414, "learning_rate": 4.555427858064501e-06, "loss": 0.5314, "step": 1299 }, { "epoch": 0.248565965583174, "grad_norm": 2.0782976150512695, "learning_rate": 4.555916817833317e-06, "loss": 0.0902, "step": 1300 }, { "epoch": 0.24875717017208412, "grad_norm": 2.113381862640381, "learning_rate": 4.556405401623804e-06, "loss": 0.4115, "step": 1301 }, { "epoch": 0.24894837476099427, "grad_norm": 1.6574503183364868, "learning_rate": 4.5568936100137235e-06, "loss": 0.1394, "step": 1302 }, { "epoch": 0.2491395793499044, "grad_norm": 2.0017340183258057, "learning_rate": 4.557381443579506e-06, "loss": 0.1937, "step": 1303 }, { "epoch": 0.24933078393881453, "grad_norm": 2.2529678344726562, "learning_rate": 4.5578689028962575e-06, "loss": 0.2572, "step": 1304 }, { "epoch": 0.24952198852772467, "grad_norm": 1.7279530763626099, "learning_rate": 4.558355988537758e-06, "loss": 0.0845, "step": 1305 }, { "epoch": 0.2497131931166348, "grad_norm": 3.168016195297241, "learning_rate": 4.558842701076469e-06, "loss": 0.174, "step": 1306 }, { "epoch": 0.24990439770554493, "grad_norm": 2.5194199085235596, "learning_rate": 4.559329041083543e-06, "loss": 0.3312, "step": 1307 }, { "epoch": 0.25009560229445504, "grad_norm": 3.844399929046631, "learning_rate": 4.5598150091288164e-06, "loss": 0.3948, "step": 1308 }, { "epoch": 0.2502868068833652, "grad_norm": 1.3260141611099243, "learning_rate": 4.560300605780819e-06, "loss": 0.0967, "step": 1309 }, { "epoch": 0.25047801147227533, "grad_norm": 1.3077999353408813, "learning_rate": 4.5607858316067835e-06, "loss": 0.1281, "step": 1310 }, { "epoch": 0.2506692160611855, "grad_norm": 2.026874542236328, "learning_rate": 4.561270687172638e-06, "loss": 0.2637, "step": 1311 }, { "epoch": 0.2508604206500956, "grad_norm": 1.997843861579895, "learning_rate": 4.561755173043019e-06, "loss": 0.1711, "step": 1312 }, { "epoch": 0.25105162523900576, "grad_norm": 1.434065341949463, "learning_rate": 4.562239289781273e-06, "loss": 0.1526, "step": 1313 }, { "epoch": 0.25124282982791585, "grad_norm": 2.5661134719848633, "learning_rate": 4.5627230379494595e-06, "loss": 0.4677, "step": 1314 }, { "epoch": 0.251434034416826, "grad_norm": 2.9416589736938477, "learning_rate": 4.5632064181083524e-06, "loss": 0.337, "step": 1315 }, { "epoch": 0.25162523900573613, "grad_norm": 2.0680315494537354, "learning_rate": 4.56368943081745e-06, "loss": 0.2415, "step": 1316 }, { "epoch": 0.2518164435946463, "grad_norm": 1.4894253015518188, "learning_rate": 4.564172076634976e-06, "loss": 0.2191, "step": 1317 }, { "epoch": 0.2520076481835564, "grad_norm": 2.523341655731201, "learning_rate": 4.56465435611788e-06, "loss": 0.1231, "step": 1318 }, { "epoch": 0.25219885277246656, "grad_norm": 2.880688190460205, "learning_rate": 4.5651362698218455e-06, "loss": 0.2017, "step": 1319 }, { "epoch": 0.25239005736137665, "grad_norm": 2.6411757469177246, "learning_rate": 4.565617818301295e-06, "loss": 0.5086, "step": 1320 }, { "epoch": 0.2525812619502868, "grad_norm": 2.0425150394439697, "learning_rate": 4.566099002109388e-06, "loss": 0.2197, "step": 1321 }, { "epoch": 0.25277246653919694, "grad_norm": 2.203280448913574, "learning_rate": 4.56657982179803e-06, "loss": 0.2555, "step": 1322 }, { "epoch": 0.2529636711281071, "grad_norm": 1.5817688703536987, "learning_rate": 4.567060277917876e-06, "loss": 0.107, "step": 1323 }, { "epoch": 0.2531548757170172, "grad_norm": 2.743864059448242, "learning_rate": 4.567540371018329e-06, "loss": 0.1049, "step": 1324 }, { "epoch": 0.25334608030592737, "grad_norm": 1.1663135290145874, "learning_rate": 4.568020101647551e-06, "loss": 0.0449, "step": 1325 }, { "epoch": 0.25353728489483746, "grad_norm": 2.348593235015869, "learning_rate": 4.568499470352461e-06, "loss": 0.4336, "step": 1326 }, { "epoch": 0.2537284894837476, "grad_norm": 2.174994468688965, "learning_rate": 4.568978477678743e-06, "loss": 0.4255, "step": 1327 }, { "epoch": 0.25391969407265774, "grad_norm": 2.499854564666748, "learning_rate": 4.5694571241708465e-06, "loss": 0.1284, "step": 1328 }, { "epoch": 0.2541108986615679, "grad_norm": 1.8215583562850952, "learning_rate": 4.5699354103719936e-06, "loss": 0.1327, "step": 1329 }, { "epoch": 0.25430210325047803, "grad_norm": 2.302138328552246, "learning_rate": 4.570413336824176e-06, "loss": 0.0985, "step": 1330 }, { "epoch": 0.25449330783938817, "grad_norm": 2.055473804473877, "learning_rate": 4.570890904068169e-06, "loss": 0.0946, "step": 1331 }, { "epoch": 0.25468451242829826, "grad_norm": 1.9759926795959473, "learning_rate": 4.571368112643526e-06, "loss": 0.1447, "step": 1332 }, { "epoch": 0.2548757170172084, "grad_norm": 2.6056742668151855, "learning_rate": 4.571844963088587e-06, "loss": 0.4277, "step": 1333 }, { "epoch": 0.25506692160611855, "grad_norm": 1.6580945253372192, "learning_rate": 4.572321455940478e-06, "loss": 0.3823, "step": 1334 }, { "epoch": 0.2552581261950287, "grad_norm": 3.8294076919555664, "learning_rate": 4.572797591735123e-06, "loss": 0.0972, "step": 1335 }, { "epoch": 0.25544933078393883, "grad_norm": 1.2979289293289185, "learning_rate": 4.573273371007238e-06, "loss": 0.0785, "step": 1336 }, { "epoch": 0.255640535372849, "grad_norm": 1.569770097732544, "learning_rate": 4.573748794290339e-06, "loss": 0.1083, "step": 1337 }, { "epoch": 0.25583173996175906, "grad_norm": 1.5974154472351074, "learning_rate": 4.574223862116746e-06, "loss": 0.1379, "step": 1338 }, { "epoch": 0.2560229445506692, "grad_norm": 2.4823997020721436, "learning_rate": 4.574698575017587e-06, "loss": 0.3444, "step": 1339 }, { "epoch": 0.25621414913957935, "grad_norm": 1.7080856561660767, "learning_rate": 4.5751729335227995e-06, "loss": 0.1214, "step": 1340 }, { "epoch": 0.2564053537284895, "grad_norm": 2.0777196884155273, "learning_rate": 4.575646938161135e-06, "loss": 0.1628, "step": 1341 }, { "epoch": 0.25659655831739964, "grad_norm": 1.514445424079895, "learning_rate": 4.576120589460161e-06, "loss": 0.1017, "step": 1342 }, { "epoch": 0.2567877629063097, "grad_norm": 3.241065502166748, "learning_rate": 4.576593887946269e-06, "loss": 0.1865, "step": 1343 }, { "epoch": 0.25697896749521987, "grad_norm": 1.9930219650268555, "learning_rate": 4.577066834144674e-06, "loss": 0.2197, "step": 1344 }, { "epoch": 0.25717017208413, "grad_norm": 2.6061015129089355, "learning_rate": 4.577539428579417e-06, "loss": 0.4761, "step": 1345 }, { "epoch": 0.25736137667304015, "grad_norm": 1.4190788269042969, "learning_rate": 4.578011671773374e-06, "loss": 0.1455, "step": 1346 }, { "epoch": 0.2575525812619503, "grad_norm": 1.4043774604797363, "learning_rate": 4.578483564248254e-06, "loss": 0.0912, "step": 1347 }, { "epoch": 0.25774378585086044, "grad_norm": 2.920255184173584, "learning_rate": 4.578955106524605e-06, "loss": 0.1808, "step": 1348 }, { "epoch": 0.25793499043977053, "grad_norm": 2.587714672088623, "learning_rate": 4.5794262991218156e-06, "loss": 0.1429, "step": 1349 }, { "epoch": 0.25812619502868067, "grad_norm": 2.5600852966308594, "learning_rate": 4.5798971425581235e-06, "loss": 0.2295, "step": 1350 }, { "epoch": 0.2583173996175908, "grad_norm": 1.5082076787948608, "learning_rate": 4.580367637350609e-06, "loss": 0.174, "step": 1351 }, { "epoch": 0.25850860420650096, "grad_norm": 1.4872022867202759, "learning_rate": 4.580837784015212e-06, "loss": 0.1193, "step": 1352 }, { "epoch": 0.2586998087954111, "grad_norm": 3.185533285140991, "learning_rate": 4.581307583066722e-06, "loss": 0.1592, "step": 1353 }, { "epoch": 0.25889101338432124, "grad_norm": 2.905579090118408, "learning_rate": 4.58177703501879e-06, "loss": 0.1793, "step": 1354 }, { "epoch": 0.25908221797323133, "grad_norm": 1.6768579483032227, "learning_rate": 4.58224614038393e-06, "loss": 0.0814, "step": 1355 }, { "epoch": 0.2592734225621415, "grad_norm": 4.425497055053711, "learning_rate": 4.58271489967352e-06, "loss": 0.2171, "step": 1356 }, { "epoch": 0.2594646271510516, "grad_norm": 2.09071946144104, "learning_rate": 4.5831833133978085e-06, "loss": 0.1503, "step": 1357 }, { "epoch": 0.25965583173996176, "grad_norm": 2.006558895111084, "learning_rate": 4.583651382065915e-06, "loss": 0.4281, "step": 1358 }, { "epoch": 0.2598470363288719, "grad_norm": 1.6992747783660889, "learning_rate": 4.584119106185835e-06, "loss": 0.1838, "step": 1359 }, { "epoch": 0.26003824091778205, "grad_norm": 2.895442485809326, "learning_rate": 4.584586486264445e-06, "loss": 0.5134, "step": 1360 }, { "epoch": 0.26022944550669214, "grad_norm": 3.2242867946624756, "learning_rate": 4.585053522807501e-06, "loss": 0.1576, "step": 1361 }, { "epoch": 0.2604206500956023, "grad_norm": 3.0295143127441406, "learning_rate": 4.5855202163196466e-06, "loss": 0.3062, "step": 1362 }, { "epoch": 0.2606118546845124, "grad_norm": 1.9487686157226562, "learning_rate": 4.585986567304413e-06, "loss": 0.2205, "step": 1363 }, { "epoch": 0.26080305927342257, "grad_norm": 2.4985392093658447, "learning_rate": 4.586452576264223e-06, "loss": 0.3626, "step": 1364 }, { "epoch": 0.2609942638623327, "grad_norm": 3.514467716217041, "learning_rate": 4.586918243700398e-06, "loss": 0.365, "step": 1365 }, { "epoch": 0.26118546845124285, "grad_norm": 2.703725576400757, "learning_rate": 4.587383570113155e-06, "loss": 0.4229, "step": 1366 }, { "epoch": 0.26137667304015294, "grad_norm": 2.9062862396240234, "learning_rate": 4.587848556001613e-06, "loss": 0.2676, "step": 1367 }, { "epoch": 0.2615678776290631, "grad_norm": 1.557067632675171, "learning_rate": 4.588313201863795e-06, "loss": 0.092, "step": 1368 }, { "epoch": 0.2617590822179732, "grad_norm": 2.868450880050659, "learning_rate": 4.588777508196637e-06, "loss": 0.2177, "step": 1369 }, { "epoch": 0.26195028680688337, "grad_norm": 2.503908157348633, "learning_rate": 4.589241475495983e-06, "loss": 0.3252, "step": 1370 }, { "epoch": 0.2621414913957935, "grad_norm": 3.2766878604888916, "learning_rate": 4.58970510425659e-06, "loss": 0.5061, "step": 1371 }, { "epoch": 0.26233269598470366, "grad_norm": 1.436415672302246, "learning_rate": 4.5901683949721355e-06, "loss": 0.113, "step": 1372 }, { "epoch": 0.26252390057361374, "grad_norm": 1.4718507528305054, "learning_rate": 4.590631348135217e-06, "loss": 0.2098, "step": 1373 }, { "epoch": 0.2627151051625239, "grad_norm": 1.3220497369766235, "learning_rate": 4.591093964237357e-06, "loss": 0.0534, "step": 1374 }, { "epoch": 0.26290630975143403, "grad_norm": 2.3449716567993164, "learning_rate": 4.591556243769003e-06, "loss": 0.1064, "step": 1375 }, { "epoch": 0.2630975143403442, "grad_norm": 2.357490301132202, "learning_rate": 4.592018187219536e-06, "loss": 0.5121, "step": 1376 }, { "epoch": 0.2632887189292543, "grad_norm": 2.1978540420532227, "learning_rate": 4.5924797950772665e-06, "loss": 0.2471, "step": 1377 }, { "epoch": 0.26347992351816446, "grad_norm": 2.617004871368408, "learning_rate": 4.592941067829446e-06, "loss": 0.3932, "step": 1378 }, { "epoch": 0.26367112810707455, "grad_norm": 2.3759875297546387, "learning_rate": 4.593402005962261e-06, "loss": 0.3458, "step": 1379 }, { "epoch": 0.2638623326959847, "grad_norm": 1.166843056678772, "learning_rate": 4.593862609960843e-06, "loss": 0.0701, "step": 1380 }, { "epoch": 0.26405353728489483, "grad_norm": 2.4391300678253174, "learning_rate": 4.594322880309272e-06, "loss": 0.1061, "step": 1381 }, { "epoch": 0.264244741873805, "grad_norm": 1.7300337553024292, "learning_rate": 4.594782817490571e-06, "loss": 0.1513, "step": 1382 }, { "epoch": 0.2644359464627151, "grad_norm": 2.3044793605804443, "learning_rate": 4.595242421986719e-06, "loss": 0.2708, "step": 1383 }, { "epoch": 0.26462715105162526, "grad_norm": 1.8326455354690552, "learning_rate": 4.595701694278649e-06, "loss": 0.253, "step": 1384 }, { "epoch": 0.26481835564053535, "grad_norm": 2.248283863067627, "learning_rate": 4.5961606348462506e-06, "loss": 0.2861, "step": 1385 }, { "epoch": 0.2650095602294455, "grad_norm": 3.2349724769592285, "learning_rate": 4.596619244168376e-06, "loss": 0.2644, "step": 1386 }, { "epoch": 0.26520076481835564, "grad_norm": 3.4575836658477783, "learning_rate": 4.59707752272284e-06, "loss": 0.1486, "step": 1387 }, { "epoch": 0.2653919694072658, "grad_norm": 1.597733497619629, "learning_rate": 4.597535470986426e-06, "loss": 0.1201, "step": 1388 }, { "epoch": 0.2655831739961759, "grad_norm": 2.6269469261169434, "learning_rate": 4.597993089434886e-06, "loss": 0.18, "step": 1389 }, { "epoch": 0.26577437858508607, "grad_norm": 2.98217511177063, "learning_rate": 4.598450378542943e-06, "loss": 0.3596, "step": 1390 }, { "epoch": 0.26596558317399616, "grad_norm": 1.9663608074188232, "learning_rate": 4.5989073387843e-06, "loss": 0.2026, "step": 1391 }, { "epoch": 0.2661567877629063, "grad_norm": 1.8245049715042114, "learning_rate": 4.599363970631637e-06, "loss": 0.1057, "step": 1392 }, { "epoch": 0.26634799235181644, "grad_norm": 2.081806182861328, "learning_rate": 4.599820274556611e-06, "loss": 0.1292, "step": 1393 }, { "epoch": 0.2665391969407266, "grad_norm": 2.341907262802124, "learning_rate": 4.6002762510298725e-06, "loss": 0.159, "step": 1394 }, { "epoch": 0.26673040152963673, "grad_norm": 4.16287899017334, "learning_rate": 4.600731900521051e-06, "loss": 0.5912, "step": 1395 }, { "epoch": 0.2669216061185469, "grad_norm": 2.3653371334075928, "learning_rate": 4.601187223498774e-06, "loss": 0.1569, "step": 1396 }, { "epoch": 0.26711281070745696, "grad_norm": 1.7293826341629028, "learning_rate": 4.601642220430655e-06, "loss": 0.1551, "step": 1397 }, { "epoch": 0.2673040152963671, "grad_norm": 2.207956314086914, "learning_rate": 4.602096891783308e-06, "loss": 0.1525, "step": 1398 }, { "epoch": 0.26749521988527725, "grad_norm": 2.079894542694092, "learning_rate": 4.6025512380223466e-06, "loss": 0.2216, "step": 1399 }, { "epoch": 0.2676864244741874, "grad_norm": 1.731223702430725, "learning_rate": 4.603005259612382e-06, "loss": 0.1097, "step": 1400 }, { "epoch": 0.26787762906309753, "grad_norm": 2.4742074012756348, "learning_rate": 4.603458957017036e-06, "loss": 0.3192, "step": 1401 }, { "epoch": 0.2680688336520076, "grad_norm": 2.1221306324005127, "learning_rate": 4.603912330698932e-06, "loss": 0.2922, "step": 1402 }, { "epoch": 0.26826003824091776, "grad_norm": 3.109562873840332, "learning_rate": 4.60436538111971e-06, "loss": 0.3177, "step": 1403 }, { "epoch": 0.2684512428298279, "grad_norm": 2.970853567123413, "learning_rate": 4.6048181087400175e-06, "loss": 0.3191, "step": 1404 }, { "epoch": 0.26864244741873805, "grad_norm": 1.673386812210083, "learning_rate": 4.605270514019522e-06, "loss": 0.0833, "step": 1405 }, { "epoch": 0.2688336520076482, "grad_norm": 2.224393606185913, "learning_rate": 4.605722597416907e-06, "loss": 0.2605, "step": 1406 }, { "epoch": 0.26902485659655834, "grad_norm": 1.8234128952026367, "learning_rate": 4.6061743593898815e-06, "loss": 0.1346, "step": 1407 }, { "epoch": 0.2692160611854684, "grad_norm": 1.949435830116272, "learning_rate": 4.6066258003951735e-06, "loss": 0.157, "step": 1408 }, { "epoch": 0.26940726577437857, "grad_norm": 1.833545446395874, "learning_rate": 4.607076920888543e-06, "loss": 0.1708, "step": 1409 }, { "epoch": 0.2695984703632887, "grad_norm": 2.0920236110687256, "learning_rate": 4.607527721324779e-06, "loss": 0.32, "step": 1410 }, { "epoch": 0.26978967495219885, "grad_norm": 1.772287368774414, "learning_rate": 4.6079782021577e-06, "loss": 0.1181, "step": 1411 }, { "epoch": 0.269980879541109, "grad_norm": 3.1658973693847656, "learning_rate": 4.608428363840164e-06, "loss": 0.1821, "step": 1412 }, { "epoch": 0.27017208413001914, "grad_norm": 1.7124018669128418, "learning_rate": 4.608878206824065e-06, "loss": 0.0898, "step": 1413 }, { "epoch": 0.27036328871892923, "grad_norm": 4.624269008636475, "learning_rate": 4.6093277315603385e-06, "loss": 0.461, "step": 1414 }, { "epoch": 0.27055449330783937, "grad_norm": 3.7334680557250977, "learning_rate": 4.609776938498964e-06, "loss": 0.2932, "step": 1415 }, { "epoch": 0.2707456978967495, "grad_norm": 2.483091354370117, "learning_rate": 4.610225828088966e-06, "loss": 0.2618, "step": 1416 }, { "epoch": 0.27093690248565966, "grad_norm": 2.0719807147979736, "learning_rate": 4.610674400778419e-06, "loss": 0.256, "step": 1417 }, { "epoch": 0.2711281070745698, "grad_norm": 2.7514090538024902, "learning_rate": 4.6111226570144505e-06, "loss": 0.2337, "step": 1418 }, { "epoch": 0.27131931166347995, "grad_norm": 4.372493267059326, "learning_rate": 4.611570597243238e-06, "loss": 0.348, "step": 1419 }, { "epoch": 0.27151051625239003, "grad_norm": 3.5149691104888916, "learning_rate": 4.6120182219100225e-06, "loss": 0.6993, "step": 1420 }, { "epoch": 0.2717017208413002, "grad_norm": 2.984778881072998, "learning_rate": 4.612465531459098e-06, "loss": 0.2772, "step": 1421 }, { "epoch": 0.2718929254302103, "grad_norm": 2.6740102767944336, "learning_rate": 4.612912526333825e-06, "loss": 0.3069, "step": 1422 }, { "epoch": 0.27208413001912046, "grad_norm": 1.7484631538391113, "learning_rate": 4.613359206976629e-06, "loss": 0.0702, "step": 1423 }, { "epoch": 0.2722753346080306, "grad_norm": 2.2864737510681152, "learning_rate": 4.613805573829002e-06, "loss": 0.1678, "step": 1424 }, { "epoch": 0.27246653919694075, "grad_norm": 1.9719682931900024, "learning_rate": 4.614251627331505e-06, "loss": 0.1079, "step": 1425 }, { "epoch": 0.27265774378585084, "grad_norm": 3.1089677810668945, "learning_rate": 4.614697367923773e-06, "loss": 0.4868, "step": 1426 }, { "epoch": 0.272848948374761, "grad_norm": 3.177788734436035, "learning_rate": 4.615142796044517e-06, "loss": 0.1517, "step": 1427 }, { "epoch": 0.2730401529636711, "grad_norm": 1.7205730676651, "learning_rate": 4.615587912131526e-06, "loss": 0.1392, "step": 1428 }, { "epoch": 0.27323135755258127, "grad_norm": 2.4936370849609375, "learning_rate": 4.61603271662167e-06, "loss": 0.2818, "step": 1429 }, { "epoch": 0.2734225621414914, "grad_norm": 2.6876871585845947, "learning_rate": 4.616477209950898e-06, "loss": 0.2261, "step": 1430 }, { "epoch": 0.27361376673040155, "grad_norm": 2.5818753242492676, "learning_rate": 4.616921392554251e-06, "loss": 0.1231, "step": 1431 }, { "epoch": 0.27380497131931164, "grad_norm": 2.133584499359131, "learning_rate": 4.617365264865855e-06, "loss": 0.2894, "step": 1432 }, { "epoch": 0.2739961759082218, "grad_norm": 3.4310872554779053, "learning_rate": 4.6178088273189265e-06, "loss": 0.7312, "step": 1433 }, { "epoch": 0.2741873804971319, "grad_norm": 1.8406034708023071, "learning_rate": 4.618252080345775e-06, "loss": 0.1771, "step": 1434 }, { "epoch": 0.27437858508604207, "grad_norm": 1.9556293487548828, "learning_rate": 4.61869502437781e-06, "loss": 0.1997, "step": 1435 }, { "epoch": 0.2745697896749522, "grad_norm": 2.400635242462158, "learning_rate": 4.619137659845533e-06, "loss": 0.1104, "step": 1436 }, { "epoch": 0.27476099426386236, "grad_norm": 3.1298184394836426, "learning_rate": 4.619579987178551e-06, "loss": 0.2818, "step": 1437 }, { "epoch": 0.27495219885277244, "grad_norm": 3.0977323055267334, "learning_rate": 4.620022006805574e-06, "loss": 0.5284, "step": 1438 }, { "epoch": 0.2751434034416826, "grad_norm": 1.5940463542938232, "learning_rate": 4.620463719154416e-06, "loss": 0.2201, "step": 1439 }, { "epoch": 0.27533460803059273, "grad_norm": 2.438342809677124, "learning_rate": 4.620905124652002e-06, "loss": 0.2274, "step": 1440 }, { "epoch": 0.2755258126195029, "grad_norm": 1.9232128858566284, "learning_rate": 4.6213462237243646e-06, "loss": 0.1143, "step": 1441 }, { "epoch": 0.275717017208413, "grad_norm": 2.3613779544830322, "learning_rate": 4.621787016796653e-06, "loss": 0.1331, "step": 1442 }, { "epoch": 0.27590822179732316, "grad_norm": 1.824241280555725, "learning_rate": 4.6222275042931295e-06, "loss": 0.086, "step": 1443 }, { "epoch": 0.27609942638623325, "grad_norm": 2.2183074951171875, "learning_rate": 4.622667686637177e-06, "loss": 0.1208, "step": 1444 }, { "epoch": 0.2762906309751434, "grad_norm": 2.0111348628997803, "learning_rate": 4.623107564251298e-06, "loss": 0.1653, "step": 1445 }, { "epoch": 0.27648183556405354, "grad_norm": 2.0708916187286377, "learning_rate": 4.623547137557118e-06, "loss": 0.1958, "step": 1446 }, { "epoch": 0.2766730401529637, "grad_norm": 3.2641139030456543, "learning_rate": 4.623986406975387e-06, "loss": 0.2679, "step": 1447 }, { "epoch": 0.2768642447418738, "grad_norm": 1.6562097072601318, "learning_rate": 4.624425372925986e-06, "loss": 0.1593, "step": 1448 }, { "epoch": 0.27705544933078396, "grad_norm": 3.0824685096740723, "learning_rate": 4.624864035827925e-06, "loss": 0.1947, "step": 1449 }, { "epoch": 0.27724665391969405, "grad_norm": 2.1108744144439697, "learning_rate": 4.6253023960993445e-06, "loss": 0.0826, "step": 1450 }, { "epoch": 0.2774378585086042, "grad_norm": 1.9569447040557861, "learning_rate": 4.625740454157524e-06, "loss": 0.2948, "step": 1451 }, { "epoch": 0.27762906309751434, "grad_norm": 2.475095272064209, "learning_rate": 4.626178210418876e-06, "loss": 0.2801, "step": 1452 }, { "epoch": 0.2778202676864245, "grad_norm": 1.949391484260559, "learning_rate": 4.626615665298957e-06, "loss": 0.2627, "step": 1453 }, { "epoch": 0.2780114722753346, "grad_norm": 3.6278679370880127, "learning_rate": 4.627052819212466e-06, "loss": 0.3923, "step": 1454 }, { "epoch": 0.27820267686424477, "grad_norm": 1.5531980991363525, "learning_rate": 4.627489672573243e-06, "loss": 0.0759, "step": 1455 }, { "epoch": 0.27839388145315486, "grad_norm": 2.221863269805908, "learning_rate": 4.627926225794277e-06, "loss": 0.1375, "step": 1456 }, { "epoch": 0.278585086042065, "grad_norm": 3.4804158210754395, "learning_rate": 4.628362479287708e-06, "loss": 0.31, "step": 1457 }, { "epoch": 0.27877629063097514, "grad_norm": 2.319563627243042, "learning_rate": 4.628798433464823e-06, "loss": 0.3307, "step": 1458 }, { "epoch": 0.2789674952198853, "grad_norm": 2.1878955364227295, "learning_rate": 4.62923408873607e-06, "loss": 0.1792, "step": 1459 }, { "epoch": 0.27915869980879543, "grad_norm": 1.8384076356887817, "learning_rate": 4.629669445511046e-06, "loss": 0.1425, "step": 1460 }, { "epoch": 0.2793499043977055, "grad_norm": 3.533768892288208, "learning_rate": 4.630104504198513e-06, "loss": 0.2667, "step": 1461 }, { "epoch": 0.27954110898661566, "grad_norm": 2.315150499343872, "learning_rate": 4.6305392652063885e-06, "loss": 0.1398, "step": 1462 }, { "epoch": 0.2797323135755258, "grad_norm": 3.2896182537078857, "learning_rate": 4.630973728941758e-06, "loss": 0.3448, "step": 1463 }, { "epoch": 0.27992351816443595, "grad_norm": 2.074253797531128, "learning_rate": 4.631407895810868e-06, "loss": 0.2655, "step": 1464 }, { "epoch": 0.2801147227533461, "grad_norm": 2.220587730407715, "learning_rate": 4.631841766219136e-06, "loss": 0.4133, "step": 1465 }, { "epoch": 0.28030592734225623, "grad_norm": 2.7555809020996094, "learning_rate": 4.63227534057115e-06, "loss": 0.3825, "step": 1466 }, { "epoch": 0.2804971319311663, "grad_norm": 1.95787513256073, "learning_rate": 4.6327086192706666e-06, "loss": 0.1425, "step": 1467 }, { "epoch": 0.28068833652007646, "grad_norm": 2.1310410499572754, "learning_rate": 4.633141602720621e-06, "loss": 0.1743, "step": 1468 }, { "epoch": 0.2808795411089866, "grad_norm": 1.3947631120681763, "learning_rate": 4.633574291323124e-06, "loss": 0.0875, "step": 1469 }, { "epoch": 0.28107074569789675, "grad_norm": 3.2543869018554688, "learning_rate": 4.6340066854794634e-06, "loss": 0.5823, "step": 1470 }, { "epoch": 0.2812619502868069, "grad_norm": 2.7726821899414062, "learning_rate": 4.634438785590112e-06, "loss": 0.4261, "step": 1471 }, { "epoch": 0.28145315487571704, "grad_norm": 1.8597933053970337, "learning_rate": 4.634870592054722e-06, "loss": 0.3469, "step": 1472 }, { "epoch": 0.2816443594646271, "grad_norm": 1.49411141872406, "learning_rate": 4.635302105272136e-06, "loss": 0.0695, "step": 1473 }, { "epoch": 0.28183556405353727, "grad_norm": 2.444284677505493, "learning_rate": 4.635733325640381e-06, "loss": 0.2761, "step": 1474 }, { "epoch": 0.2820267686424474, "grad_norm": 2.646402359008789, "learning_rate": 4.636164253556675e-06, "loss": 0.1316, "step": 1475 }, { "epoch": 0.28221797323135756, "grad_norm": 2.195561408996582, "learning_rate": 4.636594889417429e-06, "loss": 0.4064, "step": 1476 }, { "epoch": 0.2824091778202677, "grad_norm": 1.763237476348877, "learning_rate": 4.637025233618248e-06, "loss": 0.1922, "step": 1477 }, { "epoch": 0.28260038240917784, "grad_norm": 2.47198748588562, "learning_rate": 4.637455286553934e-06, "loss": 0.4326, "step": 1478 }, { "epoch": 0.28279158699808793, "grad_norm": 1.6601879596710205, "learning_rate": 4.637885048618489e-06, "loss": 0.124, "step": 1479 }, { "epoch": 0.2829827915869981, "grad_norm": 1.9923487901687622, "learning_rate": 4.6383145202051135e-06, "loss": 0.1602, "step": 1480 }, { "epoch": 0.2831739961759082, "grad_norm": 1.6631501913070679, "learning_rate": 4.638743701706214e-06, "loss": 0.1, "step": 1481 }, { "epoch": 0.28336520076481836, "grad_norm": 2.8882148265838623, "learning_rate": 4.639172593513399e-06, "loss": 0.3787, "step": 1482 }, { "epoch": 0.2835564053537285, "grad_norm": 2.454380750656128, "learning_rate": 4.639601196017489e-06, "loss": 0.3458, "step": 1483 }, { "epoch": 0.28374760994263865, "grad_norm": 2.356696367263794, "learning_rate": 4.640029509608511e-06, "loss": 0.2473, "step": 1484 }, { "epoch": 0.28393881453154873, "grad_norm": 2.13679838180542, "learning_rate": 4.640457534675704e-06, "loss": 0.318, "step": 1485 }, { "epoch": 0.2841300191204589, "grad_norm": 1.2381728887557983, "learning_rate": 4.640885271607523e-06, "loss": 0.0621, "step": 1486 }, { "epoch": 0.284321223709369, "grad_norm": 2.054654121398926, "learning_rate": 4.641312720791636e-06, "loss": 0.33, "step": 1487 }, { "epoch": 0.28451242829827916, "grad_norm": 3.917362928390503, "learning_rate": 4.64173988261493e-06, "loss": 0.1747, "step": 1488 }, { "epoch": 0.2847036328871893, "grad_norm": 2.648787260055542, "learning_rate": 4.642166757463516e-06, "loss": 0.5647, "step": 1489 }, { "epoch": 0.28489483747609945, "grad_norm": 2.680501937866211, "learning_rate": 4.6425933457227225e-06, "loss": 0.3256, "step": 1490 }, { "epoch": 0.28508604206500954, "grad_norm": 2.612128734588623, "learning_rate": 4.643019647777103e-06, "loss": 0.2337, "step": 1491 }, { "epoch": 0.2852772466539197, "grad_norm": 2.540062189102173, "learning_rate": 4.6434456640104405e-06, "loss": 0.1389, "step": 1492 }, { "epoch": 0.2854684512428298, "grad_norm": 3.286160469055176, "learning_rate": 4.643871394805745e-06, "loss": 0.1752, "step": 1493 }, { "epoch": 0.28565965583173997, "grad_norm": 1.7292871475219727, "learning_rate": 4.644296840545256e-06, "loss": 0.0987, "step": 1494 }, { "epoch": 0.2858508604206501, "grad_norm": 2.197441816329956, "learning_rate": 4.644722001610448e-06, "loss": 0.1405, "step": 1495 }, { "epoch": 0.28604206500956025, "grad_norm": 1.9219545125961304, "learning_rate": 4.645146878382026e-06, "loss": 0.2266, "step": 1496 }, { "epoch": 0.28623326959847034, "grad_norm": 2.036558151245117, "learning_rate": 4.645571471239938e-06, "loss": 0.1706, "step": 1497 }, { "epoch": 0.2864244741873805, "grad_norm": 1.5000076293945312, "learning_rate": 4.6459957805633654e-06, "loss": 0.1039, "step": 1498 }, { "epoch": 0.2866156787762906, "grad_norm": 2.3707828521728516, "learning_rate": 4.646419806730734e-06, "loss": 0.1623, "step": 1499 }, { "epoch": 0.28680688336520077, "grad_norm": 1.9618526697158813, "learning_rate": 4.64684355011971e-06, "loss": 0.1254, "step": 1500 }, { "epoch": 0.28680688336520077, "eval_runtime": 761.9792, "eval_samples_per_second": 2.013, "eval_steps_per_second": 0.252, "step": 1500 }, { "epoch": 0.2869980879541109, "grad_norm": 2.2166504859924316, "learning_rate": 4.6472670111072075e-06, "loss": 0.37, "step": 1501 }, { "epoch": 0.28718929254302106, "grad_norm": 2.407583236694336, "learning_rate": 4.647690190069383e-06, "loss": 0.3667, "step": 1502 }, { "epoch": 0.28738049713193115, "grad_norm": 2.0655288696289062, "learning_rate": 4.648113087381647e-06, "loss": 0.1062, "step": 1503 }, { "epoch": 0.2875717017208413, "grad_norm": 1.6839008331298828, "learning_rate": 4.648535703418657e-06, "loss": 0.122, "step": 1504 }, { "epoch": 0.28776290630975143, "grad_norm": 2.2787880897521973, "learning_rate": 4.648958038554326e-06, "loss": 0.3146, "step": 1505 }, { "epoch": 0.2879541108986616, "grad_norm": 1.7219648361206055, "learning_rate": 4.64938009316182e-06, "loss": 0.0914, "step": 1506 }, { "epoch": 0.2881453154875717, "grad_norm": 2.2042160034179688, "learning_rate": 4.6498018676135644e-06, "loss": 0.291, "step": 1507 }, { "epoch": 0.28833652007648186, "grad_norm": 2.321742057800293, "learning_rate": 4.65022336228124e-06, "loss": 0.2794, "step": 1508 }, { "epoch": 0.28852772466539195, "grad_norm": 2.547917127609253, "learning_rate": 4.650644577535791e-06, "loss": 0.5054, "step": 1509 }, { "epoch": 0.2887189292543021, "grad_norm": 1.275026798248291, "learning_rate": 4.651065513747423e-06, "loss": 0.0787, "step": 1510 }, { "epoch": 0.28891013384321224, "grad_norm": 3.1266400814056396, "learning_rate": 4.651486171285608e-06, "loss": 0.3595, "step": 1511 }, { "epoch": 0.2891013384321224, "grad_norm": 1.9492202997207642, "learning_rate": 4.651906550519083e-06, "loss": 0.123, "step": 1512 }, { "epoch": 0.2892925430210325, "grad_norm": 1.362431526184082, "learning_rate": 4.652326651815854e-06, "loss": 0.1088, "step": 1513 }, { "epoch": 0.28948374760994267, "grad_norm": 1.5959398746490479, "learning_rate": 4.652746475543199e-06, "loss": 0.1612, "step": 1514 }, { "epoch": 0.28967495219885275, "grad_norm": 1.9055774211883545, "learning_rate": 4.6531660220676665e-06, "loss": 0.2062, "step": 1515 }, { "epoch": 0.2898661567877629, "grad_norm": 1.3554742336273193, "learning_rate": 4.653585291755081e-06, "loss": 0.1628, "step": 1516 }, { "epoch": 0.29005736137667304, "grad_norm": 1.883872628211975, "learning_rate": 4.65400428497054e-06, "loss": 0.1861, "step": 1517 }, { "epoch": 0.2902485659655832, "grad_norm": 1.0759433507919312, "learning_rate": 4.654423002078425e-06, "loss": 0.0531, "step": 1518 }, { "epoch": 0.2904397705544933, "grad_norm": 2.0732216835021973, "learning_rate": 4.654841443442393e-06, "loss": 0.1208, "step": 1519 }, { "epoch": 0.29063097514340347, "grad_norm": 2.8344123363494873, "learning_rate": 4.655259609425383e-06, "loss": 0.7317, "step": 1520 }, { "epoch": 0.29082217973231356, "grad_norm": 1.3993836641311646, "learning_rate": 4.655677500389621e-06, "loss": 0.1063, "step": 1521 }, { "epoch": 0.2910133843212237, "grad_norm": 1.489634394645691, "learning_rate": 4.6560951166966175e-06, "loss": 0.1145, "step": 1522 }, { "epoch": 0.29120458891013384, "grad_norm": 1.8000231981277466, "learning_rate": 4.656512458707168e-06, "loss": 0.1213, "step": 1523 }, { "epoch": 0.291395793499044, "grad_norm": 2.340092658996582, "learning_rate": 4.656929526781362e-06, "loss": 0.2737, "step": 1524 }, { "epoch": 0.29158699808795413, "grad_norm": 2.1177775859832764, "learning_rate": 4.6573463212785765e-06, "loss": 0.0867, "step": 1525 }, { "epoch": 0.2917782026768642, "grad_norm": 2.30710768699646, "learning_rate": 4.657762842557484e-06, "loss": 0.3876, "step": 1526 }, { "epoch": 0.29196940726577436, "grad_norm": 2.207430362701416, "learning_rate": 4.658179090976053e-06, "loss": 0.2844, "step": 1527 }, { "epoch": 0.2921606118546845, "grad_norm": 2.2533726692199707, "learning_rate": 4.658595066891546e-06, "loss": 0.1704, "step": 1528 }, { "epoch": 0.29235181644359465, "grad_norm": 1.5354182720184326, "learning_rate": 4.6590107706605244e-06, "loss": 0.1379, "step": 1529 }, { "epoch": 0.2925430210325048, "grad_norm": 1.293443202972412, "learning_rate": 4.659426202638854e-06, "loss": 0.0735, "step": 1530 }, { "epoch": 0.29273422562141493, "grad_norm": 2.447760820388794, "learning_rate": 4.6598413631817005e-06, "loss": 0.0711, "step": 1531 }, { "epoch": 0.292925430210325, "grad_norm": 3.0195038318634033, "learning_rate": 4.660256252643533e-06, "loss": 0.3391, "step": 1532 }, { "epoch": 0.29311663479923517, "grad_norm": 2.5726795196533203, "learning_rate": 4.660670871378128e-06, "loss": 0.456, "step": 1533 }, { "epoch": 0.2933078393881453, "grad_norm": 2.395726442337036, "learning_rate": 4.6610852197385695e-06, "loss": 0.4479, "step": 1534 }, { "epoch": 0.29349904397705545, "grad_norm": 4.810997486114502, "learning_rate": 4.661499298077252e-06, "loss": 0.2757, "step": 1535 }, { "epoch": 0.2936902485659656, "grad_norm": 1.4835280179977417, "learning_rate": 4.66191310674588e-06, "loss": 0.1364, "step": 1536 }, { "epoch": 0.29388145315487574, "grad_norm": 1.9309308528900146, "learning_rate": 4.662326646095474e-06, "loss": 0.081, "step": 1537 }, { "epoch": 0.2940726577437858, "grad_norm": 3.1075916290283203, "learning_rate": 4.662739916476364e-06, "loss": 0.3218, "step": 1538 }, { "epoch": 0.29426386233269597, "grad_norm": 3.229728937149048, "learning_rate": 4.663152918238205e-06, "loss": 0.3299, "step": 1539 }, { "epoch": 0.2944550669216061, "grad_norm": 2.6888089179992676, "learning_rate": 4.663565651729963e-06, "loss": 0.2274, "step": 1540 }, { "epoch": 0.29464627151051626, "grad_norm": 1.4698622226715088, "learning_rate": 4.66397811729993e-06, "loss": 0.0906, "step": 1541 }, { "epoch": 0.2948374760994264, "grad_norm": 2.7869653701782227, "learning_rate": 4.664390315295716e-06, "loss": 0.17, "step": 1542 }, { "epoch": 0.29502868068833654, "grad_norm": 3.202296257019043, "learning_rate": 4.664802246064258e-06, "loss": 0.1493, "step": 1543 }, { "epoch": 0.29521988527724663, "grad_norm": 1.7716546058654785, "learning_rate": 4.665213909951816e-06, "loss": 0.0703, "step": 1544 }, { "epoch": 0.2954110898661568, "grad_norm": 4.290736675262451, "learning_rate": 4.66562530730398e-06, "loss": 0.3951, "step": 1545 }, { "epoch": 0.2956022944550669, "grad_norm": 1.5324119329452515, "learning_rate": 4.666036438465668e-06, "loss": 0.167, "step": 1546 }, { "epoch": 0.29579349904397706, "grad_norm": 1.529327154159546, "learning_rate": 4.66644730378113e-06, "loss": 0.158, "step": 1547 }, { "epoch": 0.2959847036328872, "grad_norm": 1.6390931606292725, "learning_rate": 4.666857903593945e-06, "loss": 0.136, "step": 1548 }, { "epoch": 0.29617590822179735, "grad_norm": 1.1221572160720825, "learning_rate": 4.667268238247031e-06, "loss": 0.0375, "step": 1549 }, { "epoch": 0.29636711281070743, "grad_norm": 3.3055922985076904, "learning_rate": 4.667678308082639e-06, "loss": 0.1424, "step": 1550 }, { "epoch": 0.2965583173996176, "grad_norm": 1.8237640857696533, "learning_rate": 4.668088113442359e-06, "loss": 0.2064, "step": 1551 }, { "epoch": 0.2967495219885277, "grad_norm": 2.7643344402313232, "learning_rate": 4.668497654667122e-06, "loss": 0.415, "step": 1552 }, { "epoch": 0.29694072657743786, "grad_norm": 3.103980302810669, "learning_rate": 4.668906932097195e-06, "loss": 0.5029, "step": 1553 }, { "epoch": 0.297131931166348, "grad_norm": 2.982830286026001, "learning_rate": 4.669315946072195e-06, "loss": 0.4939, "step": 1554 }, { "epoch": 0.29732313575525815, "grad_norm": 2.102238416671753, "learning_rate": 4.669724696931077e-06, "loss": 0.2851, "step": 1555 }, { "epoch": 0.29751434034416824, "grad_norm": 2.107391357421875, "learning_rate": 4.670133185012147e-06, "loss": 0.1171, "step": 1556 }, { "epoch": 0.2977055449330784, "grad_norm": 7.794981002807617, "learning_rate": 4.670541410653059e-06, "loss": 0.3182, "step": 1557 }, { "epoch": 0.2978967495219885, "grad_norm": 3.63592529296875, "learning_rate": 4.6709493741908105e-06, "loss": 0.5255, "step": 1558 }, { "epoch": 0.29808795411089867, "grad_norm": 2.3166344165802, "learning_rate": 4.671357075961757e-06, "loss": 0.2296, "step": 1559 }, { "epoch": 0.2982791586998088, "grad_norm": 1.7584208250045776, "learning_rate": 4.671764516301605e-06, "loss": 0.2356, "step": 1560 }, { "epoch": 0.29847036328871895, "grad_norm": 1.8798422813415527, "learning_rate": 4.672171695545415e-06, "loss": 0.1173, "step": 1561 }, { "epoch": 0.29866156787762904, "grad_norm": 2.359449863433838, "learning_rate": 4.672578614027603e-06, "loss": 0.1093, "step": 1562 }, { "epoch": 0.2988527724665392, "grad_norm": 2.927279233932495, "learning_rate": 4.672985272081945e-06, "loss": 0.4062, "step": 1563 }, { "epoch": 0.29904397705544933, "grad_norm": 3.0216593742370605, "learning_rate": 4.673391670041575e-06, "loss": 0.5828, "step": 1564 }, { "epoch": 0.29923518164435947, "grad_norm": 2.489259719848633, "learning_rate": 4.673797808238989e-06, "loss": 0.3301, "step": 1565 }, { "epoch": 0.2994263862332696, "grad_norm": 2.547424793243408, "learning_rate": 4.674203687006046e-06, "loss": 0.2759, "step": 1566 }, { "epoch": 0.29961759082217976, "grad_norm": 1.899907112121582, "learning_rate": 4.674609306673967e-06, "loss": 0.1262, "step": 1567 }, { "epoch": 0.29980879541108985, "grad_norm": 1.9912757873535156, "learning_rate": 4.675014667573342e-06, "loss": 0.1042, "step": 1568 }, { "epoch": 0.3, "grad_norm": 1.6315016746520996, "learning_rate": 4.675419770034128e-06, "loss": 0.0955, "step": 1569 }, { "epoch": 0.30019120458891013, "grad_norm": 2.126676321029663, "learning_rate": 4.675824614385652e-06, "loss": 0.4287, "step": 1570 }, { "epoch": 0.3003824091778203, "grad_norm": 1.8122913837432861, "learning_rate": 4.6762292009566105e-06, "loss": 0.1287, "step": 1571 }, { "epoch": 0.3005736137667304, "grad_norm": 16.377099990844727, "learning_rate": 4.676633530075071e-06, "loss": 0.2717, "step": 1572 }, { "epoch": 0.30076481835564056, "grad_norm": 2.3182194232940674, "learning_rate": 4.677037602068479e-06, "loss": 0.2243, "step": 1573 }, { "epoch": 0.30095602294455065, "grad_norm": 1.5239766836166382, "learning_rate": 4.677441417263654e-06, "loss": 0.124, "step": 1574 }, { "epoch": 0.3011472275334608, "grad_norm": 2.429558753967285, "learning_rate": 4.677844975986791e-06, "loss": 0.156, "step": 1575 }, { "epoch": 0.30133843212237094, "grad_norm": 2.8561208248138428, "learning_rate": 4.678248278563468e-06, "loss": 0.316, "step": 1576 }, { "epoch": 0.3015296367112811, "grad_norm": 2.1259777545928955, "learning_rate": 4.678651325318638e-06, "loss": 0.1972, "step": 1577 }, { "epoch": 0.3017208413001912, "grad_norm": 1.0598630905151367, "learning_rate": 4.67905411657664e-06, "loss": 0.0997, "step": 1578 }, { "epoch": 0.30191204588910137, "grad_norm": 2.3135368824005127, "learning_rate": 4.679456652661196e-06, "loss": 0.1955, "step": 1579 }, { "epoch": 0.30210325047801145, "grad_norm": 1.8581372499465942, "learning_rate": 4.679858933895413e-06, "loss": 0.1274, "step": 1580 }, { "epoch": 0.3022944550669216, "grad_norm": 1.0900535583496094, "learning_rate": 4.680260960601784e-06, "loss": 0.0515, "step": 1581 }, { "epoch": 0.30248565965583174, "grad_norm": 2.1805429458618164, "learning_rate": 4.680662733102189e-06, "loss": 0.4408, "step": 1582 }, { "epoch": 0.3026768642447419, "grad_norm": 3.509096145629883, "learning_rate": 4.681064251717901e-06, "loss": 0.4217, "step": 1583 }, { "epoch": 0.302868068833652, "grad_norm": 2.643540620803833, "learning_rate": 4.681465516769583e-06, "loss": 0.1418, "step": 1584 }, { "epoch": 0.3030592734225621, "grad_norm": 1.8429009914398193, "learning_rate": 4.681866528577289e-06, "loss": 0.1108, "step": 1585 }, { "epoch": 0.30325047801147226, "grad_norm": 1.879594087600708, "learning_rate": 4.6822672874604705e-06, "loss": 0.2356, "step": 1586 }, { "epoch": 0.3034416826003824, "grad_norm": 2.853865623474121, "learning_rate": 4.6826677937379745e-06, "loss": 0.1654, "step": 1587 }, { "epoch": 0.30363288718929254, "grad_norm": 1.8738490343093872, "learning_rate": 4.683068047728041e-06, "loss": 0.2092, "step": 1588 }, { "epoch": 0.3038240917782027, "grad_norm": 3.2390451431274414, "learning_rate": 4.683468049748315e-06, "loss": 0.528, "step": 1589 }, { "epoch": 0.30401529636711283, "grad_norm": 2.1751222610473633, "learning_rate": 4.683867800115839e-06, "loss": 0.3197, "step": 1590 }, { "epoch": 0.3042065009560229, "grad_norm": 3.2450201511383057, "learning_rate": 4.684267299147057e-06, "loss": 0.4242, "step": 1591 }, { "epoch": 0.30439770554493306, "grad_norm": 1.056480050086975, "learning_rate": 4.684666547157818e-06, "loss": 0.1298, "step": 1592 }, { "epoch": 0.3045889101338432, "grad_norm": 1.7769147157669067, "learning_rate": 4.685065544463375e-06, "loss": 0.1012, "step": 1593 }, { "epoch": 0.30478011472275335, "grad_norm": 1.4946969747543335, "learning_rate": 4.685464291378389e-06, "loss": 0.0821, "step": 1594 }, { "epoch": 0.3049713193116635, "grad_norm": 2.63144850730896, "learning_rate": 4.6858627882169256e-06, "loss": 0.3787, "step": 1595 }, { "epoch": 0.30516252390057363, "grad_norm": 2.6494715213775635, "learning_rate": 4.686261035292464e-06, "loss": 0.3208, "step": 1596 }, { "epoch": 0.3053537284894837, "grad_norm": 4.6903791427612305, "learning_rate": 4.6866590329178915e-06, "loss": 0.363, "step": 1597 }, { "epoch": 0.30554493307839387, "grad_norm": 2.102142572402954, "learning_rate": 4.68705678140551e-06, "loss": 0.1513, "step": 1598 }, { "epoch": 0.305736137667304, "grad_norm": 1.5582478046417236, "learning_rate": 4.687454281067032e-06, "loss": 0.0899, "step": 1599 }, { "epoch": 0.30592734225621415, "grad_norm": 1.9125760793685913, "learning_rate": 4.687851532213589e-06, "loss": 0.0984, "step": 1600 }, { "epoch": 0.3061185468451243, "grad_norm": 2.396864414215088, "learning_rate": 4.688248535155727e-06, "loss": 0.3593, "step": 1601 }, { "epoch": 0.30630975143403444, "grad_norm": 1.5265965461730957, "learning_rate": 4.688645290203411e-06, "loss": 0.0922, "step": 1602 }, { "epoch": 0.3065009560229445, "grad_norm": 2.4949896335601807, "learning_rate": 4.689041797666025e-06, "loss": 0.1977, "step": 1603 }, { "epoch": 0.30669216061185467, "grad_norm": 2.181450605392456, "learning_rate": 4.6894380578523775e-06, "loss": 0.1048, "step": 1604 }, { "epoch": 0.3068833652007648, "grad_norm": 1.2355436086654663, "learning_rate": 4.689834071070693e-06, "loss": 0.0392, "step": 1605 }, { "epoch": 0.30707456978967496, "grad_norm": 3.254880666732788, "learning_rate": 4.690229837628627e-06, "loss": 0.1794, "step": 1606 }, { "epoch": 0.3072657743785851, "grad_norm": 2.0266425609588623, "learning_rate": 4.690625357833257e-06, "loss": 0.1934, "step": 1607 }, { "epoch": 0.30745697896749524, "grad_norm": 1.5074000358581543, "learning_rate": 4.6910206319910875e-06, "loss": 0.206, "step": 1608 }, { "epoch": 0.30764818355640533, "grad_norm": 1.8750319480895996, "learning_rate": 4.6914156604080515e-06, "loss": 0.1769, "step": 1609 }, { "epoch": 0.3078393881453155, "grad_norm": 1.9686181545257568, "learning_rate": 4.691810443389513e-06, "loss": 0.1552, "step": 1610 }, { "epoch": 0.3080305927342256, "grad_norm": 2.02163028717041, "learning_rate": 4.692204981240264e-06, "loss": 0.1021, "step": 1611 }, { "epoch": 0.30822179732313576, "grad_norm": 2.6845357418060303, "learning_rate": 4.692599274264534e-06, "loss": 0.1429, "step": 1612 }, { "epoch": 0.3084130019120459, "grad_norm": 2.7308313846588135, "learning_rate": 4.692993322765983e-06, "loss": 0.3626, "step": 1613 }, { "epoch": 0.30860420650095605, "grad_norm": 1.9989246129989624, "learning_rate": 4.693387127047705e-06, "loss": 0.3389, "step": 1614 }, { "epoch": 0.30879541108986613, "grad_norm": 2.3005568981170654, "learning_rate": 4.693780687412236e-06, "loss": 0.3273, "step": 1615 }, { "epoch": 0.3089866156787763, "grad_norm": 2.354796886444092, "learning_rate": 4.694174004161545e-06, "loss": 0.1299, "step": 1616 }, { "epoch": 0.3091778202676864, "grad_norm": 1.5658437013626099, "learning_rate": 4.6945670775970445e-06, "loss": 0.0881, "step": 1617 }, { "epoch": 0.30936902485659656, "grad_norm": 2.1076741218566895, "learning_rate": 4.694959908019585e-06, "loss": 0.2679, "step": 1618 }, { "epoch": 0.3095602294455067, "grad_norm": 2.51861572265625, "learning_rate": 4.6953524957294615e-06, "loss": 0.1314, "step": 1619 }, { "epoch": 0.30975143403441685, "grad_norm": 2.3451156616210938, "learning_rate": 4.695744841026411e-06, "loss": 0.2426, "step": 1620 }, { "epoch": 0.30994263862332694, "grad_norm": 2.2577333450317383, "learning_rate": 4.696136944209617e-06, "loss": 0.3822, "step": 1621 }, { "epoch": 0.3101338432122371, "grad_norm": 2.172588348388672, "learning_rate": 4.696528805577708e-06, "loss": 0.1405, "step": 1622 }, { "epoch": 0.3103250478011472, "grad_norm": 3.17950177192688, "learning_rate": 4.696920425428762e-06, "loss": 0.3541, "step": 1623 }, { "epoch": 0.31051625239005737, "grad_norm": 4.15213680267334, "learning_rate": 4.6973118040603045e-06, "loss": 0.1327, "step": 1624 }, { "epoch": 0.3107074569789675, "grad_norm": 1.5607051849365234, "learning_rate": 4.697702941769314e-06, "loss": 0.0799, "step": 1625 }, { "epoch": 0.31089866156787765, "grad_norm": 1.972217082977295, "learning_rate": 4.698093838852218e-06, "loss": 0.3492, "step": 1626 }, { "epoch": 0.31108986615678774, "grad_norm": 2.502187728881836, "learning_rate": 4.6984844956048994e-06, "loss": 0.3884, "step": 1627 }, { "epoch": 0.3112810707456979, "grad_norm": 3.3736958503723145, "learning_rate": 4.698874912322695e-06, "loss": 0.2595, "step": 1628 }, { "epoch": 0.31147227533460803, "grad_norm": 1.729385495185852, "learning_rate": 4.699265089300396e-06, "loss": 0.0982, "step": 1629 }, { "epoch": 0.31166347992351817, "grad_norm": 9.559845924377441, "learning_rate": 4.699655026832254e-06, "loss": 0.5083, "step": 1630 }, { "epoch": 0.3118546845124283, "grad_norm": 2.132229804992676, "learning_rate": 4.700044725211977e-06, "loss": 0.1439, "step": 1631 }, { "epoch": 0.31204588910133846, "grad_norm": 2.046917676925659, "learning_rate": 4.700434184732733e-06, "loss": 0.345, "step": 1632 }, { "epoch": 0.31223709369024855, "grad_norm": 2.525454044342041, "learning_rate": 4.700823405687152e-06, "loss": 0.4536, "step": 1633 }, { "epoch": 0.3124282982791587, "grad_norm": 2.227076768875122, "learning_rate": 4.7012123883673265e-06, "loss": 0.2261, "step": 1634 }, { "epoch": 0.31261950286806883, "grad_norm": 1.19818115234375, "learning_rate": 4.701601133064812e-06, "loss": 0.0679, "step": 1635 }, { "epoch": 0.312810707456979, "grad_norm": 2.9948644638061523, "learning_rate": 4.701989640070631e-06, "loss": 0.1852, "step": 1636 }, { "epoch": 0.3130019120458891, "grad_norm": 2.5695416927337646, "learning_rate": 4.70237790967527e-06, "loss": 0.3184, "step": 1637 }, { "epoch": 0.31319311663479926, "grad_norm": 2.033766508102417, "learning_rate": 4.702765942168687e-06, "loss": 0.1388, "step": 1638 }, { "epoch": 0.31338432122370935, "grad_norm": 2.1387429237365723, "learning_rate": 4.703153737840303e-06, "loss": 0.2016, "step": 1639 }, { "epoch": 0.3135755258126195, "grad_norm": 1.2150760889053345, "learning_rate": 4.703541296979016e-06, "loss": 0.1351, "step": 1640 }, { "epoch": 0.31376673040152964, "grad_norm": 1.4617748260498047, "learning_rate": 4.703928619873192e-06, "loss": 0.0998, "step": 1641 }, { "epoch": 0.3139579349904398, "grad_norm": 2.583080291748047, "learning_rate": 4.704315706810671e-06, "loss": 0.1376, "step": 1642 }, { "epoch": 0.3141491395793499, "grad_norm": 1.4851268529891968, "learning_rate": 4.7047025580787675e-06, "loss": 0.0733, "step": 1643 }, { "epoch": 0.31434034416826, "grad_norm": 2.7585723400115967, "learning_rate": 4.7050891739642704e-06, "loss": 0.1577, "step": 1644 }, { "epoch": 0.31453154875717015, "grad_norm": 2.7079718112945557, "learning_rate": 4.705475554753447e-06, "loss": 0.2468, "step": 1645 }, { "epoch": 0.3147227533460803, "grad_norm": 2.084747791290283, "learning_rate": 4.705861700732041e-06, "loss": 0.1464, "step": 1646 }, { "epoch": 0.31491395793499044, "grad_norm": 3.7070984840393066, "learning_rate": 4.706247612185277e-06, "loss": 0.1427, "step": 1647 }, { "epoch": 0.3151051625239006, "grad_norm": 1.74429190158844, "learning_rate": 4.70663328939786e-06, "loss": 0.1169, "step": 1648 }, { "epoch": 0.3152963671128107, "grad_norm": 2.154435157775879, "learning_rate": 4.707018732653974e-06, "loss": 0.2084, "step": 1649 }, { "epoch": 0.3154875717017208, "grad_norm": 2.1565818786621094, "learning_rate": 4.707403942237291e-06, "loss": 0.1462, "step": 1650 }, { "epoch": 0.31567877629063096, "grad_norm": 3.50270676612854, "learning_rate": 4.707788918430965e-06, "loss": 0.6104, "step": 1651 }, { "epoch": 0.3158699808795411, "grad_norm": 1.7462185621261597, "learning_rate": 4.708173661517635e-06, "loss": 0.1656, "step": 1652 }, { "epoch": 0.31606118546845124, "grad_norm": 1.6207178831100464, "learning_rate": 4.708558171779426e-06, "loss": 0.1279, "step": 1653 }, { "epoch": 0.3162523900573614, "grad_norm": 2.2919764518737793, "learning_rate": 4.7089424494979555e-06, "loss": 0.2565, "step": 1654 }, { "epoch": 0.31644359464627153, "grad_norm": 2.5331718921661377, "learning_rate": 4.709326494954326e-06, "loss": 0.1646, "step": 1655 }, { "epoch": 0.3166347992351816, "grad_norm": 2.5295658111572266, "learning_rate": 4.709710308429132e-06, "loss": 0.1328, "step": 1656 }, { "epoch": 0.31682600382409176, "grad_norm": 2.147109031677246, "learning_rate": 4.710093890202459e-06, "loss": 0.2947, "step": 1657 }, { "epoch": 0.3170172084130019, "grad_norm": 2.6603047847747803, "learning_rate": 4.71047724055389e-06, "loss": 0.4228, "step": 1658 }, { "epoch": 0.31720841300191205, "grad_norm": 3.075491189956665, "learning_rate": 4.710860359762494e-06, "loss": 0.4414, "step": 1659 }, { "epoch": 0.3173996175908222, "grad_norm": 2.007510185241699, "learning_rate": 4.711243248106844e-06, "loss": 0.1255, "step": 1660 }, { "epoch": 0.31759082217973233, "grad_norm": 1.8735504150390625, "learning_rate": 4.711625905865004e-06, "loss": 0.1407, "step": 1661 }, { "epoch": 0.3177820267686424, "grad_norm": 1.5158590078353882, "learning_rate": 4.7120083333145385e-06, "loss": 0.0653, "step": 1662 }, { "epoch": 0.31797323135755257, "grad_norm": 2.0995800495147705, "learning_rate": 4.712390530732511e-06, "loss": 0.3412, "step": 1663 }, { "epoch": 0.3181644359464627, "grad_norm": 2.8367748260498047, "learning_rate": 4.712772498395484e-06, "loss": 0.3504, "step": 1664 }, { "epoch": 0.31835564053537285, "grad_norm": 2.228076696395874, "learning_rate": 4.713154236579523e-06, "loss": 0.2741, "step": 1665 }, { "epoch": 0.318546845124283, "grad_norm": 2.8032071590423584, "learning_rate": 4.713535745560195e-06, "loss": 0.4121, "step": 1666 }, { "epoch": 0.31873804971319314, "grad_norm": 2.5112833976745605, "learning_rate": 4.713917025612572e-06, "loss": 0.4035, "step": 1667 }, { "epoch": 0.3189292543021032, "grad_norm": 4.7532758712768555, "learning_rate": 4.714298077011231e-06, "loss": 0.1276, "step": 1668 }, { "epoch": 0.31912045889101337, "grad_norm": 2.9184582233428955, "learning_rate": 4.714678900030255e-06, "loss": 0.1993, "step": 1669 }, { "epoch": 0.3193116634799235, "grad_norm": 3.2624402046203613, "learning_rate": 4.715059494943234e-06, "loss": 0.5957, "step": 1670 }, { "epoch": 0.31950286806883366, "grad_norm": 2.0953452587127686, "learning_rate": 4.715439862023267e-06, "loss": 0.2429, "step": 1671 }, { "epoch": 0.3196940726577438, "grad_norm": 2.3828635215759277, "learning_rate": 4.715820001542965e-06, "loss": 0.2674, "step": 1672 }, { "epoch": 0.31988527724665394, "grad_norm": 1.713576078414917, "learning_rate": 4.716199913774444e-06, "loss": 0.1027, "step": 1673 }, { "epoch": 0.32007648183556403, "grad_norm": 2.204185724258423, "learning_rate": 4.71657959898934e-06, "loss": 0.0693, "step": 1674 }, { "epoch": 0.3202676864244742, "grad_norm": 2.4515421390533447, "learning_rate": 4.716959057458796e-06, "loss": 0.2329, "step": 1675 }, { "epoch": 0.3204588910133843, "grad_norm": 2.277780055999756, "learning_rate": 4.717338289453474e-06, "loss": 0.2594, "step": 1676 }, { "epoch": 0.32065009560229446, "grad_norm": 1.873704433441162, "learning_rate": 4.717717295243549e-06, "loss": 0.3301, "step": 1677 }, { "epoch": 0.3208413001912046, "grad_norm": 1.6470894813537598, "learning_rate": 4.718096075098712e-06, "loss": 0.1803, "step": 1678 }, { "epoch": 0.32103250478011475, "grad_norm": 2.8109920024871826, "learning_rate": 4.718474629288177e-06, "loss": 0.1192, "step": 1679 }, { "epoch": 0.32122370936902483, "grad_norm": 2.7735977172851562, "learning_rate": 4.71885295808067e-06, "loss": 0.3198, "step": 1680 }, { "epoch": 0.321414913957935, "grad_norm": 3.0144803524017334, "learning_rate": 4.719231061744443e-06, "loss": 0.1269, "step": 1681 }, { "epoch": 0.3216061185468451, "grad_norm": 2.3814454078674316, "learning_rate": 4.7196089405472675e-06, "loss": 0.1253, "step": 1682 }, { "epoch": 0.32179732313575526, "grad_norm": 2.3879141807556152, "learning_rate": 4.719986594756435e-06, "loss": 0.4084, "step": 1683 }, { "epoch": 0.3219885277246654, "grad_norm": 1.6740145683288574, "learning_rate": 4.720364024638766e-06, "loss": 0.0784, "step": 1684 }, { "epoch": 0.32217973231357555, "grad_norm": 2.241508722305298, "learning_rate": 4.7207412304606015e-06, "loss": 0.298, "step": 1685 }, { "epoch": 0.32237093690248564, "grad_norm": 1.0748635530471802, "learning_rate": 4.72111821248781e-06, "loss": 0.048, "step": 1686 }, { "epoch": 0.3225621414913958, "grad_norm": 2.42177677154541, "learning_rate": 4.721494970985786e-06, "loss": 0.1027, "step": 1687 }, { "epoch": 0.3227533460803059, "grad_norm": 2.5916969776153564, "learning_rate": 4.721871506219455e-06, "loss": 0.1818, "step": 1688 }, { "epoch": 0.32294455066921607, "grad_norm": 1.406157374382019, "learning_rate": 4.7222478184532676e-06, "loss": 0.1544, "step": 1689 }, { "epoch": 0.3231357552581262, "grad_norm": 1.2543416023254395, "learning_rate": 4.722623907951209e-06, "loss": 0.1498, "step": 1690 }, { "epoch": 0.32332695984703635, "grad_norm": 1.768796443939209, "learning_rate": 4.722999774976792e-06, "loss": 0.2372, "step": 1691 }, { "epoch": 0.32351816443594644, "grad_norm": 2.0685126781463623, "learning_rate": 4.723375419793066e-06, "loss": 0.1395, "step": 1692 }, { "epoch": 0.3237093690248566, "grad_norm": 3.7179152965545654, "learning_rate": 4.723750842662612e-06, "loss": 0.1303, "step": 1693 }, { "epoch": 0.32390057361376673, "grad_norm": 2.238060712814331, "learning_rate": 4.7241260438475445e-06, "loss": 0.2068, "step": 1694 }, { "epoch": 0.3240917782026769, "grad_norm": 2.760972023010254, "learning_rate": 4.724501023609517e-06, "loss": 0.4147, "step": 1695 }, { "epoch": 0.324282982791587, "grad_norm": 3.159468412399292, "learning_rate": 4.724875782209718e-06, "loss": 0.6906, "step": 1696 }, { "epoch": 0.32447418738049716, "grad_norm": 2.3904199600219727, "learning_rate": 4.725250319908874e-06, "loss": 0.4181, "step": 1697 }, { "epoch": 0.32466539196940725, "grad_norm": 2.083958625793457, "learning_rate": 4.725624636967252e-06, "loss": 0.1321, "step": 1698 }, { "epoch": 0.3248565965583174, "grad_norm": 2.2649729251861572, "learning_rate": 4.725998733644659e-06, "loss": 0.1378, "step": 1699 }, { "epoch": 0.32504780114722753, "grad_norm": 2.6033074855804443, "learning_rate": 4.726372610200442e-06, "loss": 0.1434, "step": 1700 }, { "epoch": 0.3252390057361377, "grad_norm": 2.7515151500701904, "learning_rate": 4.726746266893492e-06, "loss": 0.4188, "step": 1701 }, { "epoch": 0.3254302103250478, "grad_norm": 2.8080029487609863, "learning_rate": 4.727119703982244e-06, "loss": 0.3093, "step": 1702 }, { "epoch": 0.3256214149139579, "grad_norm": 3.603713274002075, "learning_rate": 4.727492921724675e-06, "loss": 0.4832, "step": 1703 }, { "epoch": 0.32581261950286805, "grad_norm": 2.3233444690704346, "learning_rate": 4.72786592037831e-06, "loss": 0.3555, "step": 1704 }, { "epoch": 0.3260038240917782, "grad_norm": 3.610562801361084, "learning_rate": 4.728238700200221e-06, "loss": 0.2113, "step": 1705 }, { "epoch": 0.32619502868068834, "grad_norm": 1.6778076887130737, "learning_rate": 4.728611261447025e-06, "loss": 0.0959, "step": 1706 }, { "epoch": 0.3263862332695985, "grad_norm": 2.616176128387451, "learning_rate": 4.728983604374891e-06, "loss": 0.3212, "step": 1707 }, { "epoch": 0.3265774378585086, "grad_norm": 2.2144649028778076, "learning_rate": 4.7293557292395365e-06, "loss": 0.3198, "step": 1708 }, { "epoch": 0.3267686424474187, "grad_norm": 1.6588743925094604, "learning_rate": 4.72972763629623e-06, "loss": 0.1418, "step": 1709 }, { "epoch": 0.32695984703632885, "grad_norm": 3.0176148414611816, "learning_rate": 4.730099325799792e-06, "loss": 0.2423, "step": 1710 }, { "epoch": 0.327151051625239, "grad_norm": 1.894723653793335, "learning_rate": 4.730470798004597e-06, "loss": 0.1414, "step": 1711 }, { "epoch": 0.32734225621414914, "grad_norm": 2.1323254108428955, "learning_rate": 4.730842053164572e-06, "loss": 0.0995, "step": 1712 }, { "epoch": 0.3275334608030593, "grad_norm": 3.2055537700653076, "learning_rate": 4.7312130915332e-06, "loss": 0.3597, "step": 1713 }, { "epoch": 0.3277246653919694, "grad_norm": 2.552307605743408, "learning_rate": 4.731583913363522e-06, "loss": 0.1933, "step": 1714 }, { "epoch": 0.3279158699808795, "grad_norm": 1.2854914665222168, "learning_rate": 4.731954518908132e-06, "loss": 0.2917, "step": 1715 }, { "epoch": 0.32810707456978966, "grad_norm": 2.1804897785186768, "learning_rate": 4.732324908419186e-06, "loss": 0.2287, "step": 1716 }, { "epoch": 0.3282982791586998, "grad_norm": 1.7139678001403809, "learning_rate": 4.732695082148399e-06, "loss": 0.1392, "step": 1717 }, { "epoch": 0.32848948374760994, "grad_norm": 2.053713321685791, "learning_rate": 4.733065040347042e-06, "loss": 0.1129, "step": 1718 }, { "epoch": 0.3286806883365201, "grad_norm": 1.8551483154296875, "learning_rate": 4.733434783265955e-06, "loss": 0.096, "step": 1719 }, { "epoch": 0.32887189292543023, "grad_norm": 2.319152593612671, "learning_rate": 4.733804311155533e-06, "loss": 0.5004, "step": 1720 }, { "epoch": 0.3290630975143403, "grad_norm": 2.038278102874756, "learning_rate": 4.734173624265738e-06, "loss": 0.1583, "step": 1721 }, { "epoch": 0.32925430210325046, "grad_norm": 1.7227160930633545, "learning_rate": 4.734542722846097e-06, "loss": 0.2504, "step": 1722 }, { "epoch": 0.3294455066921606, "grad_norm": 2.386646270751953, "learning_rate": 4.734911607145701e-06, "loss": 0.3351, "step": 1723 }, { "epoch": 0.32963671128107075, "grad_norm": 2.143439769744873, "learning_rate": 4.735280277413207e-06, "loss": 0.1811, "step": 1724 }, { "epoch": 0.3298279158699809, "grad_norm": 2.0049850940704346, "learning_rate": 4.735648733896841e-06, "loss": 0.1041, "step": 1725 }, { "epoch": 0.33001912045889104, "grad_norm": 2.2885913848876953, "learning_rate": 4.736016976844395e-06, "loss": 0.2313, "step": 1726 }, { "epoch": 0.3302103250478011, "grad_norm": 3.8230414390563965, "learning_rate": 4.736385006503233e-06, "loss": 0.6751, "step": 1727 }, { "epoch": 0.33040152963671127, "grad_norm": 2.540400266647339, "learning_rate": 4.7367528231202895e-06, "loss": 0.2008, "step": 1728 }, { "epoch": 0.3305927342256214, "grad_norm": 2.929203987121582, "learning_rate": 4.737120426942068e-06, "loss": 0.3909, "step": 1729 }, { "epoch": 0.33078393881453155, "grad_norm": 1.263774037361145, "learning_rate": 4.737487818214645e-06, "loss": 0.1669, "step": 1730 }, { "epoch": 0.3309751434034417, "grad_norm": 1.2126981019973755, "learning_rate": 4.737854997183673e-06, "loss": 0.1578, "step": 1731 }, { "epoch": 0.33116634799235184, "grad_norm": 3.105952739715576, "learning_rate": 4.738221964094376e-06, "loss": 0.4275, "step": 1732 }, { "epoch": 0.3313575525812619, "grad_norm": 1.326631784439087, "learning_rate": 4.738588719191555e-06, "loss": 0.1018, "step": 1733 }, { "epoch": 0.33154875717017207, "grad_norm": 1.790029764175415, "learning_rate": 4.738955262719585e-06, "loss": 0.1735, "step": 1734 }, { "epoch": 0.3317399617590822, "grad_norm": 2.4458529949188232, "learning_rate": 4.739321594922423e-06, "loss": 0.1493, "step": 1735 }, { "epoch": 0.33193116634799236, "grad_norm": 1.3313581943511963, "learning_rate": 4.7396877160436e-06, "loss": 0.048, "step": 1736 }, { "epoch": 0.3321223709369025, "grad_norm": 1.1828702688217163, "learning_rate": 4.740053626326225e-06, "loss": 0.0575, "step": 1737 }, { "epoch": 0.33231357552581264, "grad_norm": 3.091031551361084, "learning_rate": 4.740419326012995e-06, "loss": 0.4912, "step": 1738 }, { "epoch": 0.33250478011472273, "grad_norm": 2.2998857498168945, "learning_rate": 4.740784815346178e-06, "loss": 0.2701, "step": 1739 }, { "epoch": 0.3326959847036329, "grad_norm": 1.3561292886734009, "learning_rate": 4.741150094567632e-06, "loss": 0.2125, "step": 1740 }, { "epoch": 0.332887189292543, "grad_norm": 0.8512531518936157, "learning_rate": 4.7415151639187964e-06, "loss": 0.077, "step": 1741 }, { "epoch": 0.33307839388145316, "grad_norm": 1.6298433542251587, "learning_rate": 4.741880023640691e-06, "loss": 0.1782, "step": 1742 }, { "epoch": 0.3332695984703633, "grad_norm": 3.3909101486206055, "learning_rate": 4.742244673973925e-06, "loss": 0.2529, "step": 1743 }, { "epoch": 0.33346080305927345, "grad_norm": 2.4966087341308594, "learning_rate": 4.742609115158691e-06, "loss": 0.2019, "step": 1744 }, { "epoch": 0.33365200764818354, "grad_norm": 4.023033142089844, "learning_rate": 4.74297334743477e-06, "loss": 0.4731, "step": 1745 }, { "epoch": 0.3338432122370937, "grad_norm": 3.0190768241882324, "learning_rate": 4.74333737104153e-06, "loss": 0.4793, "step": 1746 }, { "epoch": 0.3340344168260038, "grad_norm": 3.289560556411743, "learning_rate": 4.743701186217929e-06, "loss": 0.1652, "step": 1747 }, { "epoch": 0.33422562141491396, "grad_norm": 2.139427423477173, "learning_rate": 4.744064793202513e-06, "loss": 0.137, "step": 1748 }, { "epoch": 0.3344168260038241, "grad_norm": 3.575605869293213, "learning_rate": 4.74442819223342e-06, "loss": 0.3851, "step": 1749 }, { "epoch": 0.33460803059273425, "grad_norm": 3.1366186141967773, "learning_rate": 4.744791383548379e-06, "loss": 0.1949, "step": 1750 }, { "epoch": 0.33479923518164434, "grad_norm": 1.9750595092773438, "learning_rate": 4.745154367384712e-06, "loss": 0.1648, "step": 1751 }, { "epoch": 0.3349904397705545, "grad_norm": 1.4047598838806152, "learning_rate": 4.745517143979335e-06, "loss": 0.1517, "step": 1752 }, { "epoch": 0.3351816443594646, "grad_norm": 1.289894700050354, "learning_rate": 4.7458797135687565e-06, "loss": 0.1399, "step": 1753 }, { "epoch": 0.33537284894837477, "grad_norm": 0.9425712823867798, "learning_rate": 4.746242076389082e-06, "loss": 0.1055, "step": 1754 }, { "epoch": 0.3355640535372849, "grad_norm": 2.4297313690185547, "learning_rate": 4.746604232676014e-06, "loss": 0.1552, "step": 1755 }, { "epoch": 0.33575525812619506, "grad_norm": 2.6048977375030518, "learning_rate": 4.746966182664851e-06, "loss": 0.1051, "step": 1756 }, { "epoch": 0.33594646271510514, "grad_norm": 3.050530433654785, "learning_rate": 4.747327926590489e-06, "loss": 0.3244, "step": 1757 }, { "epoch": 0.3361376673040153, "grad_norm": 2.425839900970459, "learning_rate": 4.747689464687424e-06, "loss": 0.4203, "step": 1758 }, { "epoch": 0.33632887189292543, "grad_norm": 1.1890416145324707, "learning_rate": 4.748050797189752e-06, "loss": 0.1145, "step": 1759 }, { "epoch": 0.3365200764818356, "grad_norm": 2.5795624256134033, "learning_rate": 4.74841192433117e-06, "loss": 0.2012, "step": 1760 }, { "epoch": 0.3367112810707457, "grad_norm": 1.8465324640274048, "learning_rate": 4.7487728463449755e-06, "loss": 0.1086, "step": 1761 }, { "epoch": 0.33690248565965586, "grad_norm": 1.6357309818267822, "learning_rate": 4.749133563464071e-06, "loss": 0.1407, "step": 1762 }, { "epoch": 0.33709369024856595, "grad_norm": 1.5176138877868652, "learning_rate": 4.749494075920959e-06, "loss": 0.0951, "step": 1763 }, { "epoch": 0.3372848948374761, "grad_norm": 3.06494140625, "learning_rate": 4.749854383947751e-06, "loss": 0.453, "step": 1764 }, { "epoch": 0.33747609942638623, "grad_norm": 3.943192481994629, "learning_rate": 4.7502144877761604e-06, "loss": 0.5386, "step": 1765 }, { "epoch": 0.3376673040152964, "grad_norm": 1.6208007335662842, "learning_rate": 4.750574387637508e-06, "loss": 0.1078, "step": 1766 }, { "epoch": 0.3378585086042065, "grad_norm": 1.6230381727218628, "learning_rate": 4.750934083762721e-06, "loss": 0.0768, "step": 1767 }, { "epoch": 0.3380497131931166, "grad_norm": 1.910230040550232, "learning_rate": 4.751293576382336e-06, "loss": 0.1267, "step": 1768 }, { "epoch": 0.33824091778202675, "grad_norm": 2.327993869781494, "learning_rate": 4.751652865726499e-06, "loss": 0.1338, "step": 1769 }, { "epoch": 0.3384321223709369, "grad_norm": 2.1939034461975098, "learning_rate": 4.752011952024963e-06, "loss": 0.2042, "step": 1770 }, { "epoch": 0.33862332695984704, "grad_norm": 1.9725449085235596, "learning_rate": 4.752370835507094e-06, "loss": 0.2497, "step": 1771 }, { "epoch": 0.3388145315487572, "grad_norm": 3.0306098461151123, "learning_rate": 4.752729516401868e-06, "loss": 0.3951, "step": 1772 }, { "epoch": 0.3390057361376673, "grad_norm": 2.3561394214630127, "learning_rate": 4.753087994937877e-06, "loss": 0.2254, "step": 1773 }, { "epoch": 0.3391969407265774, "grad_norm": 2.5845770835876465, "learning_rate": 4.753446271343321e-06, "loss": 0.2025, "step": 1774 }, { "epoch": 0.33938814531548755, "grad_norm": 2.2755463123321533, "learning_rate": 4.753804345846018e-06, "loss": 0.1169, "step": 1775 }, { "epoch": 0.3395793499043977, "grad_norm": 2.320425271987915, "learning_rate": 4.754162218673401e-06, "loss": 0.3507, "step": 1776 }, { "epoch": 0.33977055449330784, "grad_norm": 1.7574282884597778, "learning_rate": 4.754519890052516e-06, "loss": 0.3012, "step": 1777 }, { "epoch": 0.339961759082218, "grad_norm": 1.9989606142044067, "learning_rate": 4.75487736021003e-06, "loss": 0.2198, "step": 1778 }, { "epoch": 0.34015296367112813, "grad_norm": 1.8190861940383911, "learning_rate": 4.7552346293722235e-06, "loss": 0.1237, "step": 1779 }, { "epoch": 0.3403441682600382, "grad_norm": 1.9230531454086304, "learning_rate": 4.755591697764998e-06, "loss": 0.1621, "step": 1780 }, { "epoch": 0.34053537284894836, "grad_norm": 2.197153091430664, "learning_rate": 4.755948565613874e-06, "loss": 0.1253, "step": 1781 }, { "epoch": 0.3407265774378585, "grad_norm": 2.13446044921875, "learning_rate": 4.756305233143992e-06, "loss": 0.2251, "step": 1782 }, { "epoch": 0.34091778202676865, "grad_norm": 1.634322166442871, "learning_rate": 4.756661700580113e-06, "loss": 0.1433, "step": 1783 }, { "epoch": 0.3411089866156788, "grad_norm": 1.0584380626678467, "learning_rate": 4.757017968146622e-06, "loss": 0.0601, "step": 1784 }, { "epoch": 0.34130019120458893, "grad_norm": 2.613311529159546, "learning_rate": 4.757374036067523e-06, "loss": 0.3052, "step": 1785 }, { "epoch": 0.341491395793499, "grad_norm": 2.105159044265747, "learning_rate": 4.757729904566448e-06, "loss": 0.0758, "step": 1786 }, { "epoch": 0.34168260038240916, "grad_norm": 2.2200515270233154, "learning_rate": 4.75808557386665e-06, "loss": 0.2331, "step": 1787 }, { "epoch": 0.3418738049713193, "grad_norm": 1.8687654733657837, "learning_rate": 4.75844104419101e-06, "loss": 0.1472, "step": 1788 }, { "epoch": 0.34206500956022945, "grad_norm": 3.0916874408721924, "learning_rate": 4.758796315762033e-06, "loss": 0.3935, "step": 1789 }, { "epoch": 0.3422562141491396, "grad_norm": 2.0571913719177246, "learning_rate": 4.759151388801852e-06, "loss": 0.1408, "step": 1790 }, { "epoch": 0.34244741873804974, "grad_norm": 2.078958511352539, "learning_rate": 4.759506263532227e-06, "loss": 0.2162, "step": 1791 }, { "epoch": 0.3426386233269598, "grad_norm": 1.2904046773910522, "learning_rate": 4.759860940174549e-06, "loss": 0.1126, "step": 1792 }, { "epoch": 0.34282982791586997, "grad_norm": 1.762441635131836, "learning_rate": 4.760215418949835e-06, "loss": 0.1004, "step": 1793 }, { "epoch": 0.3430210325047801, "grad_norm": 1.8758217096328735, "learning_rate": 4.760569700078735e-06, "loss": 0.0913, "step": 1794 }, { "epoch": 0.34321223709369025, "grad_norm": 2.6033754348754883, "learning_rate": 4.760923783781529e-06, "loss": 0.2553, "step": 1795 }, { "epoch": 0.3434034416826004, "grad_norm": 3.3962483406066895, "learning_rate": 4.76127767027813e-06, "loss": 0.5496, "step": 1796 }, { "epoch": 0.34359464627151054, "grad_norm": 1.7760767936706543, "learning_rate": 4.7616313597880805e-06, "loss": 0.223, "step": 1797 }, { "epoch": 0.3437858508604206, "grad_norm": 2.5091898441314697, "learning_rate": 4.761984852530561e-06, "loss": 0.0785, "step": 1798 }, { "epoch": 0.34397705544933077, "grad_norm": 2.1788854598999023, "learning_rate": 4.762338148724385e-06, "loss": 0.2495, "step": 1799 }, { "epoch": 0.3441682600382409, "grad_norm": 2.8695642948150635, "learning_rate": 4.762691248587998e-06, "loss": 0.1116, "step": 1800 }, { "epoch": 0.34435946462715106, "grad_norm": 2.591621160507202, "learning_rate": 4.763044152339487e-06, "loss": 0.5715, "step": 1801 }, { "epoch": 0.3445506692160612, "grad_norm": 2.8537497520446777, "learning_rate": 4.76339686019657e-06, "loss": 0.4617, "step": 1802 }, { "epoch": 0.34474187380497134, "grad_norm": 2.406893014907837, "learning_rate": 4.763749372376608e-06, "loss": 0.0832, "step": 1803 }, { "epoch": 0.34493307839388143, "grad_norm": 1.8675955533981323, "learning_rate": 4.764101689096597e-06, "loss": 0.1626, "step": 1804 }, { "epoch": 0.3451242829827916, "grad_norm": 3.17629337310791, "learning_rate": 4.7644538105731735e-06, "loss": 0.3307, "step": 1805 }, { "epoch": 0.3453154875717017, "grad_norm": 2.8467295169830322, "learning_rate": 4.764805737022614e-06, "loss": 0.2995, "step": 1806 }, { "epoch": 0.34550669216061186, "grad_norm": 2.333732843399048, "learning_rate": 4.765157468660835e-06, "loss": 0.3135, "step": 1807 }, { "epoch": 0.345697896749522, "grad_norm": 3.3674793243408203, "learning_rate": 4.7655090057033955e-06, "loss": 0.651, "step": 1808 }, { "epoch": 0.34588910133843215, "grad_norm": 2.2502448558807373, "learning_rate": 4.7658603483654965e-06, "loss": 0.2983, "step": 1809 }, { "epoch": 0.34608030592734224, "grad_norm": 2.531153917312622, "learning_rate": 4.7662114968619835e-06, "loss": 0.3743, "step": 1810 }, { "epoch": 0.3462715105162524, "grad_norm": 2.929173231124878, "learning_rate": 4.766562451407343e-06, "loss": 0.2767, "step": 1811 }, { "epoch": 0.3464627151051625, "grad_norm": 2.2841217517852783, "learning_rate": 4.766913212215711e-06, "loss": 0.1143, "step": 1812 }, { "epoch": 0.34665391969407267, "grad_norm": 1.7222204208374023, "learning_rate": 4.767263779500863e-06, "loss": 0.118, "step": 1813 }, { "epoch": 0.3468451242829828, "grad_norm": 2.1262247562408447, "learning_rate": 4.767614153476226e-06, "loss": 0.1722, "step": 1814 }, { "epoch": 0.34703632887189295, "grad_norm": 1.8248090744018555, "learning_rate": 4.7679643343548724e-06, "loss": 0.1271, "step": 1815 }, { "epoch": 0.34722753346080304, "grad_norm": 1.4642949104309082, "learning_rate": 4.768314322349521e-06, "loss": 0.0987, "step": 1816 }, { "epoch": 0.3474187380497132, "grad_norm": 2.0707128047943115, "learning_rate": 4.768664117672543e-06, "loss": 0.0991, "step": 1817 }, { "epoch": 0.3476099426386233, "grad_norm": 1.105255126953125, "learning_rate": 4.769013720535954e-06, "loss": 0.0739, "step": 1818 }, { "epoch": 0.34780114722753347, "grad_norm": 1.9014976024627686, "learning_rate": 4.769363131151425e-06, "loss": 0.1378, "step": 1819 }, { "epoch": 0.3479923518164436, "grad_norm": 2.5240933895111084, "learning_rate": 4.769712349730274e-06, "loss": 0.4471, "step": 1820 }, { "epoch": 0.34818355640535376, "grad_norm": 1.6601219177246094, "learning_rate": 4.770061376483473e-06, "loss": 0.1254, "step": 1821 }, { "epoch": 0.34837476099426384, "grad_norm": 2.0070979595184326, "learning_rate": 4.770410211621644e-06, "loss": 0.1297, "step": 1822 }, { "epoch": 0.348565965583174, "grad_norm": 2.647048234939575, "learning_rate": 4.7707588553550665e-06, "loss": 0.2616, "step": 1823 }, { "epoch": 0.34875717017208413, "grad_norm": 1.199033260345459, "learning_rate": 4.77110730789367e-06, "loss": 0.0857, "step": 1824 }, { "epoch": 0.3489483747609943, "grad_norm": 2.3734302520751953, "learning_rate": 4.771455569447043e-06, "loss": 0.0676, "step": 1825 }, { "epoch": 0.3491395793499044, "grad_norm": 2.570103883743286, "learning_rate": 4.771803640224424e-06, "loss": 0.3698, "step": 1826 }, { "epoch": 0.3493307839388145, "grad_norm": 1.8896373510360718, "learning_rate": 4.7721515204347135e-06, "loss": 0.1165, "step": 1827 }, { "epoch": 0.34952198852772465, "grad_norm": 2.3569037914276123, "learning_rate": 4.772499210286465e-06, "loss": 0.2643, "step": 1828 }, { "epoch": 0.3497131931166348, "grad_norm": 3.0266520977020264, "learning_rate": 4.772846709987891e-06, "loss": 0.3738, "step": 1829 }, { "epoch": 0.34990439770554493, "grad_norm": 3.1515417098999023, "learning_rate": 4.773194019746864e-06, "loss": 0.1745, "step": 1830 }, { "epoch": 0.3500956022944551, "grad_norm": 1.1789801120758057, "learning_rate": 4.773541139770914e-06, "loss": 0.0523, "step": 1831 }, { "epoch": 0.3502868068833652, "grad_norm": 2.357591152191162, "learning_rate": 4.7738880702672316e-06, "loss": 0.3621, "step": 1832 }, { "epoch": 0.3504780114722753, "grad_norm": 1.8474746942520142, "learning_rate": 4.77423481144267e-06, "loss": 0.3042, "step": 1833 }, { "epoch": 0.35066921606118545, "grad_norm": 1.6401458978652954, "learning_rate": 4.77458136350374e-06, "loss": 0.1098, "step": 1834 }, { "epoch": 0.3508604206500956, "grad_norm": 2.561168670654297, "learning_rate": 4.774927726656617e-06, "loss": 0.3783, "step": 1835 }, { "epoch": 0.35105162523900574, "grad_norm": 2.586186170578003, "learning_rate": 4.775273901107143e-06, "loss": 0.1671, "step": 1836 }, { "epoch": 0.3512428298279159, "grad_norm": 0.993121325969696, "learning_rate": 4.775619887060815e-06, "loss": 0.0358, "step": 1837 }, { "epoch": 0.351434034416826, "grad_norm": 3.0413339138031006, "learning_rate": 4.775965684722804e-06, "loss": 0.1799, "step": 1838 }, { "epoch": 0.3516252390057361, "grad_norm": 2.575594902038574, "learning_rate": 4.776311294297939e-06, "loss": 0.3987, "step": 1839 }, { "epoch": 0.35181644359464626, "grad_norm": 2.484976053237915, "learning_rate": 4.776656715990719e-06, "loss": 0.201, "step": 1840 }, { "epoch": 0.3520076481835564, "grad_norm": 2.7293996810913086, "learning_rate": 4.777001950005309e-06, "loss": 0.5425, "step": 1841 }, { "epoch": 0.35219885277246654, "grad_norm": 1.7221540212631226, "learning_rate": 4.77734699654554e-06, "loss": 0.1215, "step": 1842 }, { "epoch": 0.3523900573613767, "grad_norm": 2.226285696029663, "learning_rate": 4.777691855814912e-06, "loss": 0.2002, "step": 1843 }, { "epoch": 0.35258126195028683, "grad_norm": 3.086306571960449, "learning_rate": 4.778036528016594e-06, "loss": 0.2252, "step": 1844 }, { "epoch": 0.3527724665391969, "grad_norm": 2.252471685409546, "learning_rate": 4.778381013353426e-06, "loss": 0.2922, "step": 1845 }, { "epoch": 0.35296367112810706, "grad_norm": 3.0337913036346436, "learning_rate": 4.778725312027913e-06, "loss": 0.3342, "step": 1846 }, { "epoch": 0.3531548757170172, "grad_norm": 2.6601107120513916, "learning_rate": 4.7790694242422385e-06, "loss": 0.2105, "step": 1847 }, { "epoch": 0.35334608030592735, "grad_norm": 1.9365030527114868, "learning_rate": 4.779413350198251e-06, "loss": 0.192, "step": 1848 }, { "epoch": 0.3535372848948375, "grad_norm": 2.894613027572632, "learning_rate": 4.779757090097476e-06, "loss": 0.175, "step": 1849 }, { "epoch": 0.35372848948374763, "grad_norm": 2.1319048404693604, "learning_rate": 4.780100644141109e-06, "loss": 0.1175, "step": 1850 }, { "epoch": 0.3539196940726577, "grad_norm": 2.9696707725524902, "learning_rate": 4.780444012530022e-06, "loss": 0.2393, "step": 1851 }, { "epoch": 0.35411089866156786, "grad_norm": 2.417073965072632, "learning_rate": 4.780787195464761e-06, "loss": 0.335, "step": 1852 }, { "epoch": 0.354302103250478, "grad_norm": 2.466055154800415, "learning_rate": 4.7811301931455436e-06, "loss": 0.3433, "step": 1853 }, { "epoch": 0.35449330783938815, "grad_norm": 2.6776459217071533, "learning_rate": 4.781473005772269e-06, "loss": 0.3252, "step": 1854 }, { "epoch": 0.3546845124282983, "grad_norm": 1.4896794557571411, "learning_rate": 4.7818156335445075e-06, "loss": 0.1096, "step": 1855 }, { "epoch": 0.35487571701720844, "grad_norm": 2.7048983573913574, "learning_rate": 4.782158076661511e-06, "loss": 0.2342, "step": 1856 }, { "epoch": 0.3550669216061185, "grad_norm": 2.1952528953552246, "learning_rate": 4.782500335322208e-06, "loss": 0.2551, "step": 1857 }, { "epoch": 0.35525812619502867, "grad_norm": 2.7355797290802, "learning_rate": 4.782842409725205e-06, "loss": 0.4026, "step": 1858 }, { "epoch": 0.3554493307839388, "grad_norm": 1.9009243249893188, "learning_rate": 4.78318430006879e-06, "loss": 0.169, "step": 1859 }, { "epoch": 0.35564053537284895, "grad_norm": 1.3028790950775146, "learning_rate": 4.783526006550927e-06, "loss": 0.1357, "step": 1860 }, { "epoch": 0.3558317399617591, "grad_norm": 2.2286794185638428, "learning_rate": 4.783867529369265e-06, "loss": 0.1787, "step": 1861 }, { "epoch": 0.35602294455066924, "grad_norm": 1.97344970703125, "learning_rate": 4.784208868721133e-06, "loss": 0.0833, "step": 1862 }, { "epoch": 0.35621414913957933, "grad_norm": 2.151817560195923, "learning_rate": 4.784550024803541e-06, "loss": 0.154, "step": 1863 }, { "epoch": 0.35640535372848947, "grad_norm": 1.631117582321167, "learning_rate": 4.784890997813184e-06, "loss": 0.1474, "step": 1864 }, { "epoch": 0.3565965583173996, "grad_norm": 2.0637216567993164, "learning_rate": 4.785231787946437e-06, "loss": 0.1154, "step": 1865 }, { "epoch": 0.35678776290630976, "grad_norm": 1.8624078035354614, "learning_rate": 4.785572395399365e-06, "loss": 0.1261, "step": 1866 }, { "epoch": 0.3569789674952199, "grad_norm": 2.4838550090789795, "learning_rate": 4.785912820367712e-06, "loss": 0.2282, "step": 1867 }, { "epoch": 0.35717017208413004, "grad_norm": 2.778296709060669, "learning_rate": 4.786253063046911e-06, "loss": 0.3457, "step": 1868 }, { "epoch": 0.35736137667304013, "grad_norm": 3.7155518531799316, "learning_rate": 4.7865931236320795e-06, "loss": 0.4199, "step": 1869 }, { "epoch": 0.3575525812619503, "grad_norm": 2.4353044033050537, "learning_rate": 4.7869330023180235e-06, "loss": 0.1885, "step": 1870 }, { "epoch": 0.3577437858508604, "grad_norm": 2.149007797241211, "learning_rate": 4.787272699299234e-06, "loss": 0.1746, "step": 1871 }, { "epoch": 0.35793499043977056, "grad_norm": 3.386817216873169, "learning_rate": 4.787612214769893e-06, "loss": 0.2888, "step": 1872 }, { "epoch": 0.3581261950286807, "grad_norm": 1.4428246021270752, "learning_rate": 4.78795154892387e-06, "loss": 0.1703, "step": 1873 }, { "epoch": 0.35831739961759085, "grad_norm": 1.719489574432373, "learning_rate": 4.788290701954725e-06, "loss": 0.113, "step": 1874 }, { "epoch": 0.35850860420650094, "grad_norm": 2.9463346004486084, "learning_rate": 4.788629674055707e-06, "loss": 0.1265, "step": 1875 }, { "epoch": 0.3586998087954111, "grad_norm": 1.8955507278442383, "learning_rate": 4.788968465419756e-06, "loss": 0.1493, "step": 1876 }, { "epoch": 0.3588910133843212, "grad_norm": 1.885257601737976, "learning_rate": 4.789307076239504e-06, "loss": 0.3264, "step": 1877 }, { "epoch": 0.35908221797323137, "grad_norm": 2.1236987113952637, "learning_rate": 4.789645506707277e-06, "loss": 0.1504, "step": 1878 }, { "epoch": 0.3592734225621415, "grad_norm": 1.592720627784729, "learning_rate": 4.789983757015089e-06, "loss": 0.2391, "step": 1879 }, { "epoch": 0.35946462715105165, "grad_norm": 1.6620198488235474, "learning_rate": 4.790321827354654e-06, "loss": 0.2443, "step": 1880 }, { "epoch": 0.35965583173996174, "grad_norm": 1.3143545389175415, "learning_rate": 4.790659717917373e-06, "loss": 0.1046, "step": 1881 }, { "epoch": 0.3598470363288719, "grad_norm": 1.7060904502868652, "learning_rate": 4.790997428894348e-06, "loss": 0.2334, "step": 1882 }, { "epoch": 0.360038240917782, "grad_norm": 2.8742563724517822, "learning_rate": 4.791334960476374e-06, "loss": 0.4188, "step": 1883 }, { "epoch": 0.36022944550669217, "grad_norm": 3.0253193378448486, "learning_rate": 4.79167231285394e-06, "loss": 0.267, "step": 1884 }, { "epoch": 0.3604206500956023, "grad_norm": 1.5449588298797607, "learning_rate": 4.792009486217236e-06, "loss": 0.1025, "step": 1885 }, { "epoch": 0.3606118546845124, "grad_norm": 2.831562042236328, "learning_rate": 4.792346480756146e-06, "loss": 0.2385, "step": 1886 }, { "epoch": 0.36080305927342254, "grad_norm": 2.17584490776062, "learning_rate": 4.792683296660254e-06, "loss": 0.1348, "step": 1887 }, { "epoch": 0.3609942638623327, "grad_norm": 2.886467456817627, "learning_rate": 4.793019934118841e-06, "loss": 0.3341, "step": 1888 }, { "epoch": 0.36118546845124283, "grad_norm": 2.703188896179199, "learning_rate": 4.793356393320889e-06, "loss": 0.5411, "step": 1889 }, { "epoch": 0.361376673040153, "grad_norm": 2.883558988571167, "learning_rate": 4.79369267445508e-06, "loss": 0.4724, "step": 1890 }, { "epoch": 0.3615678776290631, "grad_norm": 2.0322422981262207, "learning_rate": 4.794028777709793e-06, "loss": 0.1397, "step": 1891 }, { "epoch": 0.3617590822179732, "grad_norm": 2.538921594619751, "learning_rate": 4.794364703273114e-06, "loss": 0.1836, "step": 1892 }, { "epoch": 0.36195028680688335, "grad_norm": 1.7409203052520752, "learning_rate": 4.7947004513328256e-06, "loss": 0.1028, "step": 1893 }, { "epoch": 0.3621414913957935, "grad_norm": 2.6174840927124023, "learning_rate": 4.795036022076417e-06, "loss": 0.228, "step": 1894 }, { "epoch": 0.36233269598470363, "grad_norm": 1.768669605255127, "learning_rate": 4.795371415691077e-06, "loss": 0.2068, "step": 1895 }, { "epoch": 0.3625239005736138, "grad_norm": 1.426334261894226, "learning_rate": 4.795706632363701e-06, "loss": 0.1385, "step": 1896 }, { "epoch": 0.3627151051625239, "grad_norm": 1.599705457687378, "learning_rate": 4.796041672280887e-06, "loss": 0.1014, "step": 1897 }, { "epoch": 0.362906309751434, "grad_norm": 3.4072515964508057, "learning_rate": 4.7963765356289385e-06, "loss": 0.1648, "step": 1898 }, { "epoch": 0.36309751434034415, "grad_norm": 2.493661642074585, "learning_rate": 4.796711222593864e-06, "loss": 0.106, "step": 1899 }, { "epoch": 0.3632887189292543, "grad_norm": 1.6522670984268188, "learning_rate": 4.79704573336138e-06, "loss": 0.0677, "step": 1900 }, { "epoch": 0.36347992351816444, "grad_norm": 2.6305341720581055, "learning_rate": 4.797380068116908e-06, "loss": 0.3414, "step": 1901 }, { "epoch": 0.3636711281070746, "grad_norm": 2.2357025146484375, "learning_rate": 4.797714227045577e-06, "loss": 0.3488, "step": 1902 }, { "epoch": 0.3638623326959847, "grad_norm": 2.4729673862457275, "learning_rate": 4.7980482103322265e-06, "loss": 0.3622, "step": 1903 }, { "epoch": 0.3640535372848948, "grad_norm": 2.1345574855804443, "learning_rate": 4.7983820181614016e-06, "loss": 0.2182, "step": 1904 }, { "epoch": 0.36424474187380496, "grad_norm": 3.113025188446045, "learning_rate": 4.798715650717358e-06, "loss": 0.1702, "step": 1905 }, { "epoch": 0.3644359464627151, "grad_norm": 1.9702924489974976, "learning_rate": 4.799049108184062e-06, "loss": 0.1392, "step": 1906 }, { "epoch": 0.36462715105162524, "grad_norm": 3.3608129024505615, "learning_rate": 4.799382390745188e-06, "loss": 0.6729, "step": 1907 }, { "epoch": 0.3648183556405354, "grad_norm": 1.8882901668548584, "learning_rate": 4.799715498584127e-06, "loss": 0.1397, "step": 1908 }, { "epoch": 0.36500956022944553, "grad_norm": 2.139448642730713, "learning_rate": 4.800048431883974e-06, "loss": 0.2139, "step": 1909 }, { "epoch": 0.3652007648183556, "grad_norm": 1.5788694620132446, "learning_rate": 4.800381190827542e-06, "loss": 0.1232, "step": 1910 }, { "epoch": 0.36539196940726576, "grad_norm": 1.1630561351776123, "learning_rate": 4.800713775597355e-06, "loss": 0.0644, "step": 1911 }, { "epoch": 0.3655831739961759, "grad_norm": 1.5363394021987915, "learning_rate": 4.8010461863756505e-06, "loss": 0.0722, "step": 1912 }, { "epoch": 0.36577437858508605, "grad_norm": 2.2994883060455322, "learning_rate": 4.801378423344381e-06, "loss": 0.234, "step": 1913 }, { "epoch": 0.3659655831739962, "grad_norm": 2.314574956893921, "learning_rate": 4.8017104866852135e-06, "loss": 0.3868, "step": 1914 }, { "epoch": 0.36615678776290633, "grad_norm": 2.861659288406372, "learning_rate": 4.802042376579529e-06, "loss": 0.3618, "step": 1915 }, { "epoch": 0.3663479923518164, "grad_norm": 2.1803858280181885, "learning_rate": 4.802374093208426e-06, "loss": 0.2072, "step": 1916 }, { "epoch": 0.36653919694072656, "grad_norm": 2.282540798187256, "learning_rate": 4.8027056367527195e-06, "loss": 0.1491, "step": 1917 }, { "epoch": 0.3667304015296367, "grad_norm": 2.8533167839050293, "learning_rate": 4.803037007392939e-06, "loss": 0.2092, "step": 1918 }, { "epoch": 0.36692160611854685, "grad_norm": 1.306018590927124, "learning_rate": 4.803368205309336e-06, "loss": 0.0812, "step": 1919 }, { "epoch": 0.367112810707457, "grad_norm": 2.2597286701202393, "learning_rate": 4.803699230681877e-06, "loss": 0.3965, "step": 1920 }, { "epoch": 0.36730401529636714, "grad_norm": 1.759872555732727, "learning_rate": 4.804030083690248e-06, "loss": 0.1801, "step": 1921 }, { "epoch": 0.3674952198852772, "grad_norm": 2.418565273284912, "learning_rate": 4.804360764513856e-06, "loss": 0.3234, "step": 1922 }, { "epoch": 0.36768642447418737, "grad_norm": 2.85204815864563, "learning_rate": 4.804691273331826e-06, "loss": 0.1746, "step": 1923 }, { "epoch": 0.3678776290630975, "grad_norm": 3.226734161376953, "learning_rate": 4.805021610323005e-06, "loss": 0.115, "step": 1924 }, { "epoch": 0.36806883365200765, "grad_norm": 2.01751446723938, "learning_rate": 4.805351775665959e-06, "loss": 0.1831, "step": 1925 }, { "epoch": 0.3682600382409178, "grad_norm": 6.024718761444092, "learning_rate": 4.805681769538982e-06, "loss": 0.3726, "step": 1926 }, { "epoch": 0.36845124282982794, "grad_norm": 1.2591511011123657, "learning_rate": 4.80601159212008e-06, "loss": 0.1463, "step": 1927 }, { "epoch": 0.36864244741873803, "grad_norm": 4.118191719055176, "learning_rate": 4.806341243586993e-06, "loss": 0.4118, "step": 1928 }, { "epoch": 0.36883365200764817, "grad_norm": 3.087899923324585, "learning_rate": 4.806670724117176e-06, "loss": 0.4401, "step": 1929 }, { "epoch": 0.3690248565965583, "grad_norm": 1.9501676559448242, "learning_rate": 4.807000033887813e-06, "loss": 0.1647, "step": 1930 }, { "epoch": 0.36921606118546846, "grad_norm": 1.5643742084503174, "learning_rate": 4.807329173075811e-06, "loss": 0.1077, "step": 1931 }, { "epoch": 0.3694072657743786, "grad_norm": 1.5807640552520752, "learning_rate": 4.8076581418578e-06, "loss": 0.2282, "step": 1932 }, { "epoch": 0.36959847036328874, "grad_norm": 2.416539430618286, "learning_rate": 4.807986940410142e-06, "loss": 0.2228, "step": 1933 }, { "epoch": 0.36978967495219883, "grad_norm": 1.8993134498596191, "learning_rate": 4.808315568908919e-06, "loss": 0.1577, "step": 1934 }, { "epoch": 0.369980879541109, "grad_norm": 2.963465690612793, "learning_rate": 4.808644027529942e-06, "loss": 0.3729, "step": 1935 }, { "epoch": 0.3701720841300191, "grad_norm": 2.0274300575256348, "learning_rate": 4.808972316448751e-06, "loss": 0.1123, "step": 1936 }, { "epoch": 0.37036328871892926, "grad_norm": 1.7501300573349, "learning_rate": 4.809300435840613e-06, "loss": 0.0785, "step": 1937 }, { "epoch": 0.3705544933078394, "grad_norm": 1.8195841312408447, "learning_rate": 4.809628385880523e-06, "loss": 0.138, "step": 1938 }, { "epoch": 0.37074569789674955, "grad_norm": 1.7135430574417114, "learning_rate": 4.809956166743207e-06, "loss": 0.1738, "step": 1939 }, { "epoch": 0.37093690248565964, "grad_norm": 2.0836780071258545, "learning_rate": 4.8102837786031185e-06, "loss": 0.1386, "step": 1940 }, { "epoch": 0.3711281070745698, "grad_norm": 1.6961652040481567, "learning_rate": 4.8106112216344405e-06, "loss": 0.1152, "step": 1941 }, { "epoch": 0.3713193116634799, "grad_norm": 1.7368990182876587, "learning_rate": 4.810938496011093e-06, "loss": 0.1961, "step": 1942 }, { "epoch": 0.37151051625239007, "grad_norm": 1.4816842079162598, "learning_rate": 4.811265601906719e-06, "loss": 0.0777, "step": 1943 }, { "epoch": 0.3717017208413002, "grad_norm": 2.1479954719543457, "learning_rate": 4.8115925394946985e-06, "loss": 0.2723, "step": 1944 }, { "epoch": 0.37189292543021035, "grad_norm": 2.2252044677734375, "learning_rate": 4.811919308948144e-06, "loss": 0.2639, "step": 1945 }, { "epoch": 0.37208413001912044, "grad_norm": 1.9705275297164917, "learning_rate": 4.8122459104399e-06, "loss": 0.1681, "step": 1946 }, { "epoch": 0.3722753346080306, "grad_norm": 3.6255764961242676, "learning_rate": 4.812572344142544e-06, "loss": 0.1158, "step": 1947 }, { "epoch": 0.3724665391969407, "grad_norm": 1.2052736282348633, "learning_rate": 4.812898610228388e-06, "loss": 0.1344, "step": 1948 }, { "epoch": 0.37265774378585087, "grad_norm": 2.316469669342041, "learning_rate": 4.81322470886948e-06, "loss": 0.2374, "step": 1949 }, { "epoch": 0.372848948374761, "grad_norm": 2.0329482555389404, "learning_rate": 4.813550640237602e-06, "loss": 0.0955, "step": 1950 }, { "epoch": 0.3730401529636711, "grad_norm": 2.659797191619873, "learning_rate": 4.813876404504271e-06, "loss": 0.4339, "step": 1951 }, { "epoch": 0.37323135755258124, "grad_norm": 1.6562273502349854, "learning_rate": 4.814202001840742e-06, "loss": 0.1375, "step": 1952 }, { "epoch": 0.3734225621414914, "grad_norm": 3.6155762672424316, "learning_rate": 4.814527432418008e-06, "loss": 0.1702, "step": 1953 }, { "epoch": 0.37361376673040153, "grad_norm": 1.579047679901123, "learning_rate": 4.814852696406796e-06, "loss": 0.1116, "step": 1954 }, { "epoch": 0.3738049713193117, "grad_norm": 2.239649772644043, "learning_rate": 4.815177793977572e-06, "loss": 0.1462, "step": 1955 }, { "epoch": 0.3739961759082218, "grad_norm": 0.9502508640289307, "learning_rate": 4.815502725300541e-06, "loss": 0.0592, "step": 1956 }, { "epoch": 0.3741873804971319, "grad_norm": 3.136199712753296, "learning_rate": 4.815827490545649e-06, "loss": 0.5801, "step": 1957 }, { "epoch": 0.37437858508604205, "grad_norm": 4.284992694854736, "learning_rate": 4.8161520898825794e-06, "loss": 0.1291, "step": 1958 }, { "epoch": 0.3745697896749522, "grad_norm": 2.919841766357422, "learning_rate": 4.816476523480754e-06, "loss": 0.4008, "step": 1959 }, { "epoch": 0.37476099426386233, "grad_norm": 2.5471062660217285, "learning_rate": 4.816800791509338e-06, "loss": 0.0981, "step": 1960 }, { "epoch": 0.3749521988527725, "grad_norm": 2.21247935295105, "learning_rate": 4.817124894137239e-06, "loss": 0.153, "step": 1961 }, { "epoch": 0.3751434034416826, "grad_norm": 2.344128131866455, "learning_rate": 4.8174488315331e-06, "loss": 0.1729, "step": 1962 }, { "epoch": 0.3753346080305927, "grad_norm": 1.6539994478225708, "learning_rate": 4.817772603865314e-06, "loss": 0.1111, "step": 1963 }, { "epoch": 0.37552581261950285, "grad_norm": 2.1132073402404785, "learning_rate": 4.818096211302012e-06, "loss": 0.3434, "step": 1964 }, { "epoch": 0.375717017208413, "grad_norm": 2.918445110321045, "learning_rate": 4.818419654011068e-06, "loss": 0.4585, "step": 1965 }, { "epoch": 0.37590822179732314, "grad_norm": 2.2636373043060303, "learning_rate": 4.818742932160102e-06, "loss": 0.3373, "step": 1966 }, { "epoch": 0.3760994263862333, "grad_norm": 6.420876502990723, "learning_rate": 4.8190660459164775e-06, "loss": 0.1555, "step": 1967 }, { "epoch": 0.3762906309751434, "grad_norm": 3.735738754272461, "learning_rate": 4.819388995447304e-06, "loss": 0.2304, "step": 1968 }, { "epoch": 0.3764818355640535, "grad_norm": 0.7488231062889099, "learning_rate": 4.819711780919433e-06, "loss": 0.0496, "step": 1969 }, { "epoch": 0.37667304015296366, "grad_norm": 3.412890672683716, "learning_rate": 4.8200344024994645e-06, "loss": 0.6286, "step": 1970 }, { "epoch": 0.3768642447418738, "grad_norm": 1.569946050643921, "learning_rate": 4.820356860353744e-06, "loss": 0.1469, "step": 1971 }, { "epoch": 0.37705544933078394, "grad_norm": 1.486785650253296, "learning_rate": 4.820679154648364e-06, "loss": 0.1043, "step": 1972 }, { "epoch": 0.3772466539196941, "grad_norm": 1.4714351892471313, "learning_rate": 4.821001285549165e-06, "loss": 0.0955, "step": 1973 }, { "epoch": 0.37743785850860423, "grad_norm": 3.8283934593200684, "learning_rate": 4.821323253221735e-06, "loss": 0.24, "step": 1974 }, { "epoch": 0.3776290630975143, "grad_norm": 2.3622210025787354, "learning_rate": 4.821645057831409e-06, "loss": 0.2372, "step": 1975 }, { "epoch": 0.37782026768642446, "grad_norm": 2.4711544513702393, "learning_rate": 4.821966699543274e-06, "loss": 0.2733, "step": 1976 }, { "epoch": 0.3780114722753346, "grad_norm": 3.0288548469543457, "learning_rate": 4.822288178522164e-06, "loss": 0.3468, "step": 1977 }, { "epoch": 0.37820267686424475, "grad_norm": 2.11275577545166, "learning_rate": 4.822609494932662e-06, "loss": 0.4129, "step": 1978 }, { "epoch": 0.3783938814531549, "grad_norm": 2.2081563472747803, "learning_rate": 4.822930648939106e-06, "loss": 0.2547, "step": 1979 }, { "epoch": 0.37858508604206503, "grad_norm": 1.3358185291290283, "learning_rate": 4.823251640705579e-06, "loss": 0.0734, "step": 1980 }, { "epoch": 0.3787762906309751, "grad_norm": 1.5681957006454468, "learning_rate": 4.82357247039592e-06, "loss": 0.0682, "step": 1981 }, { "epoch": 0.37896749521988526, "grad_norm": 3.349141836166382, "learning_rate": 4.823893138173719e-06, "loss": 0.2218, "step": 1982 }, { "epoch": 0.3791586998087954, "grad_norm": 1.5513498783111572, "learning_rate": 4.824213644202315e-06, "loss": 0.1234, "step": 1983 }, { "epoch": 0.37934990439770555, "grad_norm": 1.504607081413269, "learning_rate": 4.824533988644806e-06, "loss": 0.114, "step": 1984 }, { "epoch": 0.3795411089866157, "grad_norm": 1.5265016555786133, "learning_rate": 4.8248541716640375e-06, "loss": 0.1458, "step": 1985 }, { "epoch": 0.37973231357552584, "grad_norm": 1.8571374416351318, "learning_rate": 4.825174193422613e-06, "loss": 0.0707, "step": 1986 }, { "epoch": 0.3799235181644359, "grad_norm": 1.7218410968780518, "learning_rate": 4.82549405408289e-06, "loss": 0.0912, "step": 1987 }, { "epoch": 0.38011472275334607, "grad_norm": 2.492093324661255, "learning_rate": 4.825813753806979e-06, "loss": 0.2172, "step": 1988 }, { "epoch": 0.3803059273422562, "grad_norm": 1.584241271018982, "learning_rate": 4.8261332927567455e-06, "loss": 0.1455, "step": 1989 }, { "epoch": 0.38049713193116635, "grad_norm": 1.470940351486206, "learning_rate": 4.826452671093815e-06, "loss": 0.1538, "step": 1990 }, { "epoch": 0.3806883365200765, "grad_norm": 0.8330782055854797, "learning_rate": 4.826771888979564e-06, "loss": 0.0504, "step": 1991 }, { "epoch": 0.38087954110898664, "grad_norm": 2.128312110900879, "learning_rate": 4.827090946575131e-06, "loss": 0.1854, "step": 1992 }, { "epoch": 0.38107074569789673, "grad_norm": 2.1762027740478516, "learning_rate": 4.827409844041409e-06, "loss": 0.1338, "step": 1993 }, { "epoch": 0.38126195028680687, "grad_norm": 6.44410514831543, "learning_rate": 4.827728581539049e-06, "loss": 0.4493, "step": 1994 }, { "epoch": 0.381453154875717, "grad_norm": 2.3031179904937744, "learning_rate": 4.828047159228461e-06, "loss": 0.1516, "step": 1995 }, { "epoch": 0.38164435946462716, "grad_norm": 1.6921497583389282, "learning_rate": 4.828365577269813e-06, "loss": 0.2124, "step": 1996 }, { "epoch": 0.3818355640535373, "grad_norm": 2.609638214111328, "learning_rate": 4.828683835823034e-06, "loss": 0.3682, "step": 1997 }, { "epoch": 0.38202676864244745, "grad_norm": 1.6939135789871216, "learning_rate": 4.82900193504781e-06, "loss": 0.1457, "step": 1998 }, { "epoch": 0.38221797323135753, "grad_norm": 1.4097681045532227, "learning_rate": 4.829319875103591e-06, "loss": 0.0805, "step": 1999 }, { "epoch": 0.3824091778202677, "grad_norm": 1.39816153049469, "learning_rate": 4.829637656149586e-06, "loss": 0.1119, "step": 2000 }, { "epoch": 0.3824091778202677, "eval_runtime": 849.7838, "eval_samples_per_second": 1.805, "eval_steps_per_second": 0.226, "step": 2000 }, { "epoch": 0.3826003824091778, "grad_norm": 2.89485502243042, "learning_rate": 4.829955278344763e-06, "loss": 0.2111, "step": 2001 }, { "epoch": 0.38279158699808796, "grad_norm": 2.1980834007263184, "learning_rate": 4.830272741847855e-06, "loss": 0.2678, "step": 2002 }, { "epoch": 0.3829827915869981, "grad_norm": 6.06950569152832, "learning_rate": 4.830590046817356e-06, "loss": 0.5378, "step": 2003 }, { "epoch": 0.38317399617590825, "grad_norm": 1.7232050895690918, "learning_rate": 4.830907193411522e-06, "loss": 0.1375, "step": 2004 }, { "epoch": 0.38336520076481834, "grad_norm": 2.8828940391540527, "learning_rate": 4.831224181788373e-06, "loss": 0.2377, "step": 2005 }, { "epoch": 0.3835564053537285, "grad_norm": 2.6020548343658447, "learning_rate": 4.831541012105694e-06, "loss": 0.1445, "step": 2006 }, { "epoch": 0.3837476099426386, "grad_norm": 4.2495341300964355, "learning_rate": 4.83185768452103e-06, "loss": 0.3976, "step": 2007 }, { "epoch": 0.38393881453154877, "grad_norm": 1.559543251991272, "learning_rate": 4.832174199191696e-06, "loss": 0.2264, "step": 2008 }, { "epoch": 0.3841300191204589, "grad_norm": 1.8970059156417847, "learning_rate": 4.8324905562747656e-06, "loss": 0.1736, "step": 2009 }, { "epoch": 0.384321223709369, "grad_norm": 1.7815033197402954, "learning_rate": 4.832806755927084e-06, "loss": 0.1395, "step": 2010 }, { "epoch": 0.38451242829827914, "grad_norm": 1.9595319032669067, "learning_rate": 4.833122798305259e-06, "loss": 0.0992, "step": 2011 }, { "epoch": 0.3847036328871893, "grad_norm": 1.122615098953247, "learning_rate": 4.8334386835656655e-06, "loss": 0.0475, "step": 2012 }, { "epoch": 0.3848948374760994, "grad_norm": 1.7938095331192017, "learning_rate": 4.8337544118644455e-06, "loss": 0.0843, "step": 2013 }, { "epoch": 0.38508604206500957, "grad_norm": 2.116589307785034, "learning_rate": 4.834069983357508e-06, "loss": 0.2812, "step": 2014 }, { "epoch": 0.3852772466539197, "grad_norm": 1.9789377450942993, "learning_rate": 4.83438539820053e-06, "loss": 0.1502, "step": 2015 }, { "epoch": 0.3854684512428298, "grad_norm": 1.4772371053695679, "learning_rate": 4.834700656548958e-06, "loss": 0.119, "step": 2016 }, { "epoch": 0.38565965583173994, "grad_norm": 1.6071560382843018, "learning_rate": 4.835015758558004e-06, "loss": 0.1266, "step": 2017 }, { "epoch": 0.3858508604206501, "grad_norm": 1.8250123262405396, "learning_rate": 4.835330704382654e-06, "loss": 0.1408, "step": 2018 }, { "epoch": 0.38604206500956023, "grad_norm": 1.7217341661453247, "learning_rate": 4.835645494177658e-06, "loss": 0.0784, "step": 2019 }, { "epoch": 0.3862332695984704, "grad_norm": 2.3684465885162354, "learning_rate": 4.835960128097542e-06, "loss": 0.336, "step": 2020 }, { "epoch": 0.3864244741873805, "grad_norm": 2.564565420150757, "learning_rate": 4.836274606296597e-06, "loss": 0.4045, "step": 2021 }, { "epoch": 0.3866156787762906, "grad_norm": 1.4181162118911743, "learning_rate": 4.8365889289288894e-06, "loss": 0.0994, "step": 2022 }, { "epoch": 0.38680688336520075, "grad_norm": 2.774085760116577, "learning_rate": 4.836903096148254e-06, "loss": 0.2666, "step": 2023 }, { "epoch": 0.3869980879541109, "grad_norm": 2.653083086013794, "learning_rate": 4.837217108108301e-06, "loss": 0.3805, "step": 2024 }, { "epoch": 0.38718929254302104, "grad_norm": 2.3239336013793945, "learning_rate": 4.837530964962407e-06, "loss": 0.086, "step": 2025 }, { "epoch": 0.3873804971319312, "grad_norm": 1.9220002889633179, "learning_rate": 4.837844666863729e-06, "loss": 0.2595, "step": 2026 }, { "epoch": 0.3875717017208413, "grad_norm": 2.506913900375366, "learning_rate": 4.838158213965192e-06, "loss": 0.3344, "step": 2027 }, { "epoch": 0.3877629063097514, "grad_norm": 2.293449640274048, "learning_rate": 4.838471606419496e-06, "loss": 0.1295, "step": 2028 }, { "epoch": 0.38795411089866155, "grad_norm": 2.00219464302063, "learning_rate": 4.8387848443791165e-06, "loss": 0.1283, "step": 2029 }, { "epoch": 0.3881453154875717, "grad_norm": 2.1243393421173096, "learning_rate": 4.839097927996301e-06, "loss": 0.1196, "step": 2030 }, { "epoch": 0.38833652007648184, "grad_norm": 1.4236724376678467, "learning_rate": 4.839410857423075e-06, "loss": 0.1003, "step": 2031 }, { "epoch": 0.388527724665392, "grad_norm": 1.9056960344314575, "learning_rate": 4.839723632811237e-06, "loss": 0.1268, "step": 2032 }, { "epoch": 0.3887189292543021, "grad_norm": 2.2061145305633545, "learning_rate": 4.840036254312363e-06, "loss": 0.3048, "step": 2033 }, { "epoch": 0.3889101338432122, "grad_norm": 1.490218997001648, "learning_rate": 4.840348722077805e-06, "loss": 0.1018, "step": 2034 }, { "epoch": 0.38910133843212236, "grad_norm": 1.3906128406524658, "learning_rate": 4.840661036258691e-06, "loss": 0.079, "step": 2035 }, { "epoch": 0.3892925430210325, "grad_norm": 2.2867212295532227, "learning_rate": 4.840973197005928e-06, "loss": 0.2561, "step": 2036 }, { "epoch": 0.38948374760994264, "grad_norm": 2.6874730587005615, "learning_rate": 4.841285204470199e-06, "loss": 0.1251, "step": 2037 }, { "epoch": 0.3896749521988528, "grad_norm": 2.0944433212280273, "learning_rate": 4.841597058801967e-06, "loss": 0.2646, "step": 2038 }, { "epoch": 0.38986615678776293, "grad_norm": 1.4850279092788696, "learning_rate": 4.84190876015147e-06, "loss": 0.1377, "step": 2039 }, { "epoch": 0.390057361376673, "grad_norm": 1.729572057723999, "learning_rate": 4.8422203086687295e-06, "loss": 0.1603, "step": 2040 }, { "epoch": 0.39024856596558316, "grad_norm": 2.221207618713379, "learning_rate": 4.842531704503544e-06, "loss": 0.1859, "step": 2041 }, { "epoch": 0.3904397705544933, "grad_norm": 1.757230281829834, "learning_rate": 4.842842947805491e-06, "loss": 0.1199, "step": 2042 }, { "epoch": 0.39063097514340345, "grad_norm": 1.1581106185913086, "learning_rate": 4.843154038723931e-06, "loss": 0.0563, "step": 2043 }, { "epoch": 0.3908221797323136, "grad_norm": 1.65236234664917, "learning_rate": 4.843464977408003e-06, "loss": 0.1155, "step": 2044 }, { "epoch": 0.39101338432122373, "grad_norm": 3.325693130493164, "learning_rate": 4.843775764006627e-06, "loss": 0.4827, "step": 2045 }, { "epoch": 0.3912045889101338, "grad_norm": 2.740530014038086, "learning_rate": 4.844086398668508e-06, "loss": 0.3053, "step": 2046 }, { "epoch": 0.39139579349904396, "grad_norm": 4.023008346557617, "learning_rate": 4.844396881542128e-06, "loss": 0.3783, "step": 2047 }, { "epoch": 0.3915869980879541, "grad_norm": 1.8242052793502808, "learning_rate": 4.844707212775756e-06, "loss": 0.1804, "step": 2048 }, { "epoch": 0.39177820267686425, "grad_norm": 1.4388545751571655, "learning_rate": 4.8450173925174395e-06, "loss": 0.0858, "step": 2049 }, { "epoch": 0.3919694072657744, "grad_norm": 2.0084853172302246, "learning_rate": 4.845327420915012e-06, "loss": 0.1291, "step": 2050 }, { "epoch": 0.39216061185468454, "grad_norm": 2.2301721572875977, "learning_rate": 4.845637298116093e-06, "loss": 0.3989, "step": 2051 }, { "epoch": 0.3923518164435946, "grad_norm": 1.3058888912200928, "learning_rate": 4.84594702426808e-06, "loss": 0.1435, "step": 2052 }, { "epoch": 0.39254302103250477, "grad_norm": 2.7791600227355957, "learning_rate": 4.84625659951816e-06, "loss": 0.4347, "step": 2053 }, { "epoch": 0.3927342256214149, "grad_norm": 2.30441951751709, "learning_rate": 4.846566024013304e-06, "loss": 0.1264, "step": 2054 }, { "epoch": 0.39292543021032506, "grad_norm": 1.0920432806015015, "learning_rate": 4.846875297900267e-06, "loss": 0.0552, "step": 2055 }, { "epoch": 0.3931166347992352, "grad_norm": 1.924726128578186, "learning_rate": 4.847184421325591e-06, "loss": 0.1238, "step": 2056 }, { "epoch": 0.39330783938814534, "grad_norm": 2.9873266220092773, "learning_rate": 4.847493394435604e-06, "loss": 0.3692, "step": 2057 }, { "epoch": 0.39349904397705543, "grad_norm": 3.197446584701538, "learning_rate": 4.84780221737642e-06, "loss": 0.3972, "step": 2058 }, { "epoch": 0.3936902485659656, "grad_norm": 3.1228456497192383, "learning_rate": 4.8481108902939405e-06, "loss": 0.4105, "step": 2059 }, { "epoch": 0.3938814531548757, "grad_norm": 5.650298595428467, "learning_rate": 4.8484194133338555e-06, "loss": 0.3185, "step": 2060 }, { "epoch": 0.39407265774378586, "grad_norm": 1.6081198453903198, "learning_rate": 4.8487277866416415e-06, "loss": 0.1289, "step": 2061 }, { "epoch": 0.394263862332696, "grad_norm": 2.226999521255493, "learning_rate": 4.849036010362564e-06, "loss": 0.0994, "step": 2062 }, { "epoch": 0.39445506692160615, "grad_norm": 2.3722176551818848, "learning_rate": 4.8493440846416755e-06, "loss": 0.1002, "step": 2063 }, { "epoch": 0.39464627151051623, "grad_norm": 3.996410369873047, "learning_rate": 4.8496520096238205e-06, "loss": 0.5538, "step": 2064 }, { "epoch": 0.3948374760994264, "grad_norm": 2.3079299926757812, "learning_rate": 4.8499597854536305e-06, "loss": 0.2703, "step": 2065 }, { "epoch": 0.3950286806883365, "grad_norm": 2.9113502502441406, "learning_rate": 4.85026741227553e-06, "loss": 0.3456, "step": 2066 }, { "epoch": 0.39521988527724666, "grad_norm": 2.183955192565918, "learning_rate": 4.850574890233729e-06, "loss": 0.11, "step": 2067 }, { "epoch": 0.3954110898661568, "grad_norm": 2.493313789367676, "learning_rate": 4.850882219472235e-06, "loss": 0.2962, "step": 2068 }, { "epoch": 0.3956022944550669, "grad_norm": 1.643670678138733, "learning_rate": 4.851189400134838e-06, "loss": 0.1328, "step": 2069 }, { "epoch": 0.39579349904397704, "grad_norm": 2.830077886581421, "learning_rate": 4.851496432365128e-06, "loss": 0.3916, "step": 2070 }, { "epoch": 0.3959847036328872, "grad_norm": 1.6414769887924194, "learning_rate": 4.851803316306482e-06, "loss": 0.1513, "step": 2071 }, { "epoch": 0.3961759082217973, "grad_norm": 1.641139268875122, "learning_rate": 4.85211005210207e-06, "loss": 0.1359, "step": 2072 }, { "epoch": 0.39636711281070747, "grad_norm": 2.166752576828003, "learning_rate": 4.852416639894855e-06, "loss": 0.1225, "step": 2073 }, { "epoch": 0.3965583173996176, "grad_norm": 2.580700397491455, "learning_rate": 4.852723079827596e-06, "loss": 0.2727, "step": 2074 }, { "epoch": 0.3967495219885277, "grad_norm": 1.7222399711608887, "learning_rate": 4.85302937204284e-06, "loss": 0.1269, "step": 2075 }, { "epoch": 0.39694072657743784, "grad_norm": 1.9561702013015747, "learning_rate": 4.853335516682933e-06, "loss": 0.2164, "step": 2076 }, { "epoch": 0.397131931166348, "grad_norm": 2.9243381023406982, "learning_rate": 4.853641513890013e-06, "loss": 0.3344, "step": 2077 }, { "epoch": 0.39732313575525813, "grad_norm": 2.7178955078125, "learning_rate": 4.853947363806012e-06, "loss": 0.2405, "step": 2078 }, { "epoch": 0.39751434034416827, "grad_norm": 1.9798709154129028, "learning_rate": 4.85425306657266e-06, "loss": 0.2382, "step": 2079 }, { "epoch": 0.3977055449330784, "grad_norm": 3.2675580978393555, "learning_rate": 4.8545586223314805e-06, "loss": 0.1743, "step": 2080 }, { "epoch": 0.3978967495219885, "grad_norm": 2.0814285278320312, "learning_rate": 4.8548640312237916e-06, "loss": 0.0812, "step": 2081 }, { "epoch": 0.39808795411089865, "grad_norm": 3.074506998062134, "learning_rate": 4.855169293390711e-06, "loss": 0.4034, "step": 2082 }, { "epoch": 0.3982791586998088, "grad_norm": 2.637707233428955, "learning_rate": 4.85547440897315e-06, "loss": 0.4595, "step": 2083 }, { "epoch": 0.39847036328871893, "grad_norm": 1.7692375183105469, "learning_rate": 4.855779378111821e-06, "loss": 0.1297, "step": 2084 }, { "epoch": 0.3986615678776291, "grad_norm": 1.5920054912567139, "learning_rate": 4.856084200947228e-06, "loss": 0.1393, "step": 2085 }, { "epoch": 0.3988527724665392, "grad_norm": 2.117548942565918, "learning_rate": 4.856388877619678e-06, "loss": 0.1037, "step": 2086 }, { "epoch": 0.3990439770554493, "grad_norm": 1.231958270072937, "learning_rate": 4.856693408269275e-06, "loss": 0.0662, "step": 2087 }, { "epoch": 0.39923518164435945, "grad_norm": 1.5708861351013184, "learning_rate": 4.85699779303592e-06, "loss": 0.1032, "step": 2088 }, { "epoch": 0.3994263862332696, "grad_norm": 2.8182146549224854, "learning_rate": 4.857302032059316e-06, "loss": 0.5743, "step": 2089 }, { "epoch": 0.39961759082217974, "grad_norm": 2.6597273349761963, "learning_rate": 4.857606125478961e-06, "loss": 0.3972, "step": 2090 }, { "epoch": 0.3998087954110899, "grad_norm": 2.8121769428253174, "learning_rate": 4.857910073434157e-06, "loss": 0.3327, "step": 2091 }, { "epoch": 0.4, "grad_norm": 2.188518762588501, "learning_rate": 4.858213876064004e-06, "loss": 0.2586, "step": 2092 }, { "epoch": 0.4001912045889101, "grad_norm": 1.4912960529327393, "learning_rate": 4.858517533507403e-06, "loss": 0.0882, "step": 2093 }, { "epoch": 0.40038240917782025, "grad_norm": 3.883857011795044, "learning_rate": 4.858821045903058e-06, "loss": 0.1254, "step": 2094 }, { "epoch": 0.4005736137667304, "grad_norm": 3.3286659717559814, "learning_rate": 4.85912441338947e-06, "loss": 0.741, "step": 2095 }, { "epoch": 0.40076481835564054, "grad_norm": 2.54927134513855, "learning_rate": 4.859427636104946e-06, "loss": 0.1917, "step": 2096 }, { "epoch": 0.4009560229445507, "grad_norm": 3.058338165283203, "learning_rate": 4.859730714187593e-06, "loss": 0.2758, "step": 2097 }, { "epoch": 0.4011472275334608, "grad_norm": 2.1191134452819824, "learning_rate": 4.860033647775319e-06, "loss": 0.0992, "step": 2098 }, { "epoch": 0.4013384321223709, "grad_norm": 1.2811925411224365, "learning_rate": 4.860336437005838e-06, "loss": 0.066, "step": 2099 }, { "epoch": 0.40152963671128106, "grad_norm": 1.372006893157959, "learning_rate": 4.860639082016667e-06, "loss": 0.0831, "step": 2100 }, { "epoch": 0.4017208413001912, "grad_norm": 2.105086088180542, "learning_rate": 4.8609415829451225e-06, "loss": 0.1654, "step": 2101 }, { "epoch": 0.40191204588910134, "grad_norm": 2.2075912952423096, "learning_rate": 4.861243939928331e-06, "loss": 0.2912, "step": 2102 }, { "epoch": 0.4021032504780115, "grad_norm": 1.9700030088424683, "learning_rate": 4.861546153103217e-06, "loss": 0.2135, "step": 2103 }, { "epoch": 0.40229445506692163, "grad_norm": 1.7116718292236328, "learning_rate": 4.861848222606516e-06, "loss": 0.1778, "step": 2104 }, { "epoch": 0.4024856596558317, "grad_norm": 3.5548574924468994, "learning_rate": 4.8621501485747625e-06, "loss": 0.078, "step": 2105 }, { "epoch": 0.40267686424474186, "grad_norm": 1.4873651266098022, "learning_rate": 4.862451931144302e-06, "loss": 0.0588, "step": 2106 }, { "epoch": 0.402868068833652, "grad_norm": 2.477473497390747, "learning_rate": 4.862753570451282e-06, "loss": 0.2443, "step": 2107 }, { "epoch": 0.40305927342256215, "grad_norm": 2.1252503395080566, "learning_rate": 4.863055066631658e-06, "loss": 0.4126, "step": 2108 }, { "epoch": 0.4032504780114723, "grad_norm": 1.0193214416503906, "learning_rate": 4.863356419821191e-06, "loss": 0.0968, "step": 2109 }, { "epoch": 0.40344168260038243, "grad_norm": 3.261441946029663, "learning_rate": 4.863657630155451e-06, "loss": 0.5482, "step": 2110 }, { "epoch": 0.4036328871892925, "grad_norm": 1.2448210716247559, "learning_rate": 4.863958697769811e-06, "loss": 0.1252, "step": 2111 }, { "epoch": 0.40382409177820267, "grad_norm": 2.254920482635498, "learning_rate": 4.864259622799458e-06, "loss": 0.0864, "step": 2112 }, { "epoch": 0.4040152963671128, "grad_norm": 2.846625804901123, "learning_rate": 4.8645604053793795e-06, "loss": 0.2891, "step": 2113 }, { "epoch": 0.40420650095602295, "grad_norm": 2.426459789276123, "learning_rate": 4.864861045644379e-06, "loss": 0.2404, "step": 2114 }, { "epoch": 0.4043977055449331, "grad_norm": 2.1458327770233154, "learning_rate": 4.865161543729063e-06, "loss": 0.1713, "step": 2115 }, { "epoch": 0.40458891013384324, "grad_norm": 3.3918213844299316, "learning_rate": 4.865461899767849e-06, "loss": 0.2598, "step": 2116 }, { "epoch": 0.4047801147227533, "grad_norm": 2.337883710861206, "learning_rate": 4.865762113894966e-06, "loss": 0.2028, "step": 2117 }, { "epoch": 0.40497131931166347, "grad_norm": 1.5774587392807007, "learning_rate": 4.866062186244448e-06, "loss": 0.1215, "step": 2118 }, { "epoch": 0.4051625239005736, "grad_norm": 1.7613880634307861, "learning_rate": 4.8663621169501456e-06, "loss": 0.1357, "step": 2119 }, { "epoch": 0.40535372848948376, "grad_norm": 3.2717082500457764, "learning_rate": 4.866661906145713e-06, "loss": 0.433, "step": 2120 }, { "epoch": 0.4055449330783939, "grad_norm": 2.3337671756744385, "learning_rate": 4.866961553964623e-06, "loss": 0.2526, "step": 2121 }, { "epoch": 0.40573613766730404, "grad_norm": 1.9872267246246338, "learning_rate": 4.867261060540153e-06, "loss": 0.1941, "step": 2122 }, { "epoch": 0.40592734225621413, "grad_norm": 1.2843959331512451, "learning_rate": 4.867560426005394e-06, "loss": 0.0682, "step": 2123 }, { "epoch": 0.4061185468451243, "grad_norm": 1.543263554573059, "learning_rate": 4.8678596504932505e-06, "loss": 0.1491, "step": 2124 }, { "epoch": 0.4063097514340344, "grad_norm": 2.1265337467193604, "learning_rate": 4.868158734136438e-06, "loss": 0.0946, "step": 2125 }, { "epoch": 0.40650095602294456, "grad_norm": 1.6050384044647217, "learning_rate": 4.868457677067485e-06, "loss": 0.2419, "step": 2126 }, { "epoch": 0.4066921606118547, "grad_norm": 2.0895438194274902, "learning_rate": 4.868756479418735e-06, "loss": 0.1516, "step": 2127 }, { "epoch": 0.40688336520076485, "grad_norm": 1.8586974143981934, "learning_rate": 4.8690551413223396e-06, "loss": 0.2194, "step": 2128 }, { "epoch": 0.40707456978967493, "grad_norm": 2.5971615314483643, "learning_rate": 4.869353662910269e-06, "loss": 0.3191, "step": 2129 }, { "epoch": 0.4072657743785851, "grad_norm": 1.6561384201049805, "learning_rate": 4.869652044314306e-06, "loss": 0.1768, "step": 2130 }, { "epoch": 0.4074569789674952, "grad_norm": 2.3525571823120117, "learning_rate": 4.869950285666048e-06, "loss": 0.154, "step": 2131 }, { "epoch": 0.40764818355640536, "grad_norm": 2.6129164695739746, "learning_rate": 4.870248387096907e-06, "loss": 0.1612, "step": 2132 }, { "epoch": 0.4078393881453155, "grad_norm": 1.5268852710723877, "learning_rate": 4.87054634873811e-06, "loss": 0.1151, "step": 2133 }, { "epoch": 0.4080305927342256, "grad_norm": 2.9393773078918457, "learning_rate": 4.8708441707207e-06, "loss": 0.1944, "step": 2134 }, { "epoch": 0.40822179732313574, "grad_norm": 0.9411885738372803, "learning_rate": 4.871141853175533e-06, "loss": 0.0954, "step": 2135 }, { "epoch": 0.4084130019120459, "grad_norm": 1.751671552658081, "learning_rate": 4.871439396233286e-06, "loss": 0.2188, "step": 2136 }, { "epoch": 0.408604206500956, "grad_norm": 2.3655753135681152, "learning_rate": 4.8717368000244496e-06, "loss": 0.3265, "step": 2137 }, { "epoch": 0.40879541108986617, "grad_norm": 1.446745753288269, "learning_rate": 4.8720340646793314e-06, "loss": 0.1341, "step": 2138 }, { "epoch": 0.4089866156787763, "grad_norm": 1.964414358139038, "learning_rate": 4.872331190328057e-06, "loss": 0.3604, "step": 2139 }, { "epoch": 0.4091778202676864, "grad_norm": 1.6994702816009521, "learning_rate": 4.872628177100569e-06, "loss": 0.1487, "step": 2140 }, { "epoch": 0.40936902485659654, "grad_norm": 2.1562836170196533, "learning_rate": 4.872925025126627e-06, "loss": 0.1985, "step": 2141 }, { "epoch": 0.4095602294455067, "grad_norm": 2.705679416656494, "learning_rate": 4.873221734535811e-06, "loss": 0.3949, "step": 2142 }, { "epoch": 0.40975143403441683, "grad_norm": 2.84946870803833, "learning_rate": 4.873518305457518e-06, "loss": 0.1461, "step": 2143 }, { "epoch": 0.40994263862332697, "grad_norm": 5.145754814147949, "learning_rate": 4.873814738020963e-06, "loss": 0.1118, "step": 2144 }, { "epoch": 0.4101338432122371, "grad_norm": 2.6604089736938477, "learning_rate": 4.874111032355183e-06, "loss": 0.3319, "step": 2145 }, { "epoch": 0.4103250478011472, "grad_norm": 2.2231945991516113, "learning_rate": 4.874407188589032e-06, "loss": 0.2436, "step": 2146 }, { "epoch": 0.41051625239005735, "grad_norm": 1.9829926490783691, "learning_rate": 4.874703206851186e-06, "loss": 0.1137, "step": 2147 }, { "epoch": 0.4107074569789675, "grad_norm": 1.793606162071228, "learning_rate": 4.87499908727014e-06, "loss": 0.1267, "step": 2148 }, { "epoch": 0.41089866156787763, "grad_norm": 2.4400620460510254, "learning_rate": 4.8752948299742085e-06, "loss": 0.087, "step": 2149 }, { "epoch": 0.4110898661567878, "grad_norm": 1.6873563528060913, "learning_rate": 4.875590435091529e-06, "loss": 0.0621, "step": 2150 }, { "epoch": 0.4112810707456979, "grad_norm": 1.6609550714492798, "learning_rate": 4.87588590275006e-06, "loss": 0.1239, "step": 2151 }, { "epoch": 0.411472275334608, "grad_norm": 1.9967857599258423, "learning_rate": 4.876181233077581e-06, "loss": 0.3056, "step": 2152 }, { "epoch": 0.41166347992351815, "grad_norm": 2.5311481952667236, "learning_rate": 4.876476426201691e-06, "loss": 0.3258, "step": 2153 }, { "epoch": 0.4118546845124283, "grad_norm": 2.0180413722991943, "learning_rate": 4.876771482249817e-06, "loss": 0.2722, "step": 2154 }, { "epoch": 0.41204588910133844, "grad_norm": 1.2949248552322388, "learning_rate": 4.877066401349204e-06, "loss": 0.091, "step": 2155 }, { "epoch": 0.4122370936902486, "grad_norm": 1.2472654581069946, "learning_rate": 4.87736118362692e-06, "loss": 0.0818, "step": 2156 }, { "epoch": 0.4124282982791587, "grad_norm": 1.8975611925125122, "learning_rate": 4.877655829209858e-06, "loss": 0.2136, "step": 2157 }, { "epoch": 0.4126195028680688, "grad_norm": 2.191755771636963, "learning_rate": 4.877950338224735e-06, "loss": 0.3722, "step": 2158 }, { "epoch": 0.41281070745697895, "grad_norm": 2.048837661743164, "learning_rate": 4.87824471079809e-06, "loss": 0.2638, "step": 2159 }, { "epoch": 0.4130019120458891, "grad_norm": 2.1126198768615723, "learning_rate": 4.878538947056285e-06, "loss": 0.2743, "step": 2160 }, { "epoch": 0.41319311663479924, "grad_norm": 2.0426793098449707, "learning_rate": 4.878833047125512e-06, "loss": 0.3363, "step": 2161 }, { "epoch": 0.4133843212237094, "grad_norm": 1.8592612743377686, "learning_rate": 4.879127011131783e-06, "loss": 0.1042, "step": 2162 }, { "epoch": 0.4135755258126195, "grad_norm": 7.511478424072266, "learning_rate": 4.879420839200937e-06, "loss": 0.2228, "step": 2163 }, { "epoch": 0.4137667304015296, "grad_norm": 1.5293527841567993, "learning_rate": 4.8797145314586365e-06, "loss": 0.1024, "step": 2164 }, { "epoch": 0.41395793499043976, "grad_norm": 4.5838141441345215, "learning_rate": 4.880008088030373e-06, "loss": 0.5163, "step": 2165 }, { "epoch": 0.4141491395793499, "grad_norm": 2.471611738204956, "learning_rate": 4.880301509041461e-06, "loss": 0.2742, "step": 2166 }, { "epoch": 0.41434034416826004, "grad_norm": 2.4510347843170166, "learning_rate": 4.880594794617045e-06, "loss": 0.1341, "step": 2167 }, { "epoch": 0.4145315487571702, "grad_norm": 2.3072543144226074, "learning_rate": 4.880887944882094e-06, "loss": 0.1352, "step": 2168 }, { "epoch": 0.41472275334608033, "grad_norm": 0.9170379638671875, "learning_rate": 4.881180959961403e-06, "loss": 0.0542, "step": 2169 }, { "epoch": 0.4149139579349904, "grad_norm": 2.4316747188568115, "learning_rate": 4.8814738399795956e-06, "loss": 0.4565, "step": 2170 }, { "epoch": 0.41510516252390056, "grad_norm": 2.150325298309326, "learning_rate": 4.881766585061125e-06, "loss": 0.2609, "step": 2171 }, { "epoch": 0.4152963671128107, "grad_norm": 2.830181360244751, "learning_rate": 4.8820591953302706e-06, "loss": 0.3472, "step": 2172 }, { "epoch": 0.41548757170172085, "grad_norm": 1.9228092432022095, "learning_rate": 4.882351670911141e-06, "loss": 0.1312, "step": 2173 }, { "epoch": 0.415678776290631, "grad_norm": 2.210085868835449, "learning_rate": 4.8826440119276715e-06, "loss": 0.1784, "step": 2174 }, { "epoch": 0.41586998087954113, "grad_norm": 5.4337849617004395, "learning_rate": 4.882936218503629e-06, "loss": 0.1746, "step": 2175 }, { "epoch": 0.4160611854684512, "grad_norm": 1.8646759986877441, "learning_rate": 4.883228290762608e-06, "loss": 0.2276, "step": 2176 }, { "epoch": 0.41625239005736137, "grad_norm": 2.343479871749878, "learning_rate": 4.883520228828034e-06, "loss": 0.2489, "step": 2177 }, { "epoch": 0.4164435946462715, "grad_norm": 2.6077628135681152, "learning_rate": 4.88381203282316e-06, "loss": 0.4214, "step": 2178 }, { "epoch": 0.41663479923518165, "grad_norm": 2.1604247093200684, "learning_rate": 4.884103702871074e-06, "loss": 0.1684, "step": 2179 }, { "epoch": 0.4168260038240918, "grad_norm": 2.03054141998291, "learning_rate": 4.884395239094688e-06, "loss": 0.2568, "step": 2180 }, { "epoch": 0.41701720841300194, "grad_norm": 1.8029322624206543, "learning_rate": 4.88468664161675e-06, "loss": 0.3231, "step": 2181 }, { "epoch": 0.417208413001912, "grad_norm": 2.567142963409424, "learning_rate": 4.884977910559839e-06, "loss": 0.4366, "step": 2182 }, { "epoch": 0.41739961759082217, "grad_norm": 1.7099403142929077, "learning_rate": 4.885269046046362e-06, "loss": 0.1528, "step": 2183 }, { "epoch": 0.4175908221797323, "grad_norm": 2.3315675258636475, "learning_rate": 4.885560048198562e-06, "loss": 0.5276, "step": 2184 }, { "epoch": 0.41778202676864246, "grad_norm": 1.4180094003677368, "learning_rate": 4.885850917138509e-06, "loss": 0.0885, "step": 2185 }, { "epoch": 0.4179732313575526, "grad_norm": 1.8287487030029297, "learning_rate": 4.886141652988113e-06, "loss": 0.1236, "step": 2186 }, { "epoch": 0.41816443594646274, "grad_norm": 2.6701676845550537, "learning_rate": 4.886432255869108e-06, "loss": 0.169, "step": 2187 }, { "epoch": 0.41835564053537283, "grad_norm": 2.2629594802856445, "learning_rate": 4.886722725903068e-06, "loss": 0.2824, "step": 2188 }, { "epoch": 0.418546845124283, "grad_norm": 2.599104881286621, "learning_rate": 4.887013063211395e-06, "loss": 0.2329, "step": 2189 }, { "epoch": 0.4187380497131931, "grad_norm": 2.1738619804382324, "learning_rate": 4.887303267915331e-06, "loss": 0.307, "step": 2190 }, { "epoch": 0.41892925430210326, "grad_norm": 2.778266668319702, "learning_rate": 4.887593340135946e-06, "loss": 0.1392, "step": 2191 }, { "epoch": 0.4191204588910134, "grad_norm": 2.5189621448516846, "learning_rate": 4.887883279994146e-06, "loss": 0.3267, "step": 2192 }, { "epoch": 0.4193116634799235, "grad_norm": 2.6800177097320557, "learning_rate": 4.888173087610673e-06, "loss": 0.4112, "step": 2193 }, { "epoch": 0.41950286806883363, "grad_norm": 4.370938777923584, "learning_rate": 4.888462763106103e-06, "loss": 0.2855, "step": 2194 }, { "epoch": 0.4196940726577438, "grad_norm": 2.941232442855835, "learning_rate": 4.888752306600847e-06, "loss": 0.4048, "step": 2195 }, { "epoch": 0.4198852772466539, "grad_norm": 2.167144775390625, "learning_rate": 4.889041718215152e-06, "loss": 0.3469, "step": 2196 }, { "epoch": 0.42007648183556406, "grad_norm": 3.1527018547058105, "learning_rate": 4.889330998069099e-06, "loss": 0.4365, "step": 2197 }, { "epoch": 0.4202676864244742, "grad_norm": 2.6182942390441895, "learning_rate": 4.889620146282609e-06, "loss": 0.2454, "step": 2198 }, { "epoch": 0.4204588910133843, "grad_norm": 1.247412919998169, "learning_rate": 4.889909162975434e-06, "loss": 0.0454, "step": 2199 }, { "epoch": 0.42065009560229444, "grad_norm": 2.049236536026001, "learning_rate": 4.890198048267166e-06, "loss": 0.0835, "step": 2200 }, { "epoch": 0.4208413001912046, "grad_norm": 2.416959285736084, "learning_rate": 4.890486802277235e-06, "loss": 0.2328, "step": 2201 }, { "epoch": 0.4210325047801147, "grad_norm": 2.8074474334716797, "learning_rate": 4.890775425124906e-06, "loss": 0.3223, "step": 2202 }, { "epoch": 0.42122370936902487, "grad_norm": 2.717810869216919, "learning_rate": 4.89106391692928e-06, "loss": 0.3238, "step": 2203 }, { "epoch": 0.421414913957935, "grad_norm": 1.5587702989578247, "learning_rate": 4.891352277809302e-06, "loss": 0.1786, "step": 2204 }, { "epoch": 0.4216061185468451, "grad_norm": 1.952847957611084, "learning_rate": 4.891640507883748e-06, "loss": 0.174, "step": 2205 }, { "epoch": 0.42179732313575524, "grad_norm": 3.144420623779297, "learning_rate": 4.891928607271237e-06, "loss": 0.1949, "step": 2206 }, { "epoch": 0.4219885277246654, "grad_norm": 2.8562870025634766, "learning_rate": 4.892216576090225e-06, "loss": 0.3854, "step": 2207 }, { "epoch": 0.42217973231357553, "grad_norm": 2.9668025970458984, "learning_rate": 4.892504414459007e-06, "loss": 0.4332, "step": 2208 }, { "epoch": 0.42237093690248567, "grad_norm": 2.1173808574676514, "learning_rate": 4.892792122495718e-06, "loss": 0.1733, "step": 2209 }, { "epoch": 0.4225621414913958, "grad_norm": 2.9007081985473633, "learning_rate": 4.893079700318333e-06, "loss": 0.2412, "step": 2210 }, { "epoch": 0.4227533460803059, "grad_norm": 2.2025012969970703, "learning_rate": 4.893367148044665e-06, "loss": 0.2325, "step": 2211 }, { "epoch": 0.42294455066921605, "grad_norm": 2.527050733566284, "learning_rate": 4.893654465792369e-06, "loss": 0.128, "step": 2212 }, { "epoch": 0.4231357552581262, "grad_norm": 1.7862459421157837, "learning_rate": 4.89394165367894e-06, "loss": 0.2455, "step": 2213 }, { "epoch": 0.42332695984703633, "grad_norm": 1.6860567331314087, "learning_rate": 4.894228711821714e-06, "loss": 0.1739, "step": 2214 }, { "epoch": 0.4235181644359465, "grad_norm": 1.391953706741333, "learning_rate": 4.894515640337865e-06, "loss": 0.1023, "step": 2215 }, { "epoch": 0.4237093690248566, "grad_norm": 2.542024850845337, "learning_rate": 4.894802439344414e-06, "loss": 0.352, "step": 2216 }, { "epoch": 0.4239005736137667, "grad_norm": 1.902836799621582, "learning_rate": 4.8950891089582185e-06, "loss": 0.1247, "step": 2217 }, { "epoch": 0.42409177820267685, "grad_norm": 1.8815006017684937, "learning_rate": 4.895375649295982e-06, "loss": 0.1437, "step": 2218 }, { "epoch": 0.424282982791587, "grad_norm": 2.594043731689453, "learning_rate": 4.895662060474246e-06, "loss": 0.1224, "step": 2219 }, { "epoch": 0.42447418738049714, "grad_norm": 2.592609405517578, "learning_rate": 4.895948342609398e-06, "loss": 0.5407, "step": 2220 }, { "epoch": 0.4246653919694073, "grad_norm": 1.1108200550079346, "learning_rate": 4.8962344958176664e-06, "loss": 0.1188, "step": 2221 }, { "epoch": 0.4248565965583174, "grad_norm": 1.9550951719284058, "learning_rate": 4.896520520215123e-06, "loss": 0.1843, "step": 2222 }, { "epoch": 0.4250478011472275, "grad_norm": 2.217928886413574, "learning_rate": 4.8968064159176835e-06, "loss": 0.1519, "step": 2223 }, { "epoch": 0.42523900573613765, "grad_norm": 2.6040468215942383, "learning_rate": 4.897092183041107e-06, "loss": 0.2007, "step": 2224 }, { "epoch": 0.4254302103250478, "grad_norm": 1.7855995893478394, "learning_rate": 4.897377821700995e-06, "loss": 0.1069, "step": 2225 }, { "epoch": 0.42562141491395794, "grad_norm": 2.076179027557373, "learning_rate": 4.897663332012795e-06, "loss": 0.4742, "step": 2226 }, { "epoch": 0.4258126195028681, "grad_norm": 2.4149672985076904, "learning_rate": 4.897948714091799e-06, "loss": 0.2418, "step": 2227 }, { "epoch": 0.4260038240917782, "grad_norm": 2.047053575515747, "learning_rate": 4.898233968053142e-06, "loss": 0.2112, "step": 2228 }, { "epoch": 0.4261950286806883, "grad_norm": 1.8152729272842407, "learning_rate": 4.8985190940118074e-06, "loss": 0.0942, "step": 2229 }, { "epoch": 0.42638623326959846, "grad_norm": 1.7877713441848755, "learning_rate": 4.898804092082618e-06, "loss": 0.1056, "step": 2230 }, { "epoch": 0.4265774378585086, "grad_norm": 1.7800778150558472, "learning_rate": 4.899088962380248e-06, "loss": 0.0929, "step": 2231 }, { "epoch": 0.42676864244741874, "grad_norm": 3.8843071460723877, "learning_rate": 4.899373705019215e-06, "loss": 0.628, "step": 2232 }, { "epoch": 0.4269598470363289, "grad_norm": 3.350219964981079, "learning_rate": 4.899658320113882e-06, "loss": 0.591, "step": 2233 }, { "epoch": 0.42715105162523903, "grad_norm": 1.9053051471710205, "learning_rate": 4.899942807778461e-06, "loss": 0.1806, "step": 2234 }, { "epoch": 0.4273422562141491, "grad_norm": 1.257057547569275, "learning_rate": 4.900227168127006e-06, "loss": 0.0753, "step": 2235 }, { "epoch": 0.42753346080305926, "grad_norm": 1.812961220741272, "learning_rate": 4.900511401273424e-06, "loss": 0.1633, "step": 2236 }, { "epoch": 0.4277246653919694, "grad_norm": 2.06535267829895, "learning_rate": 4.900795507331465e-06, "loss": 0.1393, "step": 2237 }, { "epoch": 0.42791586998087955, "grad_norm": 1.0033485889434814, "learning_rate": 4.901079486414725e-06, "loss": 0.0774, "step": 2238 }, { "epoch": 0.4281070745697897, "grad_norm": 2.0232021808624268, "learning_rate": 4.9013633386366545e-06, "loss": 0.2666, "step": 2239 }, { "epoch": 0.42829827915869984, "grad_norm": 2.4945590496063232, "learning_rate": 4.901647064110545e-06, "loss": 0.4225, "step": 2240 }, { "epoch": 0.4284894837476099, "grad_norm": 2.9069554805755615, "learning_rate": 4.901930662949541e-06, "loss": 0.4286, "step": 2241 }, { "epoch": 0.42868068833652007, "grad_norm": 1.4231261014938354, "learning_rate": 4.902214135266632e-06, "loss": 0.072, "step": 2242 }, { "epoch": 0.4288718929254302, "grad_norm": 1.2749229669570923, "learning_rate": 4.902497481174659e-06, "loss": 0.1584, "step": 2243 }, { "epoch": 0.42906309751434035, "grad_norm": 3.1102404594421387, "learning_rate": 4.902780700786311e-06, "loss": 0.163, "step": 2244 }, { "epoch": 0.4292543021032505, "grad_norm": 2.0278851985931396, "learning_rate": 4.903063794214126e-06, "loss": 0.4301, "step": 2245 }, { "epoch": 0.42944550669216064, "grad_norm": 2.2186572551727295, "learning_rate": 4.903346761570493e-06, "loss": 0.0785, "step": 2246 }, { "epoch": 0.4296367112810707, "grad_norm": 2.2448575496673584, "learning_rate": 4.90362960296765e-06, "loss": 0.2338, "step": 2247 }, { "epoch": 0.42982791586998087, "grad_norm": 2.3453407287597656, "learning_rate": 4.903912318517684e-06, "loss": 0.269, "step": 2248 }, { "epoch": 0.430019120458891, "grad_norm": 1.2500512599945068, "learning_rate": 4.904194908332537e-06, "loss": 0.0725, "step": 2249 }, { "epoch": 0.43021032504780116, "grad_norm": 2.4721336364746094, "learning_rate": 4.904477372523995e-06, "loss": 0.07, "step": 2250 }, { "epoch": 0.4304015296367113, "grad_norm": 2.766767740249634, "learning_rate": 4.9047597112037e-06, "loss": 0.5043, "step": 2251 }, { "epoch": 0.4305927342256214, "grad_norm": 1.717307686805725, "learning_rate": 4.905041924483143e-06, "loss": 0.1655, "step": 2252 }, { "epoch": 0.43078393881453153, "grad_norm": 3.6042754650115967, "learning_rate": 4.905324012473667e-06, "loss": 0.4827, "step": 2253 }, { "epoch": 0.4309751434034417, "grad_norm": 1.890688419342041, "learning_rate": 4.905605975286469e-06, "loss": 0.1853, "step": 2254 }, { "epoch": 0.4311663479923518, "grad_norm": 1.5629879236221313, "learning_rate": 4.9058878130325935e-06, "loss": 0.1271, "step": 2255 }, { "epoch": 0.43135755258126196, "grad_norm": 2.144808053970337, "learning_rate": 4.906169525822942e-06, "loss": 0.1081, "step": 2256 }, { "epoch": 0.4315487571701721, "grad_norm": 1.9382816553115845, "learning_rate": 4.9064511137682635e-06, "loss": 0.2521, "step": 2257 }, { "epoch": 0.4317399617590822, "grad_norm": 1.2302849292755127, "learning_rate": 4.906732576979165e-06, "loss": 0.1157, "step": 2258 }, { "epoch": 0.43193116634799233, "grad_norm": 1.9480595588684082, "learning_rate": 4.907013915566105e-06, "loss": 0.22, "step": 2259 }, { "epoch": 0.4321223709369025, "grad_norm": 1.205723762512207, "learning_rate": 4.907295129639391e-06, "loss": 0.0779, "step": 2260 }, { "epoch": 0.4323135755258126, "grad_norm": 2.0031495094299316, "learning_rate": 4.907576219309192e-06, "loss": 0.2669, "step": 2261 }, { "epoch": 0.43250478011472276, "grad_norm": 1.9002723693847656, "learning_rate": 4.907857184685524e-06, "loss": 0.0908, "step": 2262 }, { "epoch": 0.4326959847036329, "grad_norm": 2.721759796142578, "learning_rate": 4.9081380258782595e-06, "loss": 0.2764, "step": 2263 }, { "epoch": 0.432887189292543, "grad_norm": 3.628314733505249, "learning_rate": 4.908418742997127e-06, "loss": 0.768, "step": 2264 }, { "epoch": 0.43307839388145314, "grad_norm": 1.3902168273925781, "learning_rate": 4.908699336151707e-06, "loss": 0.1548, "step": 2265 }, { "epoch": 0.4332695984703633, "grad_norm": 2.48300838470459, "learning_rate": 4.908979805451437e-06, "loss": 0.4893, "step": 2266 }, { "epoch": 0.4334608030592734, "grad_norm": 2.2376482486724854, "learning_rate": 4.909260151005608e-06, "loss": 0.3067, "step": 2267 }, { "epoch": 0.43365200764818357, "grad_norm": 1.2431788444519043, "learning_rate": 4.909540372923367e-06, "loss": 0.0815, "step": 2268 }, { "epoch": 0.4338432122370937, "grad_norm": 2.4078571796417236, "learning_rate": 4.9098204713137175e-06, "loss": 0.2497, "step": 2269 }, { "epoch": 0.4340344168260038, "grad_norm": 2.028752088546753, "learning_rate": 4.910100446285518e-06, "loss": 0.3107, "step": 2270 }, { "epoch": 0.43422562141491394, "grad_norm": 2.2157468795776367, "learning_rate": 4.910380297947484e-06, "loss": 0.2083, "step": 2271 }, { "epoch": 0.4344168260038241, "grad_norm": 2.5187478065490723, "learning_rate": 4.910660026408185e-06, "loss": 0.2862, "step": 2272 }, { "epoch": 0.43460803059273423, "grad_norm": 2.122492790222168, "learning_rate": 4.910939631776051e-06, "loss": 0.1934, "step": 2273 }, { "epoch": 0.4347992351816444, "grad_norm": 1.5224970579147339, "learning_rate": 4.911219114159365e-06, "loss": 0.0709, "step": 2274 }, { "epoch": 0.4349904397705545, "grad_norm": 1.999548077583313, "learning_rate": 4.91149847366627e-06, "loss": 0.1255, "step": 2275 }, { "epoch": 0.4351816443594646, "grad_norm": 1.97611665725708, "learning_rate": 4.911777710404766e-06, "loss": 0.1653, "step": 2276 }, { "epoch": 0.43537284894837475, "grad_norm": 1.5954374074935913, "learning_rate": 4.91205682448271e-06, "loss": 0.365, "step": 2277 }, { "epoch": 0.4355640535372849, "grad_norm": 1.9923690557479858, "learning_rate": 4.9123358160078154e-06, "loss": 0.4458, "step": 2278 }, { "epoch": 0.43575525812619503, "grad_norm": 2.6444168090820312, "learning_rate": 4.912614685087658e-06, "loss": 0.3102, "step": 2279 }, { "epoch": 0.4359464627151052, "grad_norm": 3.0847392082214355, "learning_rate": 4.9128934318296675e-06, "loss": 0.0966, "step": 2280 }, { "epoch": 0.4361376673040153, "grad_norm": 1.3775651454925537, "learning_rate": 4.913172056341135e-06, "loss": 0.0513, "step": 2281 }, { "epoch": 0.4363288718929254, "grad_norm": 2.156324625015259, "learning_rate": 4.91345055872921e-06, "loss": 0.3312, "step": 2282 }, { "epoch": 0.43652007648183555, "grad_norm": 2.418484926223755, "learning_rate": 4.913728939100901e-06, "loss": 0.1921, "step": 2283 }, { "epoch": 0.4367112810707457, "grad_norm": 0.8661771416664124, "learning_rate": 4.914007197563076e-06, "loss": 0.0958, "step": 2284 }, { "epoch": 0.43690248565965584, "grad_norm": 2.313511371612549, "learning_rate": 4.914285334222461e-06, "loss": 0.3075, "step": 2285 }, { "epoch": 0.437093690248566, "grad_norm": 0.9290204048156738, "learning_rate": 4.914563349185646e-06, "loss": 0.091, "step": 2286 }, { "epoch": 0.4372848948374761, "grad_norm": 1.7797731161117554, "learning_rate": 4.914841242559077e-06, "loss": 0.117, "step": 2287 }, { "epoch": 0.4374760994263862, "grad_norm": 3.0950605869293213, "learning_rate": 4.915119014449062e-06, "loss": 0.4731, "step": 2288 }, { "epoch": 0.43766730401529635, "grad_norm": 2.342931032180786, "learning_rate": 4.915396664961769e-06, "loss": 0.1476, "step": 2289 }, { "epoch": 0.4378585086042065, "grad_norm": 2.6388490200042725, "learning_rate": 4.915674194203229e-06, "loss": 0.5004, "step": 2290 }, { "epoch": 0.43804971319311664, "grad_norm": 1.8596402406692505, "learning_rate": 4.9159516022793316e-06, "loss": 0.1407, "step": 2291 }, { "epoch": 0.4382409177820268, "grad_norm": 2.9163615703582764, "learning_rate": 4.916228889295829e-06, "loss": 0.272, "step": 2292 }, { "epoch": 0.4384321223709369, "grad_norm": 4.297892093658447, "learning_rate": 4.916506055358336e-06, "loss": 0.1782, "step": 2293 }, { "epoch": 0.438623326959847, "grad_norm": 1.6136482954025269, "learning_rate": 4.916783100572327e-06, "loss": 0.1081, "step": 2294 }, { "epoch": 0.43881453154875716, "grad_norm": 2.648573398590088, "learning_rate": 4.917060025043139e-06, "loss": 0.369, "step": 2295 }, { "epoch": 0.4390057361376673, "grad_norm": 2.309370756149292, "learning_rate": 4.917336828875972e-06, "loss": 0.3044, "step": 2296 }, { "epoch": 0.43919694072657744, "grad_norm": 2.0419301986694336, "learning_rate": 4.91761351217589e-06, "loss": 0.1293, "step": 2297 }, { "epoch": 0.4393881453154876, "grad_norm": 1.3703843355178833, "learning_rate": 4.917890075047817e-06, "loss": 0.0763, "step": 2298 }, { "epoch": 0.43957934990439773, "grad_norm": 2.651688814163208, "learning_rate": 4.918166517596542e-06, "loss": 0.0794, "step": 2299 }, { "epoch": 0.4397705544933078, "grad_norm": 2.5015828609466553, "learning_rate": 4.918442839926716e-06, "loss": 0.1203, "step": 2300 }, { "epoch": 0.43996175908221796, "grad_norm": 3.4868128299713135, "learning_rate": 4.918719042142854e-06, "loss": 0.3761, "step": 2301 }, { "epoch": 0.4401529636711281, "grad_norm": 3.072807550430298, "learning_rate": 4.918995124349335e-06, "loss": 0.4009, "step": 2302 }, { "epoch": 0.44034416826003825, "grad_norm": 1.4773715734481812, "learning_rate": 4.9192710866504036e-06, "loss": 0.1333, "step": 2303 }, { "epoch": 0.4405353728489484, "grad_norm": 1.7132066488265991, "learning_rate": 4.919546929150165e-06, "loss": 0.2852, "step": 2304 }, { "epoch": 0.44072657743785854, "grad_norm": 1.9559893608093262, "learning_rate": 4.919822651952591e-06, "loss": 0.0896, "step": 2305 }, { "epoch": 0.4409177820267686, "grad_norm": 3.859649658203125, "learning_rate": 4.920098255161518e-06, "loss": 0.1275, "step": 2306 }, { "epoch": 0.44110898661567877, "grad_norm": 2.1365041732788086, "learning_rate": 4.920373738880649e-06, "loss": 0.2009, "step": 2307 }, { "epoch": 0.4413001912045889, "grad_norm": 2.118523120880127, "learning_rate": 4.920649103213549e-06, "loss": 0.2082, "step": 2308 }, { "epoch": 0.44149139579349905, "grad_norm": 3.3567910194396973, "learning_rate": 4.92092434826365e-06, "loss": 0.2772, "step": 2309 }, { "epoch": 0.4416826003824092, "grad_norm": 2.266333818435669, "learning_rate": 4.921199474134248e-06, "loss": 0.2863, "step": 2310 }, { "epoch": 0.4418738049713193, "grad_norm": 2.4705424308776855, "learning_rate": 4.921474480928509e-06, "loss": 0.1557, "step": 2311 }, { "epoch": 0.4420650095602294, "grad_norm": 0.8634752631187439, "learning_rate": 4.921749368749461e-06, "loss": 0.0341, "step": 2312 }, { "epoch": 0.44225621414913957, "grad_norm": 3.7431929111480713, "learning_rate": 4.9220241377e-06, "loss": 0.337, "step": 2313 }, { "epoch": 0.4424474187380497, "grad_norm": 3.0681843757629395, "learning_rate": 4.92229878788289e-06, "loss": 0.3988, "step": 2314 }, { "epoch": 0.44263862332695986, "grad_norm": 1.9662657976150513, "learning_rate": 4.922573319400757e-06, "loss": 0.2526, "step": 2315 }, { "epoch": 0.44282982791587, "grad_norm": 2.1128218173980713, "learning_rate": 4.9228477323561e-06, "loss": 0.1578, "step": 2316 }, { "epoch": 0.4430210325047801, "grad_norm": 2.3029232025146484, "learning_rate": 4.923122026851282e-06, "loss": 0.188, "step": 2317 }, { "epoch": 0.44321223709369023, "grad_norm": 1.4844509363174438, "learning_rate": 4.923396202988534e-06, "loss": 0.0395, "step": 2318 }, { "epoch": 0.4434034416826004, "grad_norm": 1.5008915662765503, "learning_rate": 4.923670260869953e-06, "loss": 0.0757, "step": 2319 }, { "epoch": 0.4435946462715105, "grad_norm": 3.172825813293457, "learning_rate": 4.923944200597508e-06, "loss": 0.5372, "step": 2320 }, { "epoch": 0.44378585086042066, "grad_norm": 3.6113059520721436, "learning_rate": 4.924218022273032e-06, "loss": 0.4811, "step": 2321 }, { "epoch": 0.4439770554493308, "grad_norm": 1.0926949977874756, "learning_rate": 4.9244917259982296e-06, "loss": 0.1378, "step": 2322 }, { "epoch": 0.4441682600382409, "grad_norm": 2.9779396057128906, "learning_rate": 4.9247653118746715e-06, "loss": 0.1651, "step": 2323 }, { "epoch": 0.44435946462715104, "grad_norm": 1.3684827089309692, "learning_rate": 4.9250387800038e-06, "loss": 0.1091, "step": 2324 }, { "epoch": 0.4445506692160612, "grad_norm": 1.5437180995941162, "learning_rate": 4.9253121304869235e-06, "loss": 0.0794, "step": 2325 }, { "epoch": 0.4447418738049713, "grad_norm": 1.8073052167892456, "learning_rate": 4.925585363425222e-06, "loss": 0.1806, "step": 2326 }, { "epoch": 0.44493307839388146, "grad_norm": 2.79695463180542, "learning_rate": 4.925858478919743e-06, "loss": 0.4065, "step": 2327 }, { "epoch": 0.4451242829827916, "grad_norm": 2.834144115447998, "learning_rate": 4.926131477071406e-06, "loss": 0.4724, "step": 2328 }, { "epoch": 0.4453154875717017, "grad_norm": 1.8955655097961426, "learning_rate": 4.926404357980999e-06, "loss": 0.1503, "step": 2329 }, { "epoch": 0.44550669216061184, "grad_norm": 2.6821250915527344, "learning_rate": 4.9266771217491796e-06, "loss": 0.172, "step": 2330 }, { "epoch": 0.445697896749522, "grad_norm": 1.803351640701294, "learning_rate": 4.926949768476479e-06, "loss": 0.1085, "step": 2331 }, { "epoch": 0.4458891013384321, "grad_norm": 1.929173469543457, "learning_rate": 4.927222298263295e-06, "loss": 0.1473, "step": 2332 }, { "epoch": 0.44608030592734227, "grad_norm": 1.8458374738693237, "learning_rate": 4.927494711209899e-06, "loss": 0.2217, "step": 2333 }, { "epoch": 0.4462715105162524, "grad_norm": 2.2545127868652344, "learning_rate": 4.927767007416432e-06, "loss": 0.2251, "step": 2334 }, { "epoch": 0.4464627151051625, "grad_norm": 2.347503185272217, "learning_rate": 4.928039186982908e-06, "loss": 0.2747, "step": 2335 }, { "epoch": 0.44665391969407264, "grad_norm": 2.8446438312530518, "learning_rate": 4.92831125000921e-06, "loss": 0.2503, "step": 2336 }, { "epoch": 0.4468451242829828, "grad_norm": 1.7588080167770386, "learning_rate": 4.928583196595095e-06, "loss": 0.0731, "step": 2337 }, { "epoch": 0.44703632887189293, "grad_norm": 2.6320197582244873, "learning_rate": 4.92885502684019e-06, "loss": 0.1976, "step": 2338 }, { "epoch": 0.4472275334608031, "grad_norm": 2.0502002239227295, "learning_rate": 4.929126740843998e-06, "loss": 0.2047, "step": 2339 }, { "epoch": 0.4474187380497132, "grad_norm": 1.7559682130813599, "learning_rate": 4.929398338705889e-06, "loss": 0.1507, "step": 2340 }, { "epoch": 0.4476099426386233, "grad_norm": 1.752091407775879, "learning_rate": 4.92966982052511e-06, "loss": 0.1281, "step": 2341 }, { "epoch": 0.44780114722753345, "grad_norm": 3.454530954360962, "learning_rate": 4.92994118640078e-06, "loss": 0.2777, "step": 2342 }, { "epoch": 0.4479923518164436, "grad_norm": 3.0871379375457764, "learning_rate": 4.930212436431887e-06, "loss": 0.2577, "step": 2343 }, { "epoch": 0.44818355640535373, "grad_norm": 2.506565809249878, "learning_rate": 4.9304835707173e-06, "loss": 0.1185, "step": 2344 }, { "epoch": 0.4483747609942639, "grad_norm": 2.834341287612915, "learning_rate": 4.930754589355753e-06, "loss": 0.4537, "step": 2345 }, { "epoch": 0.448565965583174, "grad_norm": 3.0927183628082275, "learning_rate": 4.931025492445859e-06, "loss": 0.6336, "step": 2346 }, { "epoch": 0.4487571701720841, "grad_norm": 2.212230920791626, "learning_rate": 4.931296280086106e-06, "loss": 0.2144, "step": 2347 }, { "epoch": 0.44894837476099425, "grad_norm": 1.8910106420516968, "learning_rate": 4.931566952374851e-06, "loss": 0.1857, "step": 2348 }, { "epoch": 0.4491395793499044, "grad_norm": 1.7347790002822876, "learning_rate": 4.93183750941033e-06, "loss": 0.094, "step": 2349 }, { "epoch": 0.44933078393881454, "grad_norm": 2.411543846130371, "learning_rate": 4.93210795129065e-06, "loss": 0.1578, "step": 2350 }, { "epoch": 0.4495219885277247, "grad_norm": 1.223103642463684, "learning_rate": 4.932378278113796e-06, "loss": 0.1855, "step": 2351 }, { "epoch": 0.4497131931166348, "grad_norm": 2.896897554397583, "learning_rate": 4.932648489977627e-06, "loss": 0.562, "step": 2352 }, { "epoch": 0.4499043977055449, "grad_norm": 2.2476577758789062, "learning_rate": 4.932918586979875e-06, "loss": 0.2595, "step": 2353 }, { "epoch": 0.45009560229445505, "grad_norm": 1.3098033666610718, "learning_rate": 4.93318856921815e-06, "loss": 0.0972, "step": 2354 }, { "epoch": 0.4502868068833652, "grad_norm": 1.3734967708587646, "learning_rate": 4.933458436789937e-06, "loss": 0.1431, "step": 2355 }, { "epoch": 0.45047801147227534, "grad_norm": 1.1524994373321533, "learning_rate": 4.933728189792596e-06, "loss": 0.0973, "step": 2356 }, { "epoch": 0.4506692160611855, "grad_norm": 3.060199022293091, "learning_rate": 4.933997828323365e-06, "loss": 0.4424, "step": 2357 }, { "epoch": 0.45086042065009563, "grad_norm": 3.3533732891082764, "learning_rate": 4.934267352479356e-06, "loss": 0.3997, "step": 2358 }, { "epoch": 0.4510516252390057, "grad_norm": 2.261230945587158, "learning_rate": 4.934536762357558e-06, "loss": 0.2667, "step": 2359 }, { "epoch": 0.45124282982791586, "grad_norm": 1.9926918745040894, "learning_rate": 4.934806058054837e-06, "loss": 0.1408, "step": 2360 }, { "epoch": 0.451434034416826, "grad_norm": 1.3215872049331665, "learning_rate": 4.935075239667939e-06, "loss": 0.1143, "step": 2361 }, { "epoch": 0.45162523900573615, "grad_norm": 2.423924684524536, "learning_rate": 4.93534430729348e-06, "loss": 0.1111, "step": 2362 }, { "epoch": 0.4518164435946463, "grad_norm": 2.364931583404541, "learning_rate": 4.935613261027959e-06, "loss": 0.3041, "step": 2363 }, { "epoch": 0.45200764818355643, "grad_norm": 3.6321725845336914, "learning_rate": 4.9358821009677516e-06, "loss": 0.7906, "step": 2364 }, { "epoch": 0.4521988527724665, "grad_norm": 1.2970473766326904, "learning_rate": 4.93615082720911e-06, "loss": 0.1492, "step": 2365 }, { "epoch": 0.45239005736137666, "grad_norm": 1.3972055912017822, "learning_rate": 4.936419439848165e-06, "loss": 0.2284, "step": 2366 }, { "epoch": 0.4525812619502868, "grad_norm": 2.145207166671753, "learning_rate": 4.936687938980925e-06, "loss": 0.1145, "step": 2367 }, { "epoch": 0.45277246653919695, "grad_norm": 1.5136973857879639, "learning_rate": 4.936956324703276e-06, "loss": 0.1319, "step": 2368 }, { "epoch": 0.4529636711281071, "grad_norm": 2.3273026943206787, "learning_rate": 4.937224597110986e-06, "loss": 0.2864, "step": 2369 }, { "epoch": 0.45315487571701724, "grad_norm": 2.093433141708374, "learning_rate": 4.937492756299697e-06, "loss": 0.1958, "step": 2370 }, { "epoch": 0.4533460803059273, "grad_norm": 1.7759649753570557, "learning_rate": 4.937760802364934e-06, "loss": 0.3099, "step": 2371 }, { "epoch": 0.45353728489483747, "grad_norm": 2.3750076293945312, "learning_rate": 4.938028735402098e-06, "loss": 0.2607, "step": 2372 }, { "epoch": 0.4537284894837476, "grad_norm": 1.7167375087738037, "learning_rate": 4.938296555506473e-06, "loss": 0.1179, "step": 2373 }, { "epoch": 0.45391969407265775, "grad_norm": 2.261927604675293, "learning_rate": 4.938564262773219e-06, "loss": 0.1355, "step": 2374 }, { "epoch": 0.4541108986615679, "grad_norm": 1.3687342405319214, "learning_rate": 4.938831857297376e-06, "loss": 0.0843, "step": 2375 }, { "epoch": 0.454302103250478, "grad_norm": 2.428593158721924, "learning_rate": 4.939099339173867e-06, "loss": 0.1995, "step": 2376 }, { "epoch": 0.4544933078393881, "grad_norm": 1.6332013607025146, "learning_rate": 4.9393667084974925e-06, "loss": 0.1263, "step": 2377 }, { "epoch": 0.45468451242829827, "grad_norm": 1.8396623134613037, "learning_rate": 4.9396339653629345e-06, "loss": 0.153, "step": 2378 }, { "epoch": 0.4548757170172084, "grad_norm": 1.5843632221221924, "learning_rate": 4.939901109864755e-06, "loss": 0.1409, "step": 2379 }, { "epoch": 0.45506692160611856, "grad_norm": 2.6990528106689453, "learning_rate": 4.940168142097398e-06, "loss": 0.2067, "step": 2380 }, { "epoch": 0.4552581261950287, "grad_norm": 1.8671232461929321, "learning_rate": 4.940435062155186e-06, "loss": 0.105, "step": 2381 }, { "epoch": 0.4554493307839388, "grad_norm": 2.4851431846618652, "learning_rate": 4.940701870132325e-06, "loss": 0.277, "step": 2382 }, { "epoch": 0.45564053537284893, "grad_norm": 3.0492827892303467, "learning_rate": 4.940968566122902e-06, "loss": 0.4704, "step": 2383 }, { "epoch": 0.4558317399617591, "grad_norm": 3.329960823059082, "learning_rate": 4.941235150220885e-06, "loss": 0.1627, "step": 2384 }, { "epoch": 0.4560229445506692, "grad_norm": 2.0267832279205322, "learning_rate": 4.941501622520123e-06, "loss": 0.1374, "step": 2385 }, { "epoch": 0.45621414913957936, "grad_norm": 1.8187408447265625, "learning_rate": 4.941767983114349e-06, "loss": 0.2506, "step": 2386 }, { "epoch": 0.4564053537284895, "grad_norm": 1.1165252923965454, "learning_rate": 4.942034232097177e-06, "loss": 0.1032, "step": 2387 }, { "epoch": 0.4565965583173996, "grad_norm": 2.010648250579834, "learning_rate": 4.942300369562102e-06, "loss": 0.1881, "step": 2388 }, { "epoch": 0.45678776290630974, "grad_norm": 4.388535499572754, "learning_rate": 4.942566395602506e-06, "loss": 0.338, "step": 2389 }, { "epoch": 0.4569789674952199, "grad_norm": 2.1628904342651367, "learning_rate": 4.942832310311647e-06, "loss": 0.2078, "step": 2390 }, { "epoch": 0.45717017208413, "grad_norm": 1.473975658416748, "learning_rate": 4.943098113782672e-06, "loss": 0.1039, "step": 2391 }, { "epoch": 0.45736137667304017, "grad_norm": 2.243346691131592, "learning_rate": 4.94336380610861e-06, "loss": 0.0887, "step": 2392 }, { "epoch": 0.4575525812619503, "grad_norm": 1.2243520021438599, "learning_rate": 4.9436293873823705e-06, "loss": 0.1352, "step": 2393 }, { "epoch": 0.4577437858508604, "grad_norm": 1.9870951175689697, "learning_rate": 4.943894857696749e-06, "loss": 0.1092, "step": 2394 }, { "epoch": 0.45793499043977054, "grad_norm": 1.85684335231781, "learning_rate": 4.9441602171444225e-06, "loss": 0.1827, "step": 2395 }, { "epoch": 0.4581261950286807, "grad_norm": 2.156700849533081, "learning_rate": 4.944425465817956e-06, "loss": 0.1789, "step": 2396 }, { "epoch": 0.4583173996175908, "grad_norm": 1.204390287399292, "learning_rate": 4.944690603809794e-06, "loss": 0.0855, "step": 2397 }, { "epoch": 0.45850860420650097, "grad_norm": 1.6291753053665161, "learning_rate": 4.944955631212269e-06, "loss": 0.1035, "step": 2398 }, { "epoch": 0.4586998087954111, "grad_norm": 2.399693012237549, "learning_rate": 4.945220548117595e-06, "loss": 0.2533, "step": 2399 }, { "epoch": 0.4588910133843212, "grad_norm": 1.6935648918151855, "learning_rate": 4.945485354617874e-06, "loss": 0.089, "step": 2400 }, { "epoch": 0.45908221797323134, "grad_norm": 1.8837876319885254, "learning_rate": 4.945750050805088e-06, "loss": 0.2955, "step": 2401 }, { "epoch": 0.4592734225621415, "grad_norm": 1.520376205444336, "learning_rate": 4.946014636771111e-06, "loss": 0.0884, "step": 2402 }, { "epoch": 0.45946462715105163, "grad_norm": 6.492906093597412, "learning_rate": 4.946279112607695e-06, "loss": 0.8193, "step": 2403 }, { "epoch": 0.4596558317399618, "grad_norm": 2.74912691116333, "learning_rate": 4.946543478406484e-06, "loss": 0.2926, "step": 2404 }, { "epoch": 0.4598470363288719, "grad_norm": 1.9996367692947388, "learning_rate": 4.946807734259001e-06, "loss": 0.1018, "step": 2405 }, { "epoch": 0.460038240917782, "grad_norm": 1.903185486793518, "learning_rate": 4.947071880256661e-06, "loss": 0.243, "step": 2406 }, { "epoch": 0.46022944550669215, "grad_norm": 2.6955156326293945, "learning_rate": 4.947335916490763e-06, "loss": 0.3418, "step": 2407 }, { "epoch": 0.4604206500956023, "grad_norm": 4.031216144561768, "learning_rate": 4.947599843052489e-06, "loss": 0.4078, "step": 2408 }, { "epoch": 0.46061185468451243, "grad_norm": 2.110466241836548, "learning_rate": 4.947863660032912e-06, "loss": 0.2372, "step": 2409 }, { "epoch": 0.4608030592734226, "grad_norm": 1.633364200592041, "learning_rate": 4.9481273675229894e-06, "loss": 0.1019, "step": 2410 }, { "epoch": 0.4609942638623327, "grad_norm": 1.452785611152649, "learning_rate": 4.948390965613565e-06, "loss": 0.0943, "step": 2411 }, { "epoch": 0.4611854684512428, "grad_norm": 2.504720449447632, "learning_rate": 4.948654454395372e-06, "loss": 0.1355, "step": 2412 }, { "epoch": 0.46137667304015295, "grad_norm": 2.703850269317627, "learning_rate": 4.948917833959027e-06, "loss": 0.3414, "step": 2413 }, { "epoch": 0.4615678776290631, "grad_norm": 2.818969964981079, "learning_rate": 4.949181104395038e-06, "loss": 0.528, "step": 2414 }, { "epoch": 0.46175908221797324, "grad_norm": 2.11025071144104, "learning_rate": 4.949444265793797e-06, "loss": 0.1207, "step": 2415 }, { "epoch": 0.4619502868068834, "grad_norm": 2.019381523132324, "learning_rate": 4.949707318245586e-06, "loss": 0.1208, "step": 2416 }, { "epoch": 0.4621414913957935, "grad_norm": 2.8823390007019043, "learning_rate": 4.949970261840574e-06, "loss": 0.3685, "step": 2417 }, { "epoch": 0.4623326959847036, "grad_norm": 1.2599728107452393, "learning_rate": 4.950233096668818e-06, "loss": 0.0653, "step": 2418 }, { "epoch": 0.46252390057361376, "grad_norm": 2.618422508239746, "learning_rate": 4.950495822820266e-06, "loss": 0.1693, "step": 2419 }, { "epoch": 0.4627151051625239, "grad_norm": 2.234100341796875, "learning_rate": 4.950758440384748e-06, "loss": 0.1476, "step": 2420 }, { "epoch": 0.46290630975143404, "grad_norm": 1.7235227823257446, "learning_rate": 4.95102094945199e-06, "loss": 0.1403, "step": 2421 }, { "epoch": 0.4630975143403442, "grad_norm": 3.084684133529663, "learning_rate": 4.951283350111601e-06, "loss": 0.4766, "step": 2422 }, { "epoch": 0.46328871892925433, "grad_norm": 1.7967596054077148, "learning_rate": 4.951545642453086e-06, "loss": 0.1491, "step": 2423 }, { "epoch": 0.4634799235181644, "grad_norm": 2.608936071395874, "learning_rate": 4.95180782656583e-06, "loss": 0.2224, "step": 2424 }, { "epoch": 0.46367112810707456, "grad_norm": 1.7240525484085083, "learning_rate": 4.952069902539114e-06, "loss": 0.1001, "step": 2425 }, { "epoch": 0.4638623326959847, "grad_norm": 2.553189754486084, "learning_rate": 4.952331870462108e-06, "loss": 0.3537, "step": 2426 }, { "epoch": 0.46405353728489485, "grad_norm": 1.4191901683807373, "learning_rate": 4.952593730423869e-06, "loss": 0.0765, "step": 2427 }, { "epoch": 0.464244741873805, "grad_norm": 2.872032880783081, "learning_rate": 4.952855482513347e-06, "loss": 0.5042, "step": 2428 }, { "epoch": 0.46443594646271513, "grad_norm": 1.699519157409668, "learning_rate": 4.95311712681938e-06, "loss": 0.2303, "step": 2429 }, { "epoch": 0.4646271510516252, "grad_norm": 3.224975109100342, "learning_rate": 4.953378663430695e-06, "loss": 0.1607, "step": 2430 }, { "epoch": 0.46481835564053536, "grad_norm": 2.4731459617614746, "learning_rate": 4.953640092435914e-06, "loss": 0.1178, "step": 2431 }, { "epoch": 0.4650095602294455, "grad_norm": 2.1776509284973145, "learning_rate": 4.953901413923546e-06, "loss": 0.2036, "step": 2432 }, { "epoch": 0.46520076481835565, "grad_norm": 2.7886409759521484, "learning_rate": 4.9541626279819915e-06, "loss": 0.334, "step": 2433 }, { "epoch": 0.4653919694072658, "grad_norm": 1.9042888879776, "learning_rate": 4.954423734699544e-06, "loss": 0.2839, "step": 2434 }, { "epoch": 0.4655831739961759, "grad_norm": 3.304387092590332, "learning_rate": 4.954684734164385e-06, "loss": 0.5778, "step": 2435 }, { "epoch": 0.465774378585086, "grad_norm": 2.7068235874176025, "learning_rate": 4.954945626464589e-06, "loss": 0.2185, "step": 2436 }, { "epoch": 0.46596558317399617, "grad_norm": 2.0537827014923096, "learning_rate": 4.955206411688123e-06, "loss": 0.1307, "step": 2437 }, { "epoch": 0.4661567877629063, "grad_norm": 1.799789309501648, "learning_rate": 4.955467089922844e-06, "loss": 0.1601, "step": 2438 }, { "epoch": 0.46634799235181645, "grad_norm": 2.1717841625213623, "learning_rate": 4.955727661256503e-06, "loss": 0.2017, "step": 2439 }, { "epoch": 0.4665391969407266, "grad_norm": 2.0859739780426025, "learning_rate": 4.95598812577674e-06, "loss": 0.1489, "step": 2440 }, { "epoch": 0.4667304015296367, "grad_norm": 1.309865117073059, "learning_rate": 4.95624848357109e-06, "loss": 0.0694, "step": 2441 }, { "epoch": 0.46692160611854683, "grad_norm": 1.9916973114013672, "learning_rate": 4.956508734726978e-06, "loss": 0.1463, "step": 2442 }, { "epoch": 0.46711281070745697, "grad_norm": 2.035287618637085, "learning_rate": 4.956768879331726e-06, "loss": 0.1042, "step": 2443 }, { "epoch": 0.4673040152963671, "grad_norm": 1.9258490800857544, "learning_rate": 4.957028917472544e-06, "loss": 0.1315, "step": 2444 }, { "epoch": 0.46749521988527726, "grad_norm": 2.847909688949585, "learning_rate": 4.957288849236539e-06, "loss": 0.4624, "step": 2445 }, { "epoch": 0.4676864244741874, "grad_norm": 3.1480817794799805, "learning_rate": 4.957548674710705e-06, "loss": 0.3211, "step": 2446 }, { "epoch": 0.4678776290630975, "grad_norm": 2.3119571208953857, "learning_rate": 4.957808393981937e-06, "loss": 0.2032, "step": 2447 }, { "epoch": 0.46806883365200763, "grad_norm": 2.356323480606079, "learning_rate": 4.9580680071370174e-06, "loss": 0.2908, "step": 2448 }, { "epoch": 0.4682600382409178, "grad_norm": 0.683710515499115, "learning_rate": 4.958327514262626e-06, "loss": 0.0256, "step": 2449 }, { "epoch": 0.4684512428298279, "grad_norm": 2.147235155105591, "learning_rate": 4.9585869154453355e-06, "loss": 0.186, "step": 2450 }, { "epoch": 0.46864244741873806, "grad_norm": 2.6111466884613037, "learning_rate": 4.958846210771611e-06, "loss": 0.3993, "step": 2451 }, { "epoch": 0.4688336520076482, "grad_norm": 1.6357756853103638, "learning_rate": 4.959105400327814e-06, "loss": 0.1385, "step": 2452 }, { "epoch": 0.4690248565965583, "grad_norm": 2.0699405670166016, "learning_rate": 4.9593644842001994e-06, "loss": 0.1961, "step": 2453 }, { "epoch": 0.46921606118546844, "grad_norm": 2.4611475467681885, "learning_rate": 4.959623462474916e-06, "loss": 0.194, "step": 2454 }, { "epoch": 0.4694072657743786, "grad_norm": 2.70579195022583, "learning_rate": 4.9598823352380075e-06, "loss": 0.3494, "step": 2455 }, { "epoch": 0.4695984703632887, "grad_norm": 1.362758994102478, "learning_rate": 4.9601411025754144e-06, "loss": 0.0551, "step": 2456 }, { "epoch": 0.46978967495219887, "grad_norm": 2.263684034347534, "learning_rate": 4.960399764572971e-06, "loss": 0.4414, "step": 2457 }, { "epoch": 0.469980879541109, "grad_norm": 2.3129522800445557, "learning_rate": 4.960658321316405e-06, "loss": 0.337, "step": 2458 }, { "epoch": 0.4701720841300191, "grad_norm": 3.598054885864258, "learning_rate": 4.960916772891341e-06, "loss": 0.6028, "step": 2459 }, { "epoch": 0.47036328871892924, "grad_norm": 2.352046489715576, "learning_rate": 4.9611751193833e-06, "loss": 0.1063, "step": 2460 }, { "epoch": 0.4705544933078394, "grad_norm": 2.858907461166382, "learning_rate": 4.9614333608776984e-06, "loss": 0.3046, "step": 2461 }, { "epoch": 0.4707456978967495, "grad_norm": 1.6364027261734009, "learning_rate": 4.9616914974598485e-06, "loss": 0.0841, "step": 2462 }, { "epoch": 0.47093690248565967, "grad_norm": 3.1622374057769775, "learning_rate": 4.961949529214955e-06, "loss": 0.3993, "step": 2463 }, { "epoch": 0.4711281070745698, "grad_norm": 2.4431493282318115, "learning_rate": 4.962207456228127e-06, "loss": 0.3394, "step": 2464 }, { "epoch": 0.4713193116634799, "grad_norm": 1.4499131441116333, "learning_rate": 4.96246527858436e-06, "loss": 0.1154, "step": 2465 }, { "epoch": 0.47151051625239004, "grad_norm": 1.915971279144287, "learning_rate": 4.962722996368555e-06, "loss": 0.3057, "step": 2466 }, { "epoch": 0.4717017208413002, "grad_norm": 2.2309677600860596, "learning_rate": 4.9629806096655045e-06, "loss": 0.3469, "step": 2467 }, { "epoch": 0.47189292543021033, "grad_norm": 1.4278203248977661, "learning_rate": 4.963238118559899e-06, "loss": 0.0782, "step": 2468 }, { "epoch": 0.4720841300191205, "grad_norm": 1.626359462738037, "learning_rate": 4.963495523136326e-06, "loss": 0.0824, "step": 2469 }, { "epoch": 0.4722753346080306, "grad_norm": 6.234444618225098, "learning_rate": 4.96375282347927e-06, "loss": 0.3704, "step": 2470 }, { "epoch": 0.4724665391969407, "grad_norm": 1.2472459077835083, "learning_rate": 4.964010019673117e-06, "loss": 0.1072, "step": 2471 }, { "epoch": 0.47265774378585085, "grad_norm": 0.9773194193840027, "learning_rate": 4.9642671118021435e-06, "loss": 0.1166, "step": 2472 }, { "epoch": 0.472848948374761, "grad_norm": 1.2697404623031616, "learning_rate": 4.9645240999505284e-06, "loss": 0.0847, "step": 2473 }, { "epoch": 0.47304015296367113, "grad_norm": 1.7373942136764526, "learning_rate": 4.9647809842023485e-06, "loss": 0.1414, "step": 2474 }, { "epoch": 0.4732313575525813, "grad_norm": 1.2510534524917603, "learning_rate": 4.965037764641576e-06, "loss": 0.0646, "step": 2475 }, { "epoch": 0.4734225621414914, "grad_norm": 3.523763418197632, "learning_rate": 4.965294441352084e-06, "loss": 0.6183, "step": 2476 }, { "epoch": 0.4736137667304015, "grad_norm": 2.536613941192627, "learning_rate": 4.965551014417641e-06, "loss": 0.307, "step": 2477 }, { "epoch": 0.47380497131931165, "grad_norm": 1.2308690547943115, "learning_rate": 4.965807483921919e-06, "loss": 0.121, "step": 2478 }, { "epoch": 0.4739961759082218, "grad_norm": 1.2739108800888062, "learning_rate": 4.966063849948484e-06, "loss": 0.0985, "step": 2479 }, { "epoch": 0.47418738049713194, "grad_norm": 4.273107528686523, "learning_rate": 4.966320112580802e-06, "loss": 0.1709, "step": 2480 }, { "epoch": 0.4743785850860421, "grad_norm": 2.6034741401672363, "learning_rate": 4.96657627190224e-06, "loss": 0.1497, "step": 2481 }, { "epoch": 0.4745697896749522, "grad_norm": 2.563988208770752, "learning_rate": 4.966832327996062e-06, "loss": 0.3513, "step": 2482 }, { "epoch": 0.4747609942638623, "grad_norm": 1.5361602306365967, "learning_rate": 4.967088280945433e-06, "loss": 0.1868, "step": 2483 }, { "epoch": 0.47495219885277246, "grad_norm": 1.8172998428344727, "learning_rate": 4.9673441308334165e-06, "loss": 0.2232, "step": 2484 }, { "epoch": 0.4751434034416826, "grad_norm": 1.6137768030166626, "learning_rate": 4.967599877742975e-06, "loss": 0.1634, "step": 2485 }, { "epoch": 0.47533460803059274, "grad_norm": 1.552061915397644, "learning_rate": 4.967855521756973e-06, "loss": 0.094, "step": 2486 }, { "epoch": 0.4755258126195029, "grad_norm": 2.0277292728424072, "learning_rate": 4.9681110629581734e-06, "loss": 0.1203, "step": 2487 }, { "epoch": 0.47571701720841303, "grad_norm": 2.038464307785034, "learning_rate": 4.96836650142924e-06, "loss": 0.164, "step": 2488 }, { "epoch": 0.4759082217973231, "grad_norm": 2.4230268001556396, "learning_rate": 4.968621837252737e-06, "loss": 0.3603, "step": 2489 }, { "epoch": 0.47609942638623326, "grad_norm": 1.4726464748382568, "learning_rate": 4.9688770705111285e-06, "loss": 0.1528, "step": 2490 }, { "epoch": 0.4762906309751434, "grad_norm": 1.6719284057617188, "learning_rate": 4.969132201286779e-06, "loss": 0.2506, "step": 2491 }, { "epoch": 0.47648183556405355, "grad_norm": 1.5102533102035522, "learning_rate": 4.969387229661954e-06, "loss": 0.0726, "step": 2492 }, { "epoch": 0.4766730401529637, "grad_norm": 2.305950880050659, "learning_rate": 4.969642155718823e-06, "loss": 0.2434, "step": 2493 }, { "epoch": 0.4768642447418738, "grad_norm": 2.0720887184143066, "learning_rate": 4.969896979539451e-06, "loss": 0.249, "step": 2494 }, { "epoch": 0.4770554493307839, "grad_norm": 2.4147567749023438, "learning_rate": 4.97015170120581e-06, "loss": 0.3391, "step": 2495 }, { "epoch": 0.47724665391969406, "grad_norm": 2.9447648525238037, "learning_rate": 4.9704063207997684e-06, "loss": 0.4681, "step": 2496 }, { "epoch": 0.4774378585086042, "grad_norm": 2.2584710121154785, "learning_rate": 4.9706608384031e-06, "loss": 0.1599, "step": 2497 }, { "epoch": 0.47762906309751435, "grad_norm": 1.4370557069778442, "learning_rate": 4.970915254097478e-06, "loss": 0.1377, "step": 2498 }, { "epoch": 0.4778202676864245, "grad_norm": 1.5512670278549194, "learning_rate": 4.971169567964479e-06, "loss": 0.111, "step": 2499 }, { "epoch": 0.4780114722753346, "grad_norm": 1.5349520444869995, "learning_rate": 4.9714237800855815e-06, "loss": 0.0762, "step": 2500 }, { "epoch": 0.4780114722753346, "eval_runtime": 838.1854, "eval_samples_per_second": 1.83, "eval_steps_per_second": 0.229, "step": 2500 }, { "epoch": 0.4782026768642447, "grad_norm": 1.6777958869934082, "learning_rate": 4.971677890542167e-06, "loss": 0.1465, "step": 2501 }, { "epoch": 0.47839388145315487, "grad_norm": 1.7863197326660156, "learning_rate": 4.971931899415515e-06, "loss": 0.1716, "step": 2502 }, { "epoch": 0.478585086042065, "grad_norm": 1.1726549863815308, "learning_rate": 4.972185806786815e-06, "loss": 0.1211, "step": 2503 }, { "epoch": 0.47877629063097515, "grad_norm": 3.095215320587158, "learning_rate": 4.972439612737152e-06, "loss": 0.3554, "step": 2504 }, { "epoch": 0.4789674952198853, "grad_norm": 1.327844262123108, "learning_rate": 4.972693317347518e-06, "loss": 0.0942, "step": 2505 }, { "epoch": 0.4791586998087954, "grad_norm": 1.6837940216064453, "learning_rate": 4.9729469206988075e-06, "loss": 0.0878, "step": 2506 }, { "epoch": 0.47934990439770553, "grad_norm": 2.4635088443756104, "learning_rate": 4.973200422871818e-06, "loss": 0.1642, "step": 2507 }, { "epoch": 0.47954110898661567, "grad_norm": 2.1021690368652344, "learning_rate": 4.973453823947249e-06, "loss": 0.3213, "step": 2508 }, { "epoch": 0.4797323135755258, "grad_norm": 1.8503338098526, "learning_rate": 4.973707124005704e-06, "loss": 0.1949, "step": 2509 }, { "epoch": 0.47992351816443596, "grad_norm": 1.9696707725524902, "learning_rate": 4.973960323127691e-06, "loss": 0.3777, "step": 2510 }, { "epoch": 0.4801147227533461, "grad_norm": 1.7098878622055054, "learning_rate": 4.974213421393625e-06, "loss": 0.1056, "step": 2511 }, { "epoch": 0.4803059273422562, "grad_norm": 2.5690712928771973, "learning_rate": 4.974466418883816e-06, "loss": 0.1538, "step": 2512 }, { "epoch": 0.48049713193116633, "grad_norm": 2.4078571796417236, "learning_rate": 4.974719315678486e-06, "loss": 0.4023, "step": 2513 }, { "epoch": 0.4806883365200765, "grad_norm": 1.6684014797210693, "learning_rate": 4.974972111857759e-06, "loss": 0.1205, "step": 2514 }, { "epoch": 0.4808795411089866, "grad_norm": 2.405083417892456, "learning_rate": 4.975224807501662e-06, "loss": 0.3595, "step": 2515 }, { "epoch": 0.48107074569789676, "grad_norm": 1.4291092157363892, "learning_rate": 4.975477402690129e-06, "loss": 0.0913, "step": 2516 }, { "epoch": 0.4812619502868069, "grad_norm": 6.145540237426758, "learning_rate": 4.975729897502997e-06, "loss": 0.1871, "step": 2517 }, { "epoch": 0.481453154875717, "grad_norm": 1.5036752223968506, "learning_rate": 4.975982292020009e-06, "loss": 0.1577, "step": 2518 }, { "epoch": 0.48164435946462714, "grad_norm": 2.249161720275879, "learning_rate": 4.97623458632081e-06, "loss": 0.1663, "step": 2519 }, { "epoch": 0.4818355640535373, "grad_norm": 1.631930947303772, "learning_rate": 4.976486780484955e-06, "loss": 0.1467, "step": 2520 }, { "epoch": 0.4820267686424474, "grad_norm": 1.7321642637252808, "learning_rate": 4.9767388745919e-06, "loss": 0.2776, "step": 2521 }, { "epoch": 0.48221797323135757, "grad_norm": 3.218299388885498, "learning_rate": 4.976990868721009e-06, "loss": 0.5657, "step": 2522 }, { "epoch": 0.4824091778202677, "grad_norm": 2.6193222999572754, "learning_rate": 4.977242762951551e-06, "loss": 0.2718, "step": 2523 }, { "epoch": 0.4826003824091778, "grad_norm": 2.339751720428467, "learning_rate": 4.977494557362701e-06, "loss": 0.1729, "step": 2524 }, { "epoch": 0.48279158699808794, "grad_norm": 1.910982370376587, "learning_rate": 4.9777462520335386e-06, "loss": 0.0808, "step": 2525 }, { "epoch": 0.4829827915869981, "grad_norm": 1.3816866874694824, "learning_rate": 4.97799784704305e-06, "loss": 0.0891, "step": 2526 }, { "epoch": 0.4831739961759082, "grad_norm": 1.6029402017593384, "learning_rate": 4.97824934247013e-06, "loss": 0.1892, "step": 2527 }, { "epoch": 0.48336520076481837, "grad_norm": 1.5500452518463135, "learning_rate": 4.978500738393576e-06, "loss": 0.143, "step": 2528 }, { "epoch": 0.4835564053537285, "grad_norm": 2.6044082641601562, "learning_rate": 4.978752034892094e-06, "loss": 0.132, "step": 2529 }, { "epoch": 0.4837476099426386, "grad_norm": 1.7775001525878906, "learning_rate": 4.979003232044297e-06, "loss": 0.1912, "step": 2530 }, { "epoch": 0.48393881453154874, "grad_norm": 1.5503368377685547, "learning_rate": 4.979254329928704e-06, "loss": 0.0685, "step": 2531 }, { "epoch": 0.4841300191204589, "grad_norm": 1.6482033729553223, "learning_rate": 4.979505328623739e-06, "loss": 0.2095, "step": 2532 }, { "epoch": 0.48432122370936903, "grad_norm": 3.1637353897094727, "learning_rate": 4.9797562282077376e-06, "loss": 0.4826, "step": 2533 }, { "epoch": 0.4845124282982792, "grad_norm": 1.1990470886230469, "learning_rate": 4.98000702875894e-06, "loss": 0.0619, "step": 2534 }, { "epoch": 0.4847036328871893, "grad_norm": 1.7547122240066528, "learning_rate": 4.980257730355493e-06, "loss": 0.1307, "step": 2535 }, { "epoch": 0.4848948374760994, "grad_norm": 1.7956041097640991, "learning_rate": 4.9805083330754525e-06, "loss": 0.1256, "step": 2536 }, { "epoch": 0.48508604206500955, "grad_norm": 1.7650172710418701, "learning_rate": 4.980758836996782e-06, "loss": 0.1144, "step": 2537 }, { "epoch": 0.4852772466539197, "grad_norm": 3.838001251220703, "learning_rate": 4.981009242197351e-06, "loss": 0.252, "step": 2538 }, { "epoch": 0.48546845124282983, "grad_norm": 2.8733246326446533, "learning_rate": 4.981259548754939e-06, "loss": 0.3944, "step": 2539 }, { "epoch": 0.48565965583174, "grad_norm": 1.7851463556289673, "learning_rate": 4.981509756747234e-06, "loss": 0.1834, "step": 2540 }, { "epoch": 0.4858508604206501, "grad_norm": 1.990270972251892, "learning_rate": 4.981759866251829e-06, "loss": 0.2053, "step": 2541 }, { "epoch": 0.4860420650095602, "grad_norm": 1.489159107208252, "learning_rate": 4.98200987734623e-06, "loss": 0.1285, "step": 2542 }, { "epoch": 0.48623326959847035, "grad_norm": 1.8262524604797363, "learning_rate": 4.9822597901078465e-06, "loss": 0.1314, "step": 2543 }, { "epoch": 0.4864244741873805, "grad_norm": 1.9886893033981323, "learning_rate": 4.982509604614002e-06, "loss": 0.1196, "step": 2544 }, { "epoch": 0.48661567877629064, "grad_norm": 1.8472392559051514, "learning_rate": 4.982759320941924e-06, "loss": 0.1214, "step": 2545 }, { "epoch": 0.4868068833652008, "grad_norm": 1.5206496715545654, "learning_rate": 4.9830089391687534e-06, "loss": 0.1371, "step": 2546 }, { "epoch": 0.4869980879541109, "grad_norm": 2.1726207733154297, "learning_rate": 4.983258459371536e-06, "loss": 0.1734, "step": 2547 }, { "epoch": 0.487189292543021, "grad_norm": 1.3305667638778687, "learning_rate": 4.9835078816272295e-06, "loss": 0.0642, "step": 2548 }, { "epoch": 0.48738049713193116, "grad_norm": 1.633368968963623, "learning_rate": 4.983757206012702e-06, "loss": 0.0843, "step": 2549 }, { "epoch": 0.4875717017208413, "grad_norm": 2.334226608276367, "learning_rate": 4.984006432604726e-06, "loss": 0.1321, "step": 2550 }, { "epoch": 0.48776290630975144, "grad_norm": 1.615264654159546, "learning_rate": 4.98425556147999e-06, "loss": 0.1884, "step": 2551 }, { "epoch": 0.4879541108986616, "grad_norm": 2.8392958641052246, "learning_rate": 4.984504592715089e-06, "loss": 0.5023, "step": 2552 }, { "epoch": 0.48814531548757173, "grad_norm": 2.3064663410186768, "learning_rate": 4.984753526386528e-06, "loss": 0.2044, "step": 2553 }, { "epoch": 0.4883365200764818, "grad_norm": 2.2431957721710205, "learning_rate": 4.985002362570723e-06, "loss": 0.2858, "step": 2554 }, { "epoch": 0.48852772466539196, "grad_norm": 2.3659920692443848, "learning_rate": 4.9852511013439995e-06, "loss": 0.1595, "step": 2555 }, { "epoch": 0.4887189292543021, "grad_norm": 2.477073907852173, "learning_rate": 4.985499742782594e-06, "loss": 0.0791, "step": 2556 }, { "epoch": 0.48891013384321225, "grad_norm": 3.7777740955352783, "learning_rate": 4.985748286962654e-06, "loss": 0.6394, "step": 2557 }, { "epoch": 0.4891013384321224, "grad_norm": 2.1839075088500977, "learning_rate": 4.9859967339602365e-06, "loss": 0.2495, "step": 2558 }, { "epoch": 0.4892925430210325, "grad_norm": 2.8904454708099365, "learning_rate": 4.986245083851309e-06, "loss": 0.419, "step": 2559 }, { "epoch": 0.4894837476099426, "grad_norm": 1.4429762363433838, "learning_rate": 4.986493336711752e-06, "loss": 0.2267, "step": 2560 }, { "epoch": 0.48967495219885276, "grad_norm": 5.334555149078369, "learning_rate": 4.986741492617356e-06, "loss": 0.2136, "step": 2561 }, { "epoch": 0.4898661567877629, "grad_norm": 1.6938073635101318, "learning_rate": 4.98698955164382e-06, "loss": 0.0614, "step": 2562 }, { "epoch": 0.49005736137667305, "grad_norm": 4.193806171417236, "learning_rate": 4.9872375138667615e-06, "loss": 0.2824, "step": 2563 }, { "epoch": 0.4902485659655832, "grad_norm": 2.4448256492614746, "learning_rate": 4.987485379361701e-06, "loss": 0.362, "step": 2564 }, { "epoch": 0.4904397705544933, "grad_norm": 2.175049066543579, "learning_rate": 4.987733148204077e-06, "loss": 0.159, "step": 2565 }, { "epoch": 0.4906309751434034, "grad_norm": 1.4821659326553345, "learning_rate": 4.987980820469236e-06, "loss": 0.1167, "step": 2566 }, { "epoch": 0.49082217973231357, "grad_norm": 1.6503522396087646, "learning_rate": 4.988228396232439e-06, "loss": 0.0725, "step": 2567 }, { "epoch": 0.4910133843212237, "grad_norm": 3.942607879638672, "learning_rate": 4.988475875568857e-06, "loss": 0.136, "step": 2568 }, { "epoch": 0.49120458891013385, "grad_norm": 1.6592808961868286, "learning_rate": 4.988723258553574e-06, "loss": 0.1298, "step": 2569 }, { "epoch": 0.491395793499044, "grad_norm": 1.752287745475769, "learning_rate": 4.988970545261588e-06, "loss": 0.311, "step": 2570 }, { "epoch": 0.4915869980879541, "grad_norm": 2.297788143157959, "learning_rate": 4.989217735767806e-06, "loss": 0.2016, "step": 2571 }, { "epoch": 0.49177820267686423, "grad_norm": 1.5098553895950317, "learning_rate": 4.989464830147051e-06, "loss": 0.1153, "step": 2572 }, { "epoch": 0.49196940726577437, "grad_norm": 1.6489616632461548, "learning_rate": 4.989711828474057e-06, "loss": 0.0871, "step": 2573 }, { "epoch": 0.4921606118546845, "grad_norm": 1.4597628116607666, "learning_rate": 4.989958730823471e-06, "loss": 0.1124, "step": 2574 }, { "epoch": 0.49235181644359466, "grad_norm": 1.9017386436462402, "learning_rate": 4.990205537269853e-06, "loss": 0.1342, "step": 2575 }, { "epoch": 0.4925430210325048, "grad_norm": 2.946324586868286, "learning_rate": 4.990452247887675e-06, "loss": 0.3603, "step": 2576 }, { "epoch": 0.4927342256214149, "grad_norm": 2.808943510055542, "learning_rate": 4.9906988627513265e-06, "loss": 0.3344, "step": 2577 }, { "epoch": 0.49292543021032503, "grad_norm": 2.313256025314331, "learning_rate": 4.990945381935106e-06, "loss": 0.336, "step": 2578 }, { "epoch": 0.4931166347992352, "grad_norm": 1.7406623363494873, "learning_rate": 4.991191805513227e-06, "loss": 0.1274, "step": 2579 }, { "epoch": 0.4933078393881453, "grad_norm": 2.546191692352295, "learning_rate": 4.991438133559817e-06, "loss": 0.3057, "step": 2580 }, { "epoch": 0.49349904397705546, "grad_norm": 3.7837564945220947, "learning_rate": 4.991684366148917e-06, "loss": 0.1454, "step": 2581 }, { "epoch": 0.4936902485659656, "grad_norm": 3.7757515907287598, "learning_rate": 4.991930503354482e-06, "loss": 0.3576, "step": 2582 }, { "epoch": 0.4938814531548757, "grad_norm": 2.6701161861419678, "learning_rate": 4.9921765452503814e-06, "loss": 0.4054, "step": 2583 }, { "epoch": 0.49407265774378584, "grad_norm": 3.300074577331543, "learning_rate": 4.992422491910399e-06, "loss": 0.3313, "step": 2584 }, { "epoch": 0.494263862332696, "grad_norm": 1.8099370002746582, "learning_rate": 4.9926683434082315e-06, "loss": 0.1173, "step": 2585 }, { "epoch": 0.4944550669216061, "grad_norm": 1.8686485290527344, "learning_rate": 4.9929140998174915e-06, "loss": 0.1983, "step": 2586 }, { "epoch": 0.49464627151051627, "grad_norm": 1.4938725233078003, "learning_rate": 4.9931597612117065e-06, "loss": 0.0842, "step": 2587 }, { "epoch": 0.4948374760994264, "grad_norm": 1.6325137615203857, "learning_rate": 4.993405327664316e-06, "loss": 0.093, "step": 2588 }, { "epoch": 0.4950286806883365, "grad_norm": 1.7486827373504639, "learning_rate": 4.99365079924868e-06, "loss": 0.2226, "step": 2589 }, { "epoch": 0.49521988527724664, "grad_norm": 1.7181081771850586, "learning_rate": 4.993896176038066e-06, "loss": 0.1983, "step": 2590 }, { "epoch": 0.4954110898661568, "grad_norm": 1.6922866106033325, "learning_rate": 4.9941414581056636e-06, "loss": 0.3837, "step": 2591 }, { "epoch": 0.4956022944550669, "grad_norm": 1.5740710496902466, "learning_rate": 4.994386645524574e-06, "loss": 0.1492, "step": 2592 }, { "epoch": 0.49579349904397707, "grad_norm": 1.319801926612854, "learning_rate": 4.994631738367814e-06, "loss": 0.1124, "step": 2593 }, { "epoch": 0.4959847036328872, "grad_norm": 1.6975462436676025, "learning_rate": 4.994876736708317e-06, "loss": 0.1612, "step": 2594 }, { "epoch": 0.4961759082217973, "grad_norm": 2.0203516483306885, "learning_rate": 4.99512164061893e-06, "loss": 0.2532, "step": 2595 }, { "epoch": 0.49636711281070744, "grad_norm": 1.9539330005645752, "learning_rate": 4.995366450172418e-06, "loss": 0.2315, "step": 2596 }, { "epoch": 0.4965583173996176, "grad_norm": 1.986435890197754, "learning_rate": 4.995611165441463e-06, "loss": 0.1917, "step": 2597 }, { "epoch": 0.49674952198852773, "grad_norm": 1.494219422340393, "learning_rate": 4.99585578649866e-06, "loss": 0.096, "step": 2598 }, { "epoch": 0.4969407265774379, "grad_norm": 1.1928995847702026, "learning_rate": 4.996100313416522e-06, "loss": 0.0925, "step": 2599 }, { "epoch": 0.497131931166348, "grad_norm": 2.613600492477417, "learning_rate": 4.996344746267477e-06, "loss": 0.1606, "step": 2600 }, { "epoch": 0.4973231357552581, "grad_norm": 1.8623335361480713, "learning_rate": 4.99658908512387e-06, "loss": 0.1377, "step": 2601 }, { "epoch": 0.49751434034416825, "grad_norm": 2.6297202110290527, "learning_rate": 4.996833330057964e-06, "loss": 0.3241, "step": 2602 }, { "epoch": 0.4977055449330784, "grad_norm": 1.7815700769424438, "learning_rate": 4.997077481141936e-06, "loss": 0.2196, "step": 2603 }, { "epoch": 0.49789674952198854, "grad_norm": 3.413189172744751, "learning_rate": 4.9973215384478835e-06, "loss": 0.1373, "step": 2604 }, { "epoch": 0.4980879541108987, "grad_norm": 4.574033737182617, "learning_rate": 4.997565502047817e-06, "loss": 0.2807, "step": 2605 }, { "epoch": 0.4982791586998088, "grad_norm": 3.2377426624298096, "learning_rate": 4.997809372013666e-06, "loss": 0.3188, "step": 2606 }, { "epoch": 0.4984703632887189, "grad_norm": 1.8162243366241455, "learning_rate": 4.998053148417279e-06, "loss": 0.2634, "step": 2607 }, { "epoch": 0.49866156787762905, "grad_norm": 3.267460823059082, "learning_rate": 4.998296831330417e-06, "loss": 0.2784, "step": 2608 }, { "epoch": 0.4988527724665392, "grad_norm": 1.5525761842727661, "learning_rate": 4.998540420824764e-06, "loss": 0.1681, "step": 2609 }, { "epoch": 0.49904397705544934, "grad_norm": 2.893181324005127, "learning_rate": 4.998783916971917e-06, "loss": 0.3174, "step": 2610 }, { "epoch": 0.4992351816443595, "grad_norm": 1.829892873764038, "learning_rate": 4.999027319843394e-06, "loss": 0.1275, "step": 2611 }, { "epoch": 0.4994263862332696, "grad_norm": 2.012143611907959, "learning_rate": 4.999270629510629e-06, "loss": 0.1867, "step": 2612 }, { "epoch": 0.4996175908221797, "grad_norm": 2.164092779159546, "learning_rate": 4.9995138460449755e-06, "loss": 0.2112, "step": 2613 }, { "epoch": 0.49980879541108986, "grad_norm": 1.7680532932281494, "learning_rate": 4.999756969517703e-06, "loss": 0.1469, "step": 2614 }, { "epoch": 0.5, "grad_norm": 4.590065002441406, "learning_rate": 5e-06, "loss": 0.2257, "step": 2615 }, { "epoch": 0.5001912045889101, "grad_norm": 1.274849772453308, "learning_rate": 5e-06, "loss": 0.0882, "step": 2616 }, { "epoch": 0.5003824091778203, "grad_norm": 2.5290040969848633, "learning_rate": 5e-06, "loss": 0.0899, "step": 2617 }, { "epoch": 0.5005736137667304, "grad_norm": 2.3508877754211426, "learning_rate": 5e-06, "loss": 0.165, "step": 2618 }, { "epoch": 0.5007648183556406, "grad_norm": 2.5142104625701904, "learning_rate": 5e-06, "loss": 0.1671, "step": 2619 }, { "epoch": 0.5009560229445507, "grad_norm": 2.028837203979492, "learning_rate": 5e-06, "loss": 0.265, "step": 2620 }, { "epoch": 0.5011472275334607, "grad_norm": 1.2833895683288574, "learning_rate": 5e-06, "loss": 0.1018, "step": 2621 }, { "epoch": 0.501338432122371, "grad_norm": 1.112088918685913, "learning_rate": 5e-06, "loss": 0.09, "step": 2622 }, { "epoch": 0.501529636711281, "grad_norm": 1.1832149028778076, "learning_rate": 5e-06, "loss": 0.096, "step": 2623 }, { "epoch": 0.5017208413001912, "grad_norm": 1.8202910423278809, "learning_rate": 5e-06, "loss": 0.1305, "step": 2624 }, { "epoch": 0.5019120458891013, "grad_norm": 2.20587420463562, "learning_rate": 5e-06, "loss": 0.0921, "step": 2625 }, { "epoch": 0.5021032504780115, "grad_norm": 2.4090912342071533, "learning_rate": 5e-06, "loss": 0.401, "step": 2626 }, { "epoch": 0.5022944550669216, "grad_norm": 2.0977654457092285, "learning_rate": 5e-06, "loss": 0.3296, "step": 2627 }, { "epoch": 0.5024856596558317, "grad_norm": 1.8550901412963867, "learning_rate": 5e-06, "loss": 0.1836, "step": 2628 }, { "epoch": 0.5026768642447419, "grad_norm": 0.8883583545684814, "learning_rate": 5e-06, "loss": 0.0655, "step": 2629 }, { "epoch": 0.502868068833652, "grad_norm": 1.4066579341888428, "learning_rate": 5e-06, "loss": 0.0553, "step": 2630 }, { "epoch": 0.5030592734225622, "grad_norm": 1.9968935251235962, "learning_rate": 5e-06, "loss": 0.0832, "step": 2631 }, { "epoch": 0.5032504780114723, "grad_norm": 1.5393012762069702, "learning_rate": 5e-06, "loss": 0.1059, "step": 2632 }, { "epoch": 0.5034416826003824, "grad_norm": 2.020643949508667, "learning_rate": 5e-06, "loss": 0.2575, "step": 2633 }, { "epoch": 0.5036328871892926, "grad_norm": 1.2440476417541504, "learning_rate": 5e-06, "loss": 0.0954, "step": 2634 }, { "epoch": 0.5038240917782026, "grad_norm": 1.243212103843689, "learning_rate": 5e-06, "loss": 0.1172, "step": 2635 }, { "epoch": 0.5040152963671128, "grad_norm": 2.4254207611083984, "learning_rate": 5e-06, "loss": 0.2976, "step": 2636 }, { "epoch": 0.5042065009560229, "grad_norm": 1.2954270839691162, "learning_rate": 5e-06, "loss": 0.0682, "step": 2637 }, { "epoch": 0.5043977055449331, "grad_norm": 1.661173701286316, "learning_rate": 5e-06, "loss": 0.2638, "step": 2638 }, { "epoch": 0.5045889101338432, "grad_norm": 2.2583632469177246, "learning_rate": 5e-06, "loss": 0.2787, "step": 2639 }, { "epoch": 0.5047801147227533, "grad_norm": 3.421473979949951, "learning_rate": 5e-06, "loss": 0.4827, "step": 2640 }, { "epoch": 0.5049713193116635, "grad_norm": 1.8076733350753784, "learning_rate": 5e-06, "loss": 0.154, "step": 2641 }, { "epoch": 0.5051625239005736, "grad_norm": 2.341878652572632, "learning_rate": 5e-06, "loss": 0.2703, "step": 2642 }, { "epoch": 0.5053537284894838, "grad_norm": 1.1928160190582275, "learning_rate": 5e-06, "loss": 0.1011, "step": 2643 }, { "epoch": 0.5055449330783939, "grad_norm": 1.0467259883880615, "learning_rate": 5e-06, "loss": 0.0831, "step": 2644 }, { "epoch": 0.505736137667304, "grad_norm": 1.8891057968139648, "learning_rate": 5e-06, "loss": 0.3587, "step": 2645 }, { "epoch": 0.5059273422562142, "grad_norm": 2.7861201763153076, "learning_rate": 5e-06, "loss": 0.4989, "step": 2646 }, { "epoch": 0.5061185468451243, "grad_norm": 1.5840612649917603, "learning_rate": 5e-06, "loss": 0.1864, "step": 2647 }, { "epoch": 0.5063097514340344, "grad_norm": 3.3611576557159424, "learning_rate": 5e-06, "loss": 0.3172, "step": 2648 }, { "epoch": 0.5065009560229445, "grad_norm": 1.371387004852295, "learning_rate": 5e-06, "loss": 0.0707, "step": 2649 }, { "epoch": 0.5066921606118547, "grad_norm": 1.2657071352005005, "learning_rate": 5e-06, "loss": 0.0526, "step": 2650 }, { "epoch": 0.5068833652007648, "grad_norm": 3.1202003955841064, "learning_rate": 5e-06, "loss": 0.3824, "step": 2651 }, { "epoch": 0.5070745697896749, "grad_norm": 1.1883548498153687, "learning_rate": 5e-06, "loss": 0.1146, "step": 2652 }, { "epoch": 0.5072657743785851, "grad_norm": 2.3430683612823486, "learning_rate": 5e-06, "loss": 0.3705, "step": 2653 }, { "epoch": 0.5074569789674952, "grad_norm": 1.8830441236495972, "learning_rate": 5e-06, "loss": 0.0895, "step": 2654 }, { "epoch": 0.5076481835564054, "grad_norm": 2.3886196613311768, "learning_rate": 5e-06, "loss": 0.257, "step": 2655 }, { "epoch": 0.5078393881453155, "grad_norm": 1.6566410064697266, "learning_rate": 5e-06, "loss": 0.1023, "step": 2656 }, { "epoch": 0.5080305927342256, "grad_norm": 1.7959635257720947, "learning_rate": 5e-06, "loss": 0.1833, "step": 2657 }, { "epoch": 0.5082217973231358, "grad_norm": 1.3540290594100952, "learning_rate": 5e-06, "loss": 0.1589, "step": 2658 }, { "epoch": 0.5084130019120459, "grad_norm": 1.84811532497406, "learning_rate": 5e-06, "loss": 0.1614, "step": 2659 }, { "epoch": 0.5086042065009561, "grad_norm": 1.4192479848861694, "learning_rate": 5e-06, "loss": 0.1005, "step": 2660 }, { "epoch": 0.5087954110898661, "grad_norm": 2.123854160308838, "learning_rate": 5e-06, "loss": 0.1387, "step": 2661 }, { "epoch": 0.5089866156787763, "grad_norm": 3.7756459712982178, "learning_rate": 5e-06, "loss": 0.215, "step": 2662 }, { "epoch": 0.5091778202676864, "grad_norm": 1.6984196901321411, "learning_rate": 5e-06, "loss": 0.0966, "step": 2663 }, { "epoch": 0.5093690248565965, "grad_norm": 2.247396469116211, "learning_rate": 5e-06, "loss": 0.1488, "step": 2664 }, { "epoch": 0.5095602294455067, "grad_norm": 2.6015279293060303, "learning_rate": 5e-06, "loss": 0.3206, "step": 2665 }, { "epoch": 0.5097514340344168, "grad_norm": 1.4520198106765747, "learning_rate": 5e-06, "loss": 0.1184, "step": 2666 }, { "epoch": 0.509942638623327, "grad_norm": 2.357475757598877, "learning_rate": 5e-06, "loss": 0.1052, "step": 2667 }, { "epoch": 0.5101338432122371, "grad_norm": 1.6518830060958862, "learning_rate": 5e-06, "loss": 0.0741, "step": 2668 }, { "epoch": 0.5103250478011472, "grad_norm": 3.9007205963134766, "learning_rate": 5e-06, "loss": 0.3746, "step": 2669 }, { "epoch": 0.5105162523900574, "grad_norm": 2.126702308654785, "learning_rate": 5e-06, "loss": 0.332, "step": 2670 }, { "epoch": 0.5107074569789675, "grad_norm": 1.7568902969360352, "learning_rate": 5e-06, "loss": 0.1567, "step": 2671 }, { "epoch": 0.5108986615678777, "grad_norm": 1.939828634262085, "learning_rate": 5e-06, "loss": 0.2383, "step": 2672 }, { "epoch": 0.5110898661567878, "grad_norm": 1.8241344690322876, "learning_rate": 5e-06, "loss": 0.1064, "step": 2673 }, { "epoch": 0.511281070745698, "grad_norm": 2.5553126335144043, "learning_rate": 5e-06, "loss": 0.2161, "step": 2674 }, { "epoch": 0.511472275334608, "grad_norm": 1.7424609661102295, "learning_rate": 5e-06, "loss": 0.091, "step": 2675 }, { "epoch": 0.5116634799235181, "grad_norm": 1.8869847059249878, "learning_rate": 5e-06, "loss": 0.2992, "step": 2676 }, { "epoch": 0.5118546845124283, "grad_norm": 1.1718688011169434, "learning_rate": 5e-06, "loss": 0.1112, "step": 2677 }, { "epoch": 0.5120458891013384, "grad_norm": 1.0170719623565674, "learning_rate": 5e-06, "loss": 0.1267, "step": 2678 }, { "epoch": 0.5122370936902486, "grad_norm": 1.5140657424926758, "learning_rate": 5e-06, "loss": 0.0969, "step": 2679 }, { "epoch": 0.5124282982791587, "grad_norm": 1.8344894647598267, "learning_rate": 5e-06, "loss": 0.14, "step": 2680 }, { "epoch": 0.5126195028680688, "grad_norm": 2.2053725719451904, "learning_rate": 5e-06, "loss": 0.2002, "step": 2681 }, { "epoch": 0.512810707456979, "grad_norm": 1.968572974205017, "learning_rate": 5e-06, "loss": 0.1644, "step": 2682 }, { "epoch": 0.5130019120458891, "grad_norm": 2.810896158218384, "learning_rate": 5e-06, "loss": 0.3554, "step": 2683 }, { "epoch": 0.5131931166347993, "grad_norm": 1.951485276222229, "learning_rate": 5e-06, "loss": 0.2156, "step": 2684 }, { "epoch": 0.5133843212237094, "grad_norm": 1.31484055519104, "learning_rate": 5e-06, "loss": 0.1424, "step": 2685 }, { "epoch": 0.5135755258126194, "grad_norm": 2.028074026107788, "learning_rate": 5e-06, "loss": 0.165, "step": 2686 }, { "epoch": 0.5137667304015296, "grad_norm": 1.3632593154907227, "learning_rate": 5e-06, "loss": 0.067, "step": 2687 }, { "epoch": 0.5139579349904397, "grad_norm": 1.8360464572906494, "learning_rate": 5e-06, "loss": 0.1073, "step": 2688 }, { "epoch": 0.5141491395793499, "grad_norm": 2.7221839427948, "learning_rate": 5e-06, "loss": 0.536, "step": 2689 }, { "epoch": 0.51434034416826, "grad_norm": 1.6568959951400757, "learning_rate": 5e-06, "loss": 0.133, "step": 2690 }, { "epoch": 0.5145315487571702, "grad_norm": 2.4238178730010986, "learning_rate": 5e-06, "loss": 0.2072, "step": 2691 }, { "epoch": 0.5147227533460803, "grad_norm": 1.4915567636489868, "learning_rate": 5e-06, "loss": 0.1145, "step": 2692 }, { "epoch": 0.5149139579349904, "grad_norm": 1.5793148279190063, "learning_rate": 5e-06, "loss": 0.1, "step": 2693 }, { "epoch": 0.5151051625239006, "grad_norm": 1.486043930053711, "learning_rate": 5e-06, "loss": 0.1559, "step": 2694 }, { "epoch": 0.5152963671128107, "grad_norm": 2.671621561050415, "learning_rate": 5e-06, "loss": 0.6114, "step": 2695 }, { "epoch": 0.5154875717017209, "grad_norm": 1.9827364683151245, "learning_rate": 5e-06, "loss": 0.148, "step": 2696 }, { "epoch": 0.515678776290631, "grad_norm": 1.608793020248413, "learning_rate": 5e-06, "loss": 0.1065, "step": 2697 }, { "epoch": 0.5158699808795411, "grad_norm": 1.5601345300674438, "learning_rate": 5e-06, "loss": 0.1278, "step": 2698 }, { "epoch": 0.5160611854684513, "grad_norm": 2.0068085193634033, "learning_rate": 5e-06, "loss": 0.1, "step": 2699 }, { "epoch": 0.5162523900573613, "grad_norm": 1.006656289100647, "learning_rate": 5e-06, "loss": 0.0425, "step": 2700 }, { "epoch": 0.5164435946462715, "grad_norm": 4.04417085647583, "learning_rate": 5e-06, "loss": 0.7525, "step": 2701 }, { "epoch": 0.5166347992351816, "grad_norm": 2.4919166564941406, "learning_rate": 5e-06, "loss": 0.3474, "step": 2702 }, { "epoch": 0.5168260038240918, "grad_norm": 1.7118905782699585, "learning_rate": 5e-06, "loss": 0.1593, "step": 2703 }, { "epoch": 0.5170172084130019, "grad_norm": 2.105720043182373, "learning_rate": 5e-06, "loss": 0.2862, "step": 2704 }, { "epoch": 0.517208413001912, "grad_norm": 2.6111350059509277, "learning_rate": 5e-06, "loss": 0.1986, "step": 2705 }, { "epoch": 0.5173996175908222, "grad_norm": 1.308851718902588, "learning_rate": 5e-06, "loss": 0.0404, "step": 2706 }, { "epoch": 0.5175908221797323, "grad_norm": 2.451007127761841, "learning_rate": 5e-06, "loss": 0.241, "step": 2707 }, { "epoch": 0.5177820267686425, "grad_norm": 1.6604572534561157, "learning_rate": 5e-06, "loss": 0.1542, "step": 2708 }, { "epoch": 0.5179732313575526, "grad_norm": 1.995052456855774, "learning_rate": 5e-06, "loss": 0.2343, "step": 2709 }, { "epoch": 0.5181644359464627, "grad_norm": 1.6679072380065918, "learning_rate": 5e-06, "loss": 0.1014, "step": 2710 }, { "epoch": 0.5183556405353729, "grad_norm": 1.284263253211975, "learning_rate": 5e-06, "loss": 0.0664, "step": 2711 }, { "epoch": 0.518546845124283, "grad_norm": 1.0988402366638184, "learning_rate": 5e-06, "loss": 0.0266, "step": 2712 }, { "epoch": 0.5187380497131932, "grad_norm": 1.9333552122116089, "learning_rate": 5e-06, "loss": 0.1244, "step": 2713 }, { "epoch": 0.5189292543021032, "grad_norm": 2.2870166301727295, "learning_rate": 5e-06, "loss": 0.4859, "step": 2714 }, { "epoch": 0.5191204588910134, "grad_norm": 2.012103319168091, "learning_rate": 5e-06, "loss": 0.1198, "step": 2715 }, { "epoch": 0.5193116634799235, "grad_norm": 1.6784409284591675, "learning_rate": 5e-06, "loss": 0.1635, "step": 2716 }, { "epoch": 0.5195028680688336, "grad_norm": 2.0358691215515137, "learning_rate": 5e-06, "loss": 0.15, "step": 2717 }, { "epoch": 0.5196940726577438, "grad_norm": 1.3582711219787598, "learning_rate": 5e-06, "loss": 0.0842, "step": 2718 }, { "epoch": 0.5198852772466539, "grad_norm": 2.2164931297302246, "learning_rate": 5e-06, "loss": 0.1049, "step": 2719 }, { "epoch": 0.5200764818355641, "grad_norm": 3.631624698638916, "learning_rate": 5e-06, "loss": 0.2526, "step": 2720 }, { "epoch": 0.5202676864244742, "grad_norm": 2.41479229927063, "learning_rate": 5e-06, "loss": 0.3641, "step": 2721 }, { "epoch": 0.5204588910133843, "grad_norm": 1.9997187852859497, "learning_rate": 5e-06, "loss": 0.1708, "step": 2722 }, { "epoch": 0.5206500956022945, "grad_norm": 3.5295379161834717, "learning_rate": 5e-06, "loss": 0.2561, "step": 2723 }, { "epoch": 0.5208413001912046, "grad_norm": 4.27695369720459, "learning_rate": 5e-06, "loss": 0.2081, "step": 2724 }, { "epoch": 0.5210325047801148, "grad_norm": 1.7342065572738647, "learning_rate": 5e-06, "loss": 0.0791, "step": 2725 }, { "epoch": 0.5212237093690248, "grad_norm": 1.875627040863037, "learning_rate": 5e-06, "loss": 0.1565, "step": 2726 }, { "epoch": 0.521414913957935, "grad_norm": 1.8105803728103638, "learning_rate": 5e-06, "loss": 0.2063, "step": 2727 }, { "epoch": 0.5216061185468451, "grad_norm": 1.5556526184082031, "learning_rate": 5e-06, "loss": 0.205, "step": 2728 }, { "epoch": 0.5217973231357552, "grad_norm": 1.5404901504516602, "learning_rate": 5e-06, "loss": 0.1059, "step": 2729 }, { "epoch": 0.5219885277246654, "grad_norm": 1.3888638019561768, "learning_rate": 5e-06, "loss": 0.1064, "step": 2730 }, { "epoch": 0.5221797323135755, "grad_norm": 1.3965755701065063, "learning_rate": 5e-06, "loss": 0.0549, "step": 2731 }, { "epoch": 0.5223709369024857, "grad_norm": 2.155271530151367, "learning_rate": 5e-06, "loss": 0.2185, "step": 2732 }, { "epoch": 0.5225621414913958, "grad_norm": 3.022123336791992, "learning_rate": 5e-06, "loss": 0.3292, "step": 2733 }, { "epoch": 0.5227533460803059, "grad_norm": 1.5752053260803223, "learning_rate": 5e-06, "loss": 0.1342, "step": 2734 }, { "epoch": 0.5229445506692161, "grad_norm": 1.3086416721343994, "learning_rate": 5e-06, "loss": 0.0672, "step": 2735 }, { "epoch": 0.5231357552581262, "grad_norm": 1.8340917825698853, "learning_rate": 5e-06, "loss": 0.0859, "step": 2736 }, { "epoch": 0.5233269598470364, "grad_norm": 4.185413360595703, "learning_rate": 5e-06, "loss": 0.1021, "step": 2737 }, { "epoch": 0.5235181644359465, "grad_norm": 3.0778019428253174, "learning_rate": 5e-06, "loss": 0.3103, "step": 2738 }, { "epoch": 0.5237093690248565, "grad_norm": 2.644026041030884, "learning_rate": 5e-06, "loss": 0.5049, "step": 2739 }, { "epoch": 0.5239005736137667, "grad_norm": 1.7256097793579102, "learning_rate": 5e-06, "loss": 0.2243, "step": 2740 }, { "epoch": 0.5240917782026768, "grad_norm": 2.3407437801361084, "learning_rate": 5e-06, "loss": 0.1834, "step": 2741 }, { "epoch": 0.524282982791587, "grad_norm": 1.885779619216919, "learning_rate": 5e-06, "loss": 0.1551, "step": 2742 }, { "epoch": 0.5244741873804971, "grad_norm": 2.010530710220337, "learning_rate": 5e-06, "loss": 0.1249, "step": 2743 }, { "epoch": 0.5246653919694073, "grad_norm": 2.1776020526885986, "learning_rate": 5e-06, "loss": 0.131, "step": 2744 }, { "epoch": 0.5248565965583174, "grad_norm": 2.62050199508667, "learning_rate": 5e-06, "loss": 0.3972, "step": 2745 }, { "epoch": 0.5250478011472275, "grad_norm": 2.356659173965454, "learning_rate": 5e-06, "loss": 0.1496, "step": 2746 }, { "epoch": 0.5252390057361377, "grad_norm": 1.9196159839630127, "learning_rate": 5e-06, "loss": 0.1202, "step": 2747 }, { "epoch": 0.5254302103250478, "grad_norm": 2.2066104412078857, "learning_rate": 5e-06, "loss": 0.1625, "step": 2748 }, { "epoch": 0.525621414913958, "grad_norm": 1.1974610090255737, "learning_rate": 5e-06, "loss": 0.0551, "step": 2749 }, { "epoch": 0.5258126195028681, "grad_norm": 2.4025866985321045, "learning_rate": 5e-06, "loss": 0.1537, "step": 2750 }, { "epoch": 0.5260038240917781, "grad_norm": 2.276185989379883, "learning_rate": 5e-06, "loss": 0.2141, "step": 2751 }, { "epoch": 0.5261950286806883, "grad_norm": 1.9449247121810913, "learning_rate": 5e-06, "loss": 0.2303, "step": 2752 }, { "epoch": 0.5263862332695984, "grad_norm": 1.2716035842895508, "learning_rate": 5e-06, "loss": 0.1516, "step": 2753 }, { "epoch": 0.5265774378585086, "grad_norm": 1.2771272659301758, "learning_rate": 5e-06, "loss": 0.088, "step": 2754 }, { "epoch": 0.5267686424474187, "grad_norm": 2.0387823581695557, "learning_rate": 5e-06, "loss": 0.1187, "step": 2755 }, { "epoch": 0.5269598470363289, "grad_norm": 2.576063394546509, "learning_rate": 5e-06, "loss": 0.1408, "step": 2756 }, { "epoch": 0.527151051625239, "grad_norm": 2.7884414196014404, "learning_rate": 5e-06, "loss": 0.3298, "step": 2757 }, { "epoch": 0.5273422562141491, "grad_norm": 3.2937912940979004, "learning_rate": 5e-06, "loss": 0.5074, "step": 2758 }, { "epoch": 0.5275334608030593, "grad_norm": 1.2143149375915527, "learning_rate": 5e-06, "loss": 0.0907, "step": 2759 }, { "epoch": 0.5277246653919694, "grad_norm": 1.9609935283660889, "learning_rate": 5e-06, "loss": 0.1422, "step": 2760 }, { "epoch": 0.5279158699808796, "grad_norm": 2.3065104484558105, "learning_rate": 5e-06, "loss": 0.2394, "step": 2761 }, { "epoch": 0.5281070745697897, "grad_norm": 1.3439974784851074, "learning_rate": 5e-06, "loss": 0.0567, "step": 2762 }, { "epoch": 0.5282982791586998, "grad_norm": 2.701871395111084, "learning_rate": 5e-06, "loss": 0.2844, "step": 2763 }, { "epoch": 0.52848948374761, "grad_norm": 2.580606460571289, "learning_rate": 5e-06, "loss": 0.3827, "step": 2764 }, { "epoch": 0.52868068833652, "grad_norm": 2.0490102767944336, "learning_rate": 5e-06, "loss": 0.164, "step": 2765 }, { "epoch": 0.5288718929254302, "grad_norm": 2.0676021575927734, "learning_rate": 5e-06, "loss": 0.3342, "step": 2766 }, { "epoch": 0.5290630975143403, "grad_norm": 1.146315097808838, "learning_rate": 5e-06, "loss": 0.0772, "step": 2767 }, { "epoch": 0.5292543021032505, "grad_norm": 1.5692076683044434, "learning_rate": 5e-06, "loss": 0.1324, "step": 2768 }, { "epoch": 0.5294455066921606, "grad_norm": 2.6447064876556396, "learning_rate": 5e-06, "loss": 0.2929, "step": 2769 }, { "epoch": 0.5296367112810707, "grad_norm": 3.0837621688842773, "learning_rate": 5e-06, "loss": 0.4175, "step": 2770 }, { "epoch": 0.5298279158699809, "grad_norm": 2.180725574493408, "learning_rate": 5e-06, "loss": 0.2276, "step": 2771 }, { "epoch": 0.530019120458891, "grad_norm": 1.3501646518707275, "learning_rate": 5e-06, "loss": 0.1305, "step": 2772 }, { "epoch": 0.5302103250478012, "grad_norm": 2.3673691749572754, "learning_rate": 5e-06, "loss": 0.2671, "step": 2773 }, { "epoch": 0.5304015296367113, "grad_norm": 2.891281843185425, "learning_rate": 5e-06, "loss": 0.2122, "step": 2774 }, { "epoch": 0.5305927342256214, "grad_norm": 2.67206072807312, "learning_rate": 5e-06, "loss": 0.112, "step": 2775 }, { "epoch": 0.5307839388145316, "grad_norm": 2.2440712451934814, "learning_rate": 5e-06, "loss": 0.3392, "step": 2776 }, { "epoch": 0.5309751434034417, "grad_norm": 1.790481686592102, "learning_rate": 5e-06, "loss": 0.2133, "step": 2777 }, { "epoch": 0.5311663479923519, "grad_norm": 1.2576000690460205, "learning_rate": 5e-06, "loss": 0.0937, "step": 2778 }, { "epoch": 0.5313575525812619, "grad_norm": 1.711289405822754, "learning_rate": 5e-06, "loss": 0.1502, "step": 2779 }, { "epoch": 0.5315487571701721, "grad_norm": 1.4342141151428223, "learning_rate": 5e-06, "loss": 0.1056, "step": 2780 }, { "epoch": 0.5317399617590822, "grad_norm": 2.1560006141662598, "learning_rate": 5e-06, "loss": 0.1531, "step": 2781 }, { "epoch": 0.5319311663479923, "grad_norm": 2.112128734588623, "learning_rate": 5e-06, "loss": 0.1361, "step": 2782 }, { "epoch": 0.5321223709369025, "grad_norm": 2.3116440773010254, "learning_rate": 5e-06, "loss": 0.2867, "step": 2783 }, { "epoch": 0.5323135755258126, "grad_norm": 2.4027740955352783, "learning_rate": 5e-06, "loss": 0.2486, "step": 2784 }, { "epoch": 0.5325047801147228, "grad_norm": 1.1170294284820557, "learning_rate": 5e-06, "loss": 0.0558, "step": 2785 }, { "epoch": 0.5326959847036329, "grad_norm": 2.2636983394622803, "learning_rate": 5e-06, "loss": 0.0858, "step": 2786 }, { "epoch": 0.532887189292543, "grad_norm": 1.4869250059127808, "learning_rate": 5e-06, "loss": 0.0778, "step": 2787 }, { "epoch": 0.5330783938814532, "grad_norm": 2.6340017318725586, "learning_rate": 5e-06, "loss": 0.1319, "step": 2788 }, { "epoch": 0.5332695984703633, "grad_norm": 2.0733349323272705, "learning_rate": 5e-06, "loss": 0.2641, "step": 2789 }, { "epoch": 0.5334608030592735, "grad_norm": 1.8458443880081177, "learning_rate": 5e-06, "loss": 0.1147, "step": 2790 }, { "epoch": 0.5336520076481835, "grad_norm": 0.9659456610679626, "learning_rate": 5e-06, "loss": 0.0781, "step": 2791 }, { "epoch": 0.5338432122370937, "grad_norm": 2.7698538303375244, "learning_rate": 5e-06, "loss": 0.2659, "step": 2792 }, { "epoch": 0.5340344168260038, "grad_norm": 0.9814829230308533, "learning_rate": 5e-06, "loss": 0.0433, "step": 2793 }, { "epoch": 0.5342256214149139, "grad_norm": 2.5287699699401855, "learning_rate": 5e-06, "loss": 0.1128, "step": 2794 }, { "epoch": 0.5344168260038241, "grad_norm": 2.2758593559265137, "learning_rate": 5e-06, "loss": 0.2237, "step": 2795 }, { "epoch": 0.5346080305927342, "grad_norm": 3.0711472034454346, "learning_rate": 5e-06, "loss": 0.1646, "step": 2796 }, { "epoch": 0.5347992351816444, "grad_norm": 2.538210153579712, "learning_rate": 5e-06, "loss": 0.3324, "step": 2797 }, { "epoch": 0.5349904397705545, "grad_norm": 1.7886542081832886, "learning_rate": 5e-06, "loss": 0.1307, "step": 2798 }, { "epoch": 0.5351816443594646, "grad_norm": 1.5378341674804688, "learning_rate": 5e-06, "loss": 0.1657, "step": 2799 }, { "epoch": 0.5353728489483748, "grad_norm": 1.748417615890503, "learning_rate": 5e-06, "loss": 0.0942, "step": 2800 }, { "epoch": 0.5355640535372849, "grad_norm": 1.9090436697006226, "learning_rate": 5e-06, "loss": 0.3028, "step": 2801 }, { "epoch": 0.5357552581261951, "grad_norm": 2.4026999473571777, "learning_rate": 5e-06, "loss": 0.2898, "step": 2802 }, { "epoch": 0.5359464627151052, "grad_norm": 1.4179044961929321, "learning_rate": 5e-06, "loss": 0.1178, "step": 2803 }, { "epoch": 0.5361376673040152, "grad_norm": 1.6482465267181396, "learning_rate": 5e-06, "loss": 0.0878, "step": 2804 }, { "epoch": 0.5363288718929254, "grad_norm": 2.1801981925964355, "learning_rate": 5e-06, "loss": 0.1139, "step": 2805 }, { "epoch": 0.5365200764818355, "grad_norm": 1.33127760887146, "learning_rate": 5e-06, "loss": 0.0881, "step": 2806 }, { "epoch": 0.5367112810707457, "grad_norm": 3.296818971633911, "learning_rate": 5e-06, "loss": 0.3499, "step": 2807 }, { "epoch": 0.5369024856596558, "grad_norm": 2.183624267578125, "learning_rate": 5e-06, "loss": 0.303, "step": 2808 }, { "epoch": 0.537093690248566, "grad_norm": 2.4392447471618652, "learning_rate": 5e-06, "loss": 0.3532, "step": 2809 }, { "epoch": 0.5372848948374761, "grad_norm": 1.9336990118026733, "learning_rate": 5e-06, "loss": 0.1537, "step": 2810 }, { "epoch": 0.5374760994263862, "grad_norm": 1.5660754442214966, "learning_rate": 5e-06, "loss": 0.1199, "step": 2811 }, { "epoch": 0.5376673040152964, "grad_norm": 2.9620461463928223, "learning_rate": 5e-06, "loss": 0.2485, "step": 2812 }, { "epoch": 0.5378585086042065, "grad_norm": 1.6001290082931519, "learning_rate": 5e-06, "loss": 0.1115, "step": 2813 }, { "epoch": 0.5380497131931167, "grad_norm": 1.5141234397888184, "learning_rate": 5e-06, "loss": 0.1407, "step": 2814 }, { "epoch": 0.5382409177820268, "grad_norm": 1.9983782768249512, "learning_rate": 5e-06, "loss": 0.24, "step": 2815 }, { "epoch": 0.5384321223709368, "grad_norm": 1.7045232057571411, "learning_rate": 5e-06, "loss": 0.1267, "step": 2816 }, { "epoch": 0.538623326959847, "grad_norm": 1.1079213619232178, "learning_rate": 5e-06, "loss": 0.1043, "step": 2817 }, { "epoch": 0.5388145315487571, "grad_norm": 2.0188820362091064, "learning_rate": 5e-06, "loss": 0.314, "step": 2818 }, { "epoch": 0.5390057361376673, "grad_norm": 1.7051680088043213, "learning_rate": 5e-06, "loss": 0.1723, "step": 2819 }, { "epoch": 0.5391969407265774, "grad_norm": 2.6241707801818848, "learning_rate": 5e-06, "loss": 0.3201, "step": 2820 }, { "epoch": 0.5393881453154876, "grad_norm": 1.4381252527236938, "learning_rate": 5e-06, "loss": 0.1995, "step": 2821 }, { "epoch": 0.5395793499043977, "grad_norm": 2.13104248046875, "learning_rate": 5e-06, "loss": 0.3106, "step": 2822 }, { "epoch": 0.5397705544933078, "grad_norm": 2.429593086242676, "learning_rate": 5e-06, "loss": 0.1502, "step": 2823 }, { "epoch": 0.539961759082218, "grad_norm": 1.9471462965011597, "learning_rate": 5e-06, "loss": 0.1125, "step": 2824 }, { "epoch": 0.5401529636711281, "grad_norm": 2.1036086082458496, "learning_rate": 5e-06, "loss": 0.1223, "step": 2825 }, { "epoch": 0.5403441682600383, "grad_norm": 3.6985058784484863, "learning_rate": 5e-06, "loss": 0.6219, "step": 2826 }, { "epoch": 0.5405353728489484, "grad_norm": 2.2251152992248535, "learning_rate": 5e-06, "loss": 0.2841, "step": 2827 }, { "epoch": 0.5407265774378585, "grad_norm": 1.7809268236160278, "learning_rate": 5e-06, "loss": 0.3284, "step": 2828 }, { "epoch": 0.5409177820267687, "grad_norm": 2.0658373832702637, "learning_rate": 5e-06, "loss": 0.1905, "step": 2829 }, { "epoch": 0.5411089866156787, "grad_norm": 2.7861831188201904, "learning_rate": 5e-06, "loss": 0.1682, "step": 2830 }, { "epoch": 0.5413001912045889, "grad_norm": 2.1867973804473877, "learning_rate": 5e-06, "loss": 0.1547, "step": 2831 }, { "epoch": 0.541491395793499, "grad_norm": 2.6366775035858154, "learning_rate": 5e-06, "loss": 0.4594, "step": 2832 }, { "epoch": 0.5416826003824092, "grad_norm": 2.0173962116241455, "learning_rate": 5e-06, "loss": 0.1957, "step": 2833 }, { "epoch": 0.5418738049713193, "grad_norm": 2.2647476196289062, "learning_rate": 5e-06, "loss": 0.1583, "step": 2834 }, { "epoch": 0.5420650095602294, "grad_norm": 1.8371925354003906, "learning_rate": 5e-06, "loss": 0.1247, "step": 2835 }, { "epoch": 0.5422562141491396, "grad_norm": 2.1972312927246094, "learning_rate": 5e-06, "loss": 0.22, "step": 2836 }, { "epoch": 0.5424474187380497, "grad_norm": 1.5029915571212769, "learning_rate": 5e-06, "loss": 0.0747, "step": 2837 }, { "epoch": 0.5426386233269599, "grad_norm": 1.6815667152404785, "learning_rate": 5e-06, "loss": 0.1455, "step": 2838 }, { "epoch": 0.54282982791587, "grad_norm": 2.7903823852539062, "learning_rate": 5e-06, "loss": 0.5205, "step": 2839 }, { "epoch": 0.5430210325047801, "grad_norm": 2.121938943862915, "learning_rate": 5e-06, "loss": 0.374, "step": 2840 }, { "epoch": 0.5432122370936903, "grad_norm": 2.8478827476501465, "learning_rate": 5e-06, "loss": 0.3739, "step": 2841 }, { "epoch": 0.5434034416826004, "grad_norm": 2.33632755279541, "learning_rate": 5e-06, "loss": 0.1146, "step": 2842 }, { "epoch": 0.5435946462715106, "grad_norm": 1.6369229555130005, "learning_rate": 5e-06, "loss": 0.0817, "step": 2843 }, { "epoch": 0.5437858508604206, "grad_norm": 2.857534885406494, "learning_rate": 5e-06, "loss": 0.4311, "step": 2844 }, { "epoch": 0.5439770554493308, "grad_norm": 2.2772583961486816, "learning_rate": 5e-06, "loss": 0.3075, "step": 2845 }, { "epoch": 0.5441682600382409, "grad_norm": 1.84805166721344, "learning_rate": 5e-06, "loss": 0.16, "step": 2846 }, { "epoch": 0.544359464627151, "grad_norm": 3.0901503562927246, "learning_rate": 5e-06, "loss": 0.2931, "step": 2847 }, { "epoch": 0.5445506692160612, "grad_norm": 1.3856008052825928, "learning_rate": 5e-06, "loss": 0.0772, "step": 2848 }, { "epoch": 0.5447418738049713, "grad_norm": 1.3322361707687378, "learning_rate": 5e-06, "loss": 0.0977, "step": 2849 }, { "epoch": 0.5449330783938815, "grad_norm": 2.8700478076934814, "learning_rate": 5e-06, "loss": 0.1916, "step": 2850 }, { "epoch": 0.5451242829827916, "grad_norm": 2.482931137084961, "learning_rate": 5e-06, "loss": 0.3922, "step": 2851 }, { "epoch": 0.5453154875717017, "grad_norm": 2.8786845207214355, "learning_rate": 5e-06, "loss": 0.3643, "step": 2852 }, { "epoch": 0.5455066921606119, "grad_norm": 1.5687427520751953, "learning_rate": 5e-06, "loss": 0.1403, "step": 2853 }, { "epoch": 0.545697896749522, "grad_norm": 1.8267560005187988, "learning_rate": 5e-06, "loss": 0.1301, "step": 2854 }, { "epoch": 0.5458891013384322, "grad_norm": 2.0420382022857666, "learning_rate": 5e-06, "loss": 0.1072, "step": 2855 }, { "epoch": 0.5460803059273422, "grad_norm": 2.005113124847412, "learning_rate": 5e-06, "loss": 0.106, "step": 2856 }, { "epoch": 0.5462715105162524, "grad_norm": 2.4532604217529297, "learning_rate": 5e-06, "loss": 0.4297, "step": 2857 }, { "epoch": 0.5464627151051625, "grad_norm": 1.8568321466445923, "learning_rate": 5e-06, "loss": 0.1724, "step": 2858 }, { "epoch": 0.5466539196940726, "grad_norm": 2.2571911811828613, "learning_rate": 5e-06, "loss": 0.2347, "step": 2859 }, { "epoch": 0.5468451242829828, "grad_norm": 1.288896918296814, "learning_rate": 5e-06, "loss": 0.0996, "step": 2860 }, { "epoch": 0.5470363288718929, "grad_norm": 1.4428579807281494, "learning_rate": 5e-06, "loss": 0.1142, "step": 2861 }, { "epoch": 0.5472275334608031, "grad_norm": 1.6780493259429932, "learning_rate": 5e-06, "loss": 0.1161, "step": 2862 }, { "epoch": 0.5474187380497132, "grad_norm": 2.8386378288269043, "learning_rate": 5e-06, "loss": 0.3605, "step": 2863 }, { "epoch": 0.5476099426386233, "grad_norm": 2.42722487449646, "learning_rate": 5e-06, "loss": 0.2646, "step": 2864 }, { "epoch": 0.5478011472275335, "grad_norm": 1.9471642971038818, "learning_rate": 5e-06, "loss": 0.4178, "step": 2865 }, { "epoch": 0.5479923518164436, "grad_norm": 1.1738109588623047, "learning_rate": 5e-06, "loss": 0.101, "step": 2866 }, { "epoch": 0.5481835564053538, "grad_norm": 1.1273823976516724, "learning_rate": 5e-06, "loss": 0.052, "step": 2867 }, { "epoch": 0.5483747609942639, "grad_norm": 0.9076278805732727, "learning_rate": 5e-06, "loss": 0.0353, "step": 2868 }, { "epoch": 0.5485659655831739, "grad_norm": 1.6331543922424316, "learning_rate": 5e-06, "loss": 0.1101, "step": 2869 }, { "epoch": 0.5487571701720841, "grad_norm": 4.053876876831055, "learning_rate": 5e-06, "loss": 0.6708, "step": 2870 }, { "epoch": 0.5489483747609942, "grad_norm": 2.119873523712158, "learning_rate": 5e-06, "loss": 0.1168, "step": 2871 }, { "epoch": 0.5491395793499044, "grad_norm": 1.7409799098968506, "learning_rate": 5e-06, "loss": 0.1408, "step": 2872 }, { "epoch": 0.5493307839388145, "grad_norm": 0.972379207611084, "learning_rate": 5e-06, "loss": 0.0649, "step": 2873 }, { "epoch": 0.5495219885277247, "grad_norm": 3.567614793777466, "learning_rate": 5e-06, "loss": 0.089, "step": 2874 }, { "epoch": 0.5497131931166348, "grad_norm": 1.5383042097091675, "learning_rate": 5e-06, "loss": 0.0641, "step": 2875 }, { "epoch": 0.5499043977055449, "grad_norm": 2.714123010635376, "learning_rate": 5e-06, "loss": 0.3569, "step": 2876 }, { "epoch": 0.5500956022944551, "grad_norm": 2.086832046508789, "learning_rate": 5e-06, "loss": 0.296, "step": 2877 }, { "epoch": 0.5502868068833652, "grad_norm": 2.510014295578003, "learning_rate": 5e-06, "loss": 0.2874, "step": 2878 }, { "epoch": 0.5504780114722754, "grad_norm": 1.494666576385498, "learning_rate": 5e-06, "loss": 0.1435, "step": 2879 }, { "epoch": 0.5506692160611855, "grad_norm": 2.3554635047912598, "learning_rate": 5e-06, "loss": 0.101, "step": 2880 }, { "epoch": 0.5508604206500956, "grad_norm": 1.758353352546692, "learning_rate": 5e-06, "loss": 0.1163, "step": 2881 }, { "epoch": 0.5510516252390057, "grad_norm": 1.8508045673370361, "learning_rate": 5e-06, "loss": 0.2355, "step": 2882 }, { "epoch": 0.5512428298279158, "grad_norm": 1.8111543655395508, "learning_rate": 5e-06, "loss": 0.2789, "step": 2883 }, { "epoch": 0.551434034416826, "grad_norm": 2.13771915435791, "learning_rate": 5e-06, "loss": 0.1331, "step": 2884 }, { "epoch": 0.5516252390057361, "grad_norm": 1.0804390907287598, "learning_rate": 5e-06, "loss": 0.1287, "step": 2885 }, { "epoch": 0.5518164435946463, "grad_norm": 1.9480468034744263, "learning_rate": 5e-06, "loss": 0.1915, "step": 2886 }, { "epoch": 0.5520076481835564, "grad_norm": 1.979048252105713, "learning_rate": 5e-06, "loss": 0.1247, "step": 2887 }, { "epoch": 0.5521988527724665, "grad_norm": 2.188206434249878, "learning_rate": 5e-06, "loss": 0.2204, "step": 2888 }, { "epoch": 0.5523900573613767, "grad_norm": 4.077398777008057, "learning_rate": 5e-06, "loss": 0.4834, "step": 2889 }, { "epoch": 0.5525812619502868, "grad_norm": 1.1730834245681763, "learning_rate": 5e-06, "loss": 0.0996, "step": 2890 }, { "epoch": 0.552772466539197, "grad_norm": 2.131211280822754, "learning_rate": 5e-06, "loss": 0.2714, "step": 2891 }, { "epoch": 0.5529636711281071, "grad_norm": 0.8780372738838196, "learning_rate": 5e-06, "loss": 0.0451, "step": 2892 }, { "epoch": 0.5531548757170172, "grad_norm": 1.3180248737335205, "learning_rate": 5e-06, "loss": 0.0978, "step": 2893 }, { "epoch": 0.5533460803059274, "grad_norm": 2.865992784500122, "learning_rate": 5e-06, "loss": 0.0841, "step": 2894 }, { "epoch": 0.5535372848948374, "grad_norm": 2.895068407058716, "learning_rate": 5e-06, "loss": 0.6579, "step": 2895 }, { "epoch": 0.5537284894837476, "grad_norm": 1.6795144081115723, "learning_rate": 5e-06, "loss": 0.3405, "step": 2896 }, { "epoch": 0.5539196940726577, "grad_norm": 2.2790634632110596, "learning_rate": 5e-06, "loss": 0.189, "step": 2897 }, { "epoch": 0.5541108986615679, "grad_norm": 2.1477177143096924, "learning_rate": 5e-06, "loss": 0.3104, "step": 2898 }, { "epoch": 0.554302103250478, "grad_norm": 1.7578496932983398, "learning_rate": 5e-06, "loss": 0.1977, "step": 2899 }, { "epoch": 0.5544933078393881, "grad_norm": 1.507967233657837, "learning_rate": 5e-06, "loss": 0.0621, "step": 2900 }, { "epoch": 0.5546845124282983, "grad_norm": 2.4397711753845215, "learning_rate": 5e-06, "loss": 0.3803, "step": 2901 }, { "epoch": 0.5548757170172084, "grad_norm": 1.9248405694961548, "learning_rate": 5e-06, "loss": 0.1198, "step": 2902 }, { "epoch": 0.5550669216061186, "grad_norm": 2.194777011871338, "learning_rate": 5e-06, "loss": 0.2595, "step": 2903 }, { "epoch": 0.5552581261950287, "grad_norm": 1.9272375106811523, "learning_rate": 5e-06, "loss": 0.1686, "step": 2904 }, { "epoch": 0.5554493307839388, "grad_norm": 1.9887967109680176, "learning_rate": 5e-06, "loss": 0.0554, "step": 2905 }, { "epoch": 0.555640535372849, "grad_norm": 2.538315773010254, "learning_rate": 5e-06, "loss": 0.1184, "step": 2906 }, { "epoch": 0.555831739961759, "grad_norm": 2.3389220237731934, "learning_rate": 5e-06, "loss": 0.2485, "step": 2907 }, { "epoch": 0.5560229445506693, "grad_norm": 4.87516450881958, "learning_rate": 5e-06, "loss": 0.3167, "step": 2908 }, { "epoch": 0.5562141491395793, "grad_norm": 2.524181365966797, "learning_rate": 5e-06, "loss": 0.3042, "step": 2909 }, { "epoch": 0.5564053537284895, "grad_norm": 1.860914707183838, "learning_rate": 5e-06, "loss": 0.1627, "step": 2910 }, { "epoch": 0.5565965583173996, "grad_norm": 1.0398834943771362, "learning_rate": 5e-06, "loss": 0.1021, "step": 2911 }, { "epoch": 0.5567877629063097, "grad_norm": 2.4790728092193604, "learning_rate": 5e-06, "loss": 0.1407, "step": 2912 }, { "epoch": 0.5569789674952199, "grad_norm": 1.9124921560287476, "learning_rate": 5e-06, "loss": 0.1495, "step": 2913 }, { "epoch": 0.55717017208413, "grad_norm": 3.33526349067688, "learning_rate": 5e-06, "loss": 0.7095, "step": 2914 }, { "epoch": 0.5573613766730402, "grad_norm": 1.0180929899215698, "learning_rate": 5e-06, "loss": 0.1051, "step": 2915 }, { "epoch": 0.5575525812619503, "grad_norm": 3.9493212699890137, "learning_rate": 5e-06, "loss": 0.335, "step": 2916 }, { "epoch": 0.5577437858508604, "grad_norm": 3.441434621810913, "learning_rate": 5e-06, "loss": 0.1332, "step": 2917 }, { "epoch": 0.5579349904397706, "grad_norm": 1.9761435985565186, "learning_rate": 5e-06, "loss": 0.1083, "step": 2918 }, { "epoch": 0.5581261950286807, "grad_norm": 1.1638832092285156, "learning_rate": 5e-06, "loss": 0.0597, "step": 2919 }, { "epoch": 0.5583173996175909, "grad_norm": 3.2977254390716553, "learning_rate": 5e-06, "loss": 0.7461, "step": 2920 }, { "epoch": 0.558508604206501, "grad_norm": 1.3047654628753662, "learning_rate": 5e-06, "loss": 0.0716, "step": 2921 }, { "epoch": 0.558699808795411, "grad_norm": 2.069331169128418, "learning_rate": 5e-06, "loss": 0.2381, "step": 2922 }, { "epoch": 0.5588910133843212, "grad_norm": 1.0748863220214844, "learning_rate": 5e-06, "loss": 0.0702, "step": 2923 }, { "epoch": 0.5590822179732313, "grad_norm": 1.0665748119354248, "learning_rate": 5e-06, "loss": 0.0602, "step": 2924 }, { "epoch": 0.5592734225621415, "grad_norm": 0.8832602500915527, "learning_rate": 5e-06, "loss": 0.0312, "step": 2925 }, { "epoch": 0.5594646271510516, "grad_norm": 2.666064739227295, "learning_rate": 5e-06, "loss": 0.4669, "step": 2926 }, { "epoch": 0.5596558317399618, "grad_norm": 1.9858882427215576, "learning_rate": 5e-06, "loss": 0.1749, "step": 2927 }, { "epoch": 0.5598470363288719, "grad_norm": 2.102036237716675, "learning_rate": 5e-06, "loss": 0.1507, "step": 2928 }, { "epoch": 0.560038240917782, "grad_norm": 1.3998578786849976, "learning_rate": 5e-06, "loss": 0.1824, "step": 2929 }, { "epoch": 0.5602294455066922, "grad_norm": 1.6715340614318848, "learning_rate": 5e-06, "loss": 0.0916, "step": 2930 }, { "epoch": 0.5604206500956023, "grad_norm": 1.6286790370941162, "learning_rate": 5e-06, "loss": 0.1024, "step": 2931 }, { "epoch": 0.5606118546845125, "grad_norm": 2.3067750930786133, "learning_rate": 5e-06, "loss": 0.3355, "step": 2932 }, { "epoch": 0.5608030592734226, "grad_norm": 2.1875245571136475, "learning_rate": 5e-06, "loss": 0.3912, "step": 2933 }, { "epoch": 0.5609942638623326, "grad_norm": 3.647351026535034, "learning_rate": 5e-06, "loss": 0.3519, "step": 2934 }, { "epoch": 0.5611854684512428, "grad_norm": 1.6276153326034546, "learning_rate": 5e-06, "loss": 0.1034, "step": 2935 }, { "epoch": 0.5613766730401529, "grad_norm": 1.9249351024627686, "learning_rate": 5e-06, "loss": 0.3196, "step": 2936 }, { "epoch": 0.5615678776290631, "grad_norm": 2.1490015983581543, "learning_rate": 5e-06, "loss": 0.1547, "step": 2937 }, { "epoch": 0.5617590822179732, "grad_norm": 1.8144192695617676, "learning_rate": 5e-06, "loss": 0.1045, "step": 2938 }, { "epoch": 0.5619502868068834, "grad_norm": 1.4243121147155762, "learning_rate": 5e-06, "loss": 0.1824, "step": 2939 }, { "epoch": 0.5621414913957935, "grad_norm": 2.2722017765045166, "learning_rate": 5e-06, "loss": 0.3895, "step": 2940 }, { "epoch": 0.5623326959847036, "grad_norm": 1.8516919612884521, "learning_rate": 5e-06, "loss": 0.0461, "step": 2941 }, { "epoch": 0.5625239005736138, "grad_norm": 2.4373631477355957, "learning_rate": 5e-06, "loss": 0.2125, "step": 2942 }, { "epoch": 0.5627151051625239, "grad_norm": 2.4550974369049072, "learning_rate": 5e-06, "loss": 0.13, "step": 2943 }, { "epoch": 0.5629063097514341, "grad_norm": 1.8102632761001587, "learning_rate": 5e-06, "loss": 0.0701, "step": 2944 }, { "epoch": 0.5630975143403442, "grad_norm": 2.8042306900024414, "learning_rate": 5e-06, "loss": 0.5347, "step": 2945 }, { "epoch": 0.5632887189292543, "grad_norm": 1.6445013284683228, "learning_rate": 5e-06, "loss": 0.1356, "step": 2946 }, { "epoch": 0.5634799235181644, "grad_norm": 1.7900830507278442, "learning_rate": 5e-06, "loss": 0.2208, "step": 2947 }, { "epoch": 0.5636711281070745, "grad_norm": 2.265395164489746, "learning_rate": 5e-06, "loss": 0.2767, "step": 2948 }, { "epoch": 0.5638623326959847, "grad_norm": 0.9388930201530457, "learning_rate": 5e-06, "loss": 0.0764, "step": 2949 }, { "epoch": 0.5640535372848948, "grad_norm": 2.3902626037597656, "learning_rate": 5e-06, "loss": 0.1168, "step": 2950 }, { "epoch": 0.564244741873805, "grad_norm": 2.793389320373535, "learning_rate": 5e-06, "loss": 0.5114, "step": 2951 }, { "epoch": 0.5644359464627151, "grad_norm": 1.428493618965149, "learning_rate": 5e-06, "loss": 0.1409, "step": 2952 }, { "epoch": 0.5646271510516252, "grad_norm": 2.532050848007202, "learning_rate": 5e-06, "loss": 0.2788, "step": 2953 }, { "epoch": 0.5648183556405354, "grad_norm": 2.010270357131958, "learning_rate": 5e-06, "loss": 0.0942, "step": 2954 }, { "epoch": 0.5650095602294455, "grad_norm": 2.6057164669036865, "learning_rate": 5e-06, "loss": 0.2012, "step": 2955 }, { "epoch": 0.5652007648183557, "grad_norm": 1.232944130897522, "learning_rate": 5e-06, "loss": 0.0479, "step": 2956 }, { "epoch": 0.5653919694072658, "grad_norm": 2.149996519088745, "learning_rate": 5e-06, "loss": 0.2825, "step": 2957 }, { "epoch": 0.5655831739961759, "grad_norm": 1.599258303642273, "learning_rate": 5e-06, "loss": 0.2824, "step": 2958 }, { "epoch": 0.5657743785850861, "grad_norm": 2.3001720905303955, "learning_rate": 5e-06, "loss": 0.4862, "step": 2959 }, { "epoch": 0.5659655831739961, "grad_norm": 3.394070863723755, "learning_rate": 5e-06, "loss": 0.3695, "step": 2960 }, { "epoch": 0.5661567877629063, "grad_norm": 2.2293701171875, "learning_rate": 5e-06, "loss": 0.0652, "step": 2961 }, { "epoch": 0.5663479923518164, "grad_norm": 2.403902769088745, "learning_rate": 5e-06, "loss": 0.17, "step": 2962 }, { "epoch": 0.5665391969407266, "grad_norm": 1.5070533752441406, "learning_rate": 5e-06, "loss": 0.0996, "step": 2963 }, { "epoch": 0.5667304015296367, "grad_norm": 3.1238303184509277, "learning_rate": 5e-06, "loss": 0.3401, "step": 2964 }, { "epoch": 0.5669216061185468, "grad_norm": 2.93849778175354, "learning_rate": 5e-06, "loss": 0.3098, "step": 2965 }, { "epoch": 0.567112810707457, "grad_norm": 1.2622096538543701, "learning_rate": 5e-06, "loss": 0.0951, "step": 2966 }, { "epoch": 0.5673040152963671, "grad_norm": 1.6056170463562012, "learning_rate": 5e-06, "loss": 0.1208, "step": 2967 }, { "epoch": 0.5674952198852773, "grad_norm": 1.6788007020950317, "learning_rate": 5e-06, "loss": 0.128, "step": 2968 }, { "epoch": 0.5676864244741874, "grad_norm": 2.0569911003112793, "learning_rate": 5e-06, "loss": 0.1404, "step": 2969 }, { "epoch": 0.5678776290630975, "grad_norm": 3.8002560138702393, "learning_rate": 5e-06, "loss": 1.0882, "step": 2970 }, { "epoch": 0.5680688336520077, "grad_norm": 1.7185357809066772, "learning_rate": 5e-06, "loss": 0.1324, "step": 2971 }, { "epoch": 0.5682600382409178, "grad_norm": 1.3213095664978027, "learning_rate": 5e-06, "loss": 0.104, "step": 2972 }, { "epoch": 0.568451242829828, "grad_norm": 1.4530354738235474, "learning_rate": 5e-06, "loss": 0.1042, "step": 2973 }, { "epoch": 0.568642447418738, "grad_norm": 3.0464630126953125, "learning_rate": 5e-06, "loss": 0.2976, "step": 2974 }, { "epoch": 0.5688336520076482, "grad_norm": 2.337543249130249, "learning_rate": 5e-06, "loss": 0.135, "step": 2975 }, { "epoch": 0.5690248565965583, "grad_norm": 2.6473655700683594, "learning_rate": 5e-06, "loss": 0.4181, "step": 2976 }, { "epoch": 0.5692160611854684, "grad_norm": 2.9804956912994385, "learning_rate": 5e-06, "loss": 0.5121, "step": 2977 }, { "epoch": 0.5694072657743786, "grad_norm": 1.9707882404327393, "learning_rate": 5e-06, "loss": 0.2911, "step": 2978 }, { "epoch": 0.5695984703632887, "grad_norm": 1.867041826248169, "learning_rate": 5e-06, "loss": 0.2345, "step": 2979 }, { "epoch": 0.5697896749521989, "grad_norm": 1.9662240743637085, "learning_rate": 5e-06, "loss": 0.1211, "step": 2980 }, { "epoch": 0.569980879541109, "grad_norm": 1.9298244714736938, "learning_rate": 5e-06, "loss": 0.0948, "step": 2981 }, { "epoch": 0.5701720841300191, "grad_norm": 2.791269063949585, "learning_rate": 5e-06, "loss": 0.3562, "step": 2982 }, { "epoch": 0.5703632887189293, "grad_norm": 1.7784298658370972, "learning_rate": 5e-06, "loss": 0.153, "step": 2983 }, { "epoch": 0.5705544933078394, "grad_norm": 2.0003790855407715, "learning_rate": 5e-06, "loss": 0.3397, "step": 2984 }, { "epoch": 0.5707456978967496, "grad_norm": 1.470927119255066, "learning_rate": 5e-06, "loss": 0.1265, "step": 2985 }, { "epoch": 0.5709369024856596, "grad_norm": 2.981893539428711, "learning_rate": 5e-06, "loss": 0.3084, "step": 2986 }, { "epoch": 0.5711281070745697, "grad_norm": 1.706616997718811, "learning_rate": 5e-06, "loss": 0.1126, "step": 2987 }, { "epoch": 0.5713193116634799, "grad_norm": 1.6704820394515991, "learning_rate": 5e-06, "loss": 0.1063, "step": 2988 }, { "epoch": 0.57151051625239, "grad_norm": 1.1496891975402832, "learning_rate": 5e-06, "loss": 0.1615, "step": 2989 }, { "epoch": 0.5717017208413002, "grad_norm": 2.2269532680511475, "learning_rate": 5e-06, "loss": 0.1694, "step": 2990 }, { "epoch": 0.5718929254302103, "grad_norm": 0.9735349416732788, "learning_rate": 5e-06, "loss": 0.0651, "step": 2991 }, { "epoch": 0.5720841300191205, "grad_norm": 1.315924048423767, "learning_rate": 5e-06, "loss": 0.0818, "step": 2992 }, { "epoch": 0.5722753346080306, "grad_norm": 2.6886305809020996, "learning_rate": 5e-06, "loss": 0.4085, "step": 2993 }, { "epoch": 0.5724665391969407, "grad_norm": 1.624668836593628, "learning_rate": 5e-06, "loss": 0.1512, "step": 2994 }, { "epoch": 0.5726577437858509, "grad_norm": 2.3353240489959717, "learning_rate": 5e-06, "loss": 0.2948, "step": 2995 }, { "epoch": 0.572848948374761, "grad_norm": 2.4698197841644287, "learning_rate": 5e-06, "loss": 0.2742, "step": 2996 }, { "epoch": 0.5730401529636712, "grad_norm": 2.138054370880127, "learning_rate": 5e-06, "loss": 0.3292, "step": 2997 }, { "epoch": 0.5732313575525813, "grad_norm": 2.4792983531951904, "learning_rate": 5e-06, "loss": 0.2956, "step": 2998 }, { "epoch": 0.5734225621414913, "grad_norm": 1.899868130683899, "learning_rate": 5e-06, "loss": 0.1315, "step": 2999 }, { "epoch": 0.5736137667304015, "grad_norm": 3.1172773838043213, "learning_rate": 5e-06, "loss": 0.2009, "step": 3000 }, { "epoch": 0.5736137667304015, "eval_runtime": 817.4268, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.235, "step": 3000 }, { "epoch": 0.5738049713193116, "grad_norm": 2.6903367042541504, "learning_rate": 5e-06, "loss": 0.1569, "step": 3001 }, { "epoch": 0.5739961759082218, "grad_norm": 1.9376270771026611, "learning_rate": 5e-06, "loss": 0.2842, "step": 3002 }, { "epoch": 0.5741873804971319, "grad_norm": 1.1127880811691284, "learning_rate": 5e-06, "loss": 0.0979, "step": 3003 }, { "epoch": 0.5743785850860421, "grad_norm": 1.381749153137207, "learning_rate": 5e-06, "loss": 0.1271, "step": 3004 }, { "epoch": 0.5745697896749522, "grad_norm": 3.1074182987213135, "learning_rate": 5e-06, "loss": 0.1865, "step": 3005 }, { "epoch": 0.5747609942638623, "grad_norm": 1.8517677783966064, "learning_rate": 5e-06, "loss": 0.0811, "step": 3006 }, { "epoch": 0.5749521988527725, "grad_norm": 2.8040478229522705, "learning_rate": 5e-06, "loss": 0.4882, "step": 3007 }, { "epoch": 0.5751434034416826, "grad_norm": 1.2875118255615234, "learning_rate": 5e-06, "loss": 0.1718, "step": 3008 }, { "epoch": 0.5753346080305928, "grad_norm": 1.20366370677948, "learning_rate": 5e-06, "loss": 0.0758, "step": 3009 }, { "epoch": 0.5755258126195029, "grad_norm": 1.6420012712478638, "learning_rate": 5e-06, "loss": 0.1576, "step": 3010 }, { "epoch": 0.575717017208413, "grad_norm": 1.7481510639190674, "learning_rate": 5e-06, "loss": 0.0927, "step": 3011 }, { "epoch": 0.5759082217973231, "grad_norm": 2.993232011795044, "learning_rate": 5e-06, "loss": 0.2496, "step": 3012 }, { "epoch": 0.5760994263862332, "grad_norm": 1.9660049676895142, "learning_rate": 5e-06, "loss": 0.2413, "step": 3013 }, { "epoch": 0.5762906309751434, "grad_norm": 1.9961297512054443, "learning_rate": 5e-06, "loss": 0.3713, "step": 3014 }, { "epoch": 0.5764818355640535, "grad_norm": 1.5734777450561523, "learning_rate": 5e-06, "loss": 0.1268, "step": 3015 }, { "epoch": 0.5766730401529637, "grad_norm": 1.7662030458450317, "learning_rate": 5e-06, "loss": 0.2618, "step": 3016 }, { "epoch": 0.5768642447418738, "grad_norm": 1.6102837324142456, "learning_rate": 5e-06, "loss": 0.1833, "step": 3017 }, { "epoch": 0.5770554493307839, "grad_norm": 2.5440475940704346, "learning_rate": 5e-06, "loss": 0.1946, "step": 3018 }, { "epoch": 0.5772466539196941, "grad_norm": 2.081432580947876, "learning_rate": 5e-06, "loss": 0.1695, "step": 3019 }, { "epoch": 0.5774378585086042, "grad_norm": 5.738420009613037, "learning_rate": 5e-06, "loss": 0.2006, "step": 3020 }, { "epoch": 0.5776290630975144, "grad_norm": 1.8196947574615479, "learning_rate": 5e-06, "loss": 0.117, "step": 3021 }, { "epoch": 0.5778202676864245, "grad_norm": 1.6107805967330933, "learning_rate": 5e-06, "loss": 0.2009, "step": 3022 }, { "epoch": 0.5780114722753346, "grad_norm": 1.1476129293441772, "learning_rate": 5e-06, "loss": 0.0376, "step": 3023 }, { "epoch": 0.5782026768642448, "grad_norm": 2.0225865840911865, "learning_rate": 5e-06, "loss": 0.1687, "step": 3024 }, { "epoch": 0.5783938814531548, "grad_norm": 2.241004228591919, "learning_rate": 5e-06, "loss": 0.2727, "step": 3025 }, { "epoch": 0.578585086042065, "grad_norm": 2.0061728954315186, "learning_rate": 5e-06, "loss": 0.1177, "step": 3026 }, { "epoch": 0.5787762906309751, "grad_norm": 2.0233190059661865, "learning_rate": 5e-06, "loss": 0.1115, "step": 3027 }, { "epoch": 0.5789674952198853, "grad_norm": 1.3441364765167236, "learning_rate": 5e-06, "loss": 0.0912, "step": 3028 }, { "epoch": 0.5791586998087954, "grad_norm": 1.7095820903778076, "learning_rate": 5e-06, "loss": 0.1289, "step": 3029 }, { "epoch": 0.5793499043977055, "grad_norm": 1.2957032918930054, "learning_rate": 5e-06, "loss": 0.0431, "step": 3030 }, { "epoch": 0.5795411089866157, "grad_norm": 2.276420831680298, "learning_rate": 5e-06, "loss": 0.2461, "step": 3031 }, { "epoch": 0.5797323135755258, "grad_norm": 2.8638715744018555, "learning_rate": 5e-06, "loss": 0.4221, "step": 3032 }, { "epoch": 0.579923518164436, "grad_norm": 1.7681334018707275, "learning_rate": 5e-06, "loss": 0.218, "step": 3033 }, { "epoch": 0.5801147227533461, "grad_norm": 2.0881717205047607, "learning_rate": 5e-06, "loss": 0.3088, "step": 3034 }, { "epoch": 0.5803059273422562, "grad_norm": 1.6630779504776, "learning_rate": 5e-06, "loss": 0.1146, "step": 3035 }, { "epoch": 0.5804971319311664, "grad_norm": 1.7102560997009277, "learning_rate": 5e-06, "loss": 0.123, "step": 3036 }, { "epoch": 0.5806883365200765, "grad_norm": 1.850621223449707, "learning_rate": 5e-06, "loss": 0.0667, "step": 3037 }, { "epoch": 0.5808795411089867, "grad_norm": 2.2591307163238525, "learning_rate": 5e-06, "loss": 0.1723, "step": 3038 }, { "epoch": 0.5810707456978967, "grad_norm": 3.2868549823760986, "learning_rate": 5e-06, "loss": 0.7069, "step": 3039 }, { "epoch": 0.5812619502868069, "grad_norm": 2.1534550189971924, "learning_rate": 5e-06, "loss": 0.1613, "step": 3040 }, { "epoch": 0.581453154875717, "grad_norm": 2.671644687652588, "learning_rate": 5e-06, "loss": 0.6401, "step": 3041 }, { "epoch": 0.5816443594646271, "grad_norm": 1.346547245979309, "learning_rate": 5e-06, "loss": 0.0957, "step": 3042 }, { "epoch": 0.5818355640535373, "grad_norm": 1.1658374071121216, "learning_rate": 5e-06, "loss": 0.1317, "step": 3043 }, { "epoch": 0.5820267686424474, "grad_norm": 1.5041284561157227, "learning_rate": 5e-06, "loss": 0.1007, "step": 3044 }, { "epoch": 0.5822179732313576, "grad_norm": 2.461108922958374, "learning_rate": 5e-06, "loss": 0.4222, "step": 3045 }, { "epoch": 0.5824091778202677, "grad_norm": 1.9935189485549927, "learning_rate": 5e-06, "loss": 0.2012, "step": 3046 }, { "epoch": 0.5826003824091778, "grad_norm": 3.317734718322754, "learning_rate": 5e-06, "loss": 0.2825, "step": 3047 }, { "epoch": 0.582791586998088, "grad_norm": 2.7666308879852295, "learning_rate": 5e-06, "loss": 0.1017, "step": 3048 }, { "epoch": 0.5829827915869981, "grad_norm": 1.4334791898727417, "learning_rate": 5e-06, "loss": 0.0966, "step": 3049 }, { "epoch": 0.5831739961759083, "grad_norm": 2.323887825012207, "learning_rate": 5e-06, "loss": 0.1215, "step": 3050 }, { "epoch": 0.5833652007648183, "grad_norm": 2.63179874420166, "learning_rate": 5e-06, "loss": 0.5225, "step": 3051 }, { "epoch": 0.5835564053537284, "grad_norm": 2.2976789474487305, "learning_rate": 5e-06, "loss": 0.3479, "step": 3052 }, { "epoch": 0.5837476099426386, "grad_norm": 1.7970479726791382, "learning_rate": 5e-06, "loss": 0.0878, "step": 3053 }, { "epoch": 0.5839388145315487, "grad_norm": 2.2620391845703125, "learning_rate": 5e-06, "loss": 0.2336, "step": 3054 }, { "epoch": 0.5841300191204589, "grad_norm": 1.0997439622879028, "learning_rate": 5e-06, "loss": 0.0781, "step": 3055 }, { "epoch": 0.584321223709369, "grad_norm": 1.8581743240356445, "learning_rate": 5e-06, "loss": 0.0843, "step": 3056 }, { "epoch": 0.5845124282982792, "grad_norm": 2.1834423542022705, "learning_rate": 5e-06, "loss": 0.1402, "step": 3057 }, { "epoch": 0.5847036328871893, "grad_norm": 1.7100716829299927, "learning_rate": 5e-06, "loss": 0.194, "step": 3058 }, { "epoch": 0.5848948374760994, "grad_norm": 3.5780153274536133, "learning_rate": 5e-06, "loss": 0.1706, "step": 3059 }, { "epoch": 0.5850860420650096, "grad_norm": 1.2528570890426636, "learning_rate": 5e-06, "loss": 0.0991, "step": 3060 }, { "epoch": 0.5852772466539197, "grad_norm": 1.7608826160430908, "learning_rate": 5e-06, "loss": 0.1739, "step": 3061 }, { "epoch": 0.5854684512428299, "grad_norm": 2.2554149627685547, "learning_rate": 5e-06, "loss": 0.1713, "step": 3062 }, { "epoch": 0.58565965583174, "grad_norm": 1.9988476037979126, "learning_rate": 5e-06, "loss": 0.3087, "step": 3063 }, { "epoch": 0.58585086042065, "grad_norm": 2.21211314201355, "learning_rate": 5e-06, "loss": 0.273, "step": 3064 }, { "epoch": 0.5860420650095602, "grad_norm": 1.947128415107727, "learning_rate": 5e-06, "loss": 0.2207, "step": 3065 }, { "epoch": 0.5862332695984703, "grad_norm": 3.2861714363098145, "learning_rate": 5e-06, "loss": 0.4373, "step": 3066 }, { "epoch": 0.5864244741873805, "grad_norm": 1.3944896459579468, "learning_rate": 5e-06, "loss": 0.0627, "step": 3067 }, { "epoch": 0.5866156787762906, "grad_norm": 2.653266429901123, "learning_rate": 5e-06, "loss": 0.1445, "step": 3068 }, { "epoch": 0.5868068833652008, "grad_norm": 9.50674819946289, "learning_rate": 5e-06, "loss": 0.2661, "step": 3069 }, { "epoch": 0.5869980879541109, "grad_norm": 2.211970329284668, "learning_rate": 5e-06, "loss": 0.2697, "step": 3070 }, { "epoch": 0.587189292543021, "grad_norm": 1.0039657354354858, "learning_rate": 5e-06, "loss": 0.0918, "step": 3071 }, { "epoch": 0.5873804971319312, "grad_norm": 2.002715587615967, "learning_rate": 5e-06, "loss": 0.2972, "step": 3072 }, { "epoch": 0.5875717017208413, "grad_norm": 1.1226190328598022, "learning_rate": 5e-06, "loss": 0.0923, "step": 3073 }, { "epoch": 0.5877629063097515, "grad_norm": 1.6911029815673828, "learning_rate": 5e-06, "loss": 0.0904, "step": 3074 }, { "epoch": 0.5879541108986616, "grad_norm": 2.079878568649292, "learning_rate": 5e-06, "loss": 0.0876, "step": 3075 }, { "epoch": 0.5881453154875717, "grad_norm": 2.1699066162109375, "learning_rate": 5e-06, "loss": 0.4323, "step": 3076 }, { "epoch": 0.5883365200764819, "grad_norm": 1.8481723070144653, "learning_rate": 5e-06, "loss": 0.208, "step": 3077 }, { "epoch": 0.5885277246653919, "grad_norm": 1.2353150844573975, "learning_rate": 5e-06, "loss": 0.0871, "step": 3078 }, { "epoch": 0.5887189292543021, "grad_norm": 3.055333375930786, "learning_rate": 5e-06, "loss": 0.1118, "step": 3079 }, { "epoch": 0.5889101338432122, "grad_norm": 1.535212755203247, "learning_rate": 5e-06, "loss": 0.2145, "step": 3080 }, { "epoch": 0.5891013384321224, "grad_norm": 3.490246295928955, "learning_rate": 5e-06, "loss": 0.0861, "step": 3081 }, { "epoch": 0.5892925430210325, "grad_norm": 1.9713093042373657, "learning_rate": 5e-06, "loss": 0.2104, "step": 3082 }, { "epoch": 0.5894837476099426, "grad_norm": 2.162919282913208, "learning_rate": 5e-06, "loss": 0.3863, "step": 3083 }, { "epoch": 0.5896749521988528, "grad_norm": 1.0454967021942139, "learning_rate": 5e-06, "loss": 0.0944, "step": 3084 }, { "epoch": 0.5898661567877629, "grad_norm": 1.190918207168579, "learning_rate": 5e-06, "loss": 0.1009, "step": 3085 }, { "epoch": 0.5900573613766731, "grad_norm": 1.4480558633804321, "learning_rate": 5e-06, "loss": 0.0612, "step": 3086 }, { "epoch": 0.5902485659655832, "grad_norm": 2.896592140197754, "learning_rate": 5e-06, "loss": 0.121, "step": 3087 }, { "epoch": 0.5904397705544933, "grad_norm": 1.869307279586792, "learning_rate": 5e-06, "loss": 0.1258, "step": 3088 }, { "epoch": 0.5906309751434035, "grad_norm": 1.9074268341064453, "learning_rate": 5e-06, "loss": 0.1778, "step": 3089 }, { "epoch": 0.5908221797323135, "grad_norm": 2.827545166015625, "learning_rate": 5e-06, "loss": 0.5372, "step": 3090 }, { "epoch": 0.5910133843212237, "grad_norm": 3.0285253524780273, "learning_rate": 5e-06, "loss": 0.1217, "step": 3091 }, { "epoch": 0.5912045889101338, "grad_norm": 1.5774853229522705, "learning_rate": 5e-06, "loss": 0.2967, "step": 3092 }, { "epoch": 0.591395793499044, "grad_norm": 1.55910062789917, "learning_rate": 5e-06, "loss": 0.0962, "step": 3093 }, { "epoch": 0.5915869980879541, "grad_norm": 2.466980457305908, "learning_rate": 5e-06, "loss": 0.1794, "step": 3094 }, { "epoch": 0.5917782026768642, "grad_norm": 1.3452435731887817, "learning_rate": 5e-06, "loss": 0.1445, "step": 3095 }, { "epoch": 0.5919694072657744, "grad_norm": 1.193184494972229, "learning_rate": 5e-06, "loss": 0.1735, "step": 3096 }, { "epoch": 0.5921606118546845, "grad_norm": 4.413208961486816, "learning_rate": 5e-06, "loss": 0.1679, "step": 3097 }, { "epoch": 0.5923518164435947, "grad_norm": 1.8981341123580933, "learning_rate": 5e-06, "loss": 0.1866, "step": 3098 }, { "epoch": 0.5925430210325048, "grad_norm": 1.924362301826477, "learning_rate": 5e-06, "loss": 0.1081, "step": 3099 }, { "epoch": 0.5927342256214149, "grad_norm": 3.043461561203003, "learning_rate": 5e-06, "loss": 0.1592, "step": 3100 }, { "epoch": 0.5929254302103251, "grad_norm": 2.672950506210327, "learning_rate": 5e-06, "loss": 0.5285, "step": 3101 }, { "epoch": 0.5931166347992352, "grad_norm": 2.3183720111846924, "learning_rate": 5e-06, "loss": 0.5674, "step": 3102 }, { "epoch": 0.5933078393881454, "grad_norm": 2.15521502494812, "learning_rate": 5e-06, "loss": 0.23, "step": 3103 }, { "epoch": 0.5934990439770554, "grad_norm": 2.041330575942993, "learning_rate": 5e-06, "loss": 0.1286, "step": 3104 }, { "epoch": 0.5936902485659655, "grad_norm": 2.0473520755767822, "learning_rate": 5e-06, "loss": 0.3554, "step": 3105 }, { "epoch": 0.5938814531548757, "grad_norm": 1.3054406642913818, "learning_rate": 5e-06, "loss": 0.1113, "step": 3106 }, { "epoch": 0.5940726577437858, "grad_norm": 2.71018385887146, "learning_rate": 5e-06, "loss": 0.3178, "step": 3107 }, { "epoch": 0.594263862332696, "grad_norm": 1.7151776552200317, "learning_rate": 5e-06, "loss": 0.1841, "step": 3108 }, { "epoch": 0.5944550669216061, "grad_norm": 2.065749168395996, "learning_rate": 5e-06, "loss": 0.157, "step": 3109 }, { "epoch": 0.5946462715105163, "grad_norm": 0.8944826126098633, "learning_rate": 5e-06, "loss": 0.0657, "step": 3110 }, { "epoch": 0.5948374760994264, "grad_norm": 1.4383631944656372, "learning_rate": 5e-06, "loss": 0.0874, "step": 3111 }, { "epoch": 0.5950286806883365, "grad_norm": 1.2038031816482544, "learning_rate": 5e-06, "loss": 0.0653, "step": 3112 }, { "epoch": 0.5952198852772467, "grad_norm": 1.0844436883926392, "learning_rate": 5e-06, "loss": 0.0806, "step": 3113 }, { "epoch": 0.5954110898661568, "grad_norm": 1.6793997287750244, "learning_rate": 5e-06, "loss": 0.1552, "step": 3114 }, { "epoch": 0.595602294455067, "grad_norm": 2.1270453929901123, "learning_rate": 5e-06, "loss": 0.3629, "step": 3115 }, { "epoch": 0.595793499043977, "grad_norm": 2.146496295928955, "learning_rate": 5e-06, "loss": 0.317, "step": 3116 }, { "epoch": 0.5959847036328871, "grad_norm": 1.409468173980713, "learning_rate": 5e-06, "loss": 0.0948, "step": 3117 }, { "epoch": 0.5961759082217973, "grad_norm": 1.9205509424209595, "learning_rate": 5e-06, "loss": 0.2199, "step": 3118 }, { "epoch": 0.5963671128107074, "grad_norm": 1.7777714729309082, "learning_rate": 5e-06, "loss": 0.1007, "step": 3119 }, { "epoch": 0.5965583173996176, "grad_norm": 3.087475061416626, "learning_rate": 5e-06, "loss": 0.3876, "step": 3120 }, { "epoch": 0.5967495219885277, "grad_norm": 2.7254316806793213, "learning_rate": 5e-06, "loss": 0.5461, "step": 3121 }, { "epoch": 0.5969407265774379, "grad_norm": 1.9217915534973145, "learning_rate": 5e-06, "loss": 0.2579, "step": 3122 }, { "epoch": 0.597131931166348, "grad_norm": 2.051192283630371, "learning_rate": 5e-06, "loss": 0.2243, "step": 3123 }, { "epoch": 0.5973231357552581, "grad_norm": 1.4439209699630737, "learning_rate": 5e-06, "loss": 0.0946, "step": 3124 }, { "epoch": 0.5975143403441683, "grad_norm": 0.8825449347496033, "learning_rate": 5e-06, "loss": 0.0568, "step": 3125 }, { "epoch": 0.5977055449330784, "grad_norm": 13.12146282196045, "learning_rate": 5e-06, "loss": 0.5537, "step": 3126 }, { "epoch": 0.5978967495219886, "grad_norm": 1.4262953996658325, "learning_rate": 5e-06, "loss": 0.1899, "step": 3127 }, { "epoch": 0.5980879541108987, "grad_norm": 2.342287063598633, "learning_rate": 5e-06, "loss": 0.312, "step": 3128 }, { "epoch": 0.5982791586998087, "grad_norm": 2.072007179260254, "learning_rate": 5e-06, "loss": 0.1151, "step": 3129 }, { "epoch": 0.5984703632887189, "grad_norm": 4.116969108581543, "learning_rate": 5e-06, "loss": 0.257, "step": 3130 }, { "epoch": 0.598661567877629, "grad_norm": 1.1240900754928589, "learning_rate": 5e-06, "loss": 0.0532, "step": 3131 }, { "epoch": 0.5988527724665392, "grad_norm": 1.6373003721237183, "learning_rate": 5e-06, "loss": 0.177, "step": 3132 }, { "epoch": 0.5990439770554493, "grad_norm": 2.227210521697998, "learning_rate": 5e-06, "loss": 0.1885, "step": 3133 }, { "epoch": 0.5992351816443595, "grad_norm": 1.2359378337860107, "learning_rate": 5e-06, "loss": 0.1457, "step": 3134 }, { "epoch": 0.5994263862332696, "grad_norm": 1.923424482345581, "learning_rate": 5e-06, "loss": 0.247, "step": 3135 }, { "epoch": 0.5996175908221797, "grad_norm": 1.0987998247146606, "learning_rate": 5e-06, "loss": 0.1003, "step": 3136 }, { "epoch": 0.5998087954110899, "grad_norm": 2.2175567150115967, "learning_rate": 5e-06, "loss": 0.1486, "step": 3137 }, { "epoch": 0.6, "grad_norm": 2.118872880935669, "learning_rate": 5e-06, "loss": 0.1863, "step": 3138 }, { "epoch": 0.6001912045889102, "grad_norm": 2.241590738296509, "learning_rate": 5e-06, "loss": 0.3768, "step": 3139 }, { "epoch": 0.6003824091778203, "grad_norm": 2.5006096363067627, "learning_rate": 5e-06, "loss": 0.3921, "step": 3140 }, { "epoch": 0.6005736137667304, "grad_norm": 1.5924909114837646, "learning_rate": 5e-06, "loss": 0.1555, "step": 3141 }, { "epoch": 0.6007648183556406, "grad_norm": 2.0221755504608154, "learning_rate": 5e-06, "loss": 0.1312, "step": 3142 }, { "epoch": 0.6009560229445506, "grad_norm": 2.870837450027466, "learning_rate": 5e-06, "loss": 0.0977, "step": 3143 }, { "epoch": 0.6011472275334608, "grad_norm": 2.1391944885253906, "learning_rate": 5e-06, "loss": 0.1124, "step": 3144 }, { "epoch": 0.6013384321223709, "grad_norm": 2.3611199855804443, "learning_rate": 5e-06, "loss": 0.5372, "step": 3145 }, { "epoch": 0.6015296367112811, "grad_norm": 2.637998104095459, "learning_rate": 5e-06, "loss": 0.4384, "step": 3146 }, { "epoch": 0.6017208413001912, "grad_norm": 3.041144609451294, "learning_rate": 5e-06, "loss": 0.2068, "step": 3147 }, { "epoch": 0.6019120458891013, "grad_norm": 2.0565388202667236, "learning_rate": 5e-06, "loss": 0.2483, "step": 3148 }, { "epoch": 0.6021032504780115, "grad_norm": 2.326119899749756, "learning_rate": 5e-06, "loss": 0.3495, "step": 3149 }, { "epoch": 0.6022944550669216, "grad_norm": 2.5809388160705566, "learning_rate": 5e-06, "loss": 0.1141, "step": 3150 }, { "epoch": 0.6024856596558318, "grad_norm": 2.951171398162842, "learning_rate": 5e-06, "loss": 0.4543, "step": 3151 }, { "epoch": 0.6026768642447419, "grad_norm": 1.1240613460540771, "learning_rate": 5e-06, "loss": 0.0896, "step": 3152 }, { "epoch": 0.602868068833652, "grad_norm": 1.6378625631332397, "learning_rate": 5e-06, "loss": 0.1978, "step": 3153 }, { "epoch": 0.6030592734225622, "grad_norm": 2.002222776412964, "learning_rate": 5e-06, "loss": 0.1737, "step": 3154 }, { "epoch": 0.6032504780114722, "grad_norm": 1.2147256135940552, "learning_rate": 5e-06, "loss": 0.0322, "step": 3155 }, { "epoch": 0.6034416826003824, "grad_norm": 1.5315823554992676, "learning_rate": 5e-06, "loss": 0.0978, "step": 3156 }, { "epoch": 0.6036328871892925, "grad_norm": 2.548656463623047, "learning_rate": 5e-06, "loss": 0.376, "step": 3157 }, { "epoch": 0.6038240917782027, "grad_norm": 2.330667495727539, "learning_rate": 5e-06, "loss": 0.3136, "step": 3158 }, { "epoch": 0.6040152963671128, "grad_norm": 3.249701738357544, "learning_rate": 5e-06, "loss": 0.7531, "step": 3159 }, { "epoch": 0.6042065009560229, "grad_norm": 1.688836932182312, "learning_rate": 5e-06, "loss": 0.1473, "step": 3160 }, { "epoch": 0.6043977055449331, "grad_norm": 2.461451530456543, "learning_rate": 5e-06, "loss": 0.246, "step": 3161 }, { "epoch": 0.6045889101338432, "grad_norm": 1.6401633024215698, "learning_rate": 5e-06, "loss": 0.094, "step": 3162 }, { "epoch": 0.6047801147227534, "grad_norm": 2.8850514888763428, "learning_rate": 5e-06, "loss": 0.466, "step": 3163 }, { "epoch": 0.6049713193116635, "grad_norm": 3.2717480659484863, "learning_rate": 5e-06, "loss": 0.5378, "step": 3164 }, { "epoch": 0.6051625239005736, "grad_norm": 3.4596874713897705, "learning_rate": 5e-06, "loss": 0.3068, "step": 3165 }, { "epoch": 0.6053537284894838, "grad_norm": 1.0679250955581665, "learning_rate": 5e-06, "loss": 0.0964, "step": 3166 }, { "epoch": 0.6055449330783939, "grad_norm": 1.2040866613388062, "learning_rate": 5e-06, "loss": 0.0847, "step": 3167 }, { "epoch": 0.605736137667304, "grad_norm": 3.1101861000061035, "learning_rate": 5e-06, "loss": 0.2559, "step": 3168 }, { "epoch": 0.6059273422562141, "grad_norm": 2.351142168045044, "learning_rate": 5e-06, "loss": 0.1671, "step": 3169 }, { "epoch": 0.6061185468451242, "grad_norm": 1.722732663154602, "learning_rate": 5e-06, "loss": 0.2063, "step": 3170 }, { "epoch": 0.6063097514340344, "grad_norm": 2.38865327835083, "learning_rate": 5e-06, "loss": 0.4377, "step": 3171 }, { "epoch": 0.6065009560229445, "grad_norm": 2.085334300994873, "learning_rate": 5e-06, "loss": 0.1241, "step": 3172 }, { "epoch": 0.6066921606118547, "grad_norm": 1.9132529497146606, "learning_rate": 5e-06, "loss": 0.1219, "step": 3173 }, { "epoch": 0.6068833652007648, "grad_norm": 2.437124013900757, "learning_rate": 5e-06, "loss": 0.1314, "step": 3174 }, { "epoch": 0.607074569789675, "grad_norm": 2.508739471435547, "learning_rate": 5e-06, "loss": 0.1578, "step": 3175 }, { "epoch": 0.6072657743785851, "grad_norm": 2.6360321044921875, "learning_rate": 5e-06, "loss": 0.2388, "step": 3176 }, { "epoch": 0.6074569789674952, "grad_norm": 1.8685966730117798, "learning_rate": 5e-06, "loss": 0.2663, "step": 3177 }, { "epoch": 0.6076481835564054, "grad_norm": 1.755370020866394, "learning_rate": 5e-06, "loss": 0.1159, "step": 3178 }, { "epoch": 0.6078393881453155, "grad_norm": 1.2698965072631836, "learning_rate": 5e-06, "loss": 0.1185, "step": 3179 }, { "epoch": 0.6080305927342257, "grad_norm": 3.567136764526367, "learning_rate": 5e-06, "loss": 0.2535, "step": 3180 }, { "epoch": 0.6082217973231357, "grad_norm": 1.5889770984649658, "learning_rate": 5e-06, "loss": 0.0809, "step": 3181 }, { "epoch": 0.6084130019120458, "grad_norm": 1.9720900058746338, "learning_rate": 5e-06, "loss": 0.1599, "step": 3182 }, { "epoch": 0.608604206500956, "grad_norm": 1.6620185375213623, "learning_rate": 5e-06, "loss": 0.2844, "step": 3183 }, { "epoch": 0.6087954110898661, "grad_norm": 2.424464464187622, "learning_rate": 5e-06, "loss": 0.2883, "step": 3184 }, { "epoch": 0.6089866156787763, "grad_norm": 3.4349234104156494, "learning_rate": 5e-06, "loss": 0.4843, "step": 3185 }, { "epoch": 0.6091778202676864, "grad_norm": 1.0751681327819824, "learning_rate": 5e-06, "loss": 0.0913, "step": 3186 }, { "epoch": 0.6093690248565966, "grad_norm": 1.671789526939392, "learning_rate": 5e-06, "loss": 0.1138, "step": 3187 }, { "epoch": 0.6095602294455067, "grad_norm": 2.0283384323120117, "learning_rate": 5e-06, "loss": 0.1114, "step": 3188 }, { "epoch": 0.6097514340344168, "grad_norm": 2.488652229309082, "learning_rate": 5e-06, "loss": 0.3425, "step": 3189 }, { "epoch": 0.609942638623327, "grad_norm": 1.1259547472000122, "learning_rate": 5e-06, "loss": 0.1119, "step": 3190 }, { "epoch": 0.6101338432122371, "grad_norm": 5.288424968719482, "learning_rate": 5e-06, "loss": 0.3013, "step": 3191 }, { "epoch": 0.6103250478011473, "grad_norm": 1.1626574993133545, "learning_rate": 5e-06, "loss": 0.1034, "step": 3192 }, { "epoch": 0.6105162523900574, "grad_norm": 2.4435818195343018, "learning_rate": 5e-06, "loss": 0.2786, "step": 3193 }, { "epoch": 0.6107074569789674, "grad_norm": 1.540985107421875, "learning_rate": 5e-06, "loss": 0.0953, "step": 3194 }, { "epoch": 0.6108986615678776, "grad_norm": 2.901230812072754, "learning_rate": 5e-06, "loss": 0.4028, "step": 3195 }, { "epoch": 0.6110898661567877, "grad_norm": 1.308781385421753, "learning_rate": 5e-06, "loss": 0.1013, "step": 3196 }, { "epoch": 0.6112810707456979, "grad_norm": 1.9014428853988647, "learning_rate": 5e-06, "loss": 0.2525, "step": 3197 }, { "epoch": 0.611472275334608, "grad_norm": 1.3016637563705444, "learning_rate": 5e-06, "loss": 0.1119, "step": 3198 }, { "epoch": 0.6116634799235182, "grad_norm": 1.2226108312606812, "learning_rate": 5e-06, "loss": 0.1074, "step": 3199 }, { "epoch": 0.6118546845124283, "grad_norm": 1.6033170223236084, "learning_rate": 5e-06, "loss": 0.0845, "step": 3200 }, { "epoch": 0.6120458891013384, "grad_norm": 1.5445128679275513, "learning_rate": 5e-06, "loss": 0.1334, "step": 3201 }, { "epoch": 0.6122370936902486, "grad_norm": 2.2498819828033447, "learning_rate": 5e-06, "loss": 0.3817, "step": 3202 }, { "epoch": 0.6124282982791587, "grad_norm": 1.170806646347046, "learning_rate": 5e-06, "loss": 0.0951, "step": 3203 }, { "epoch": 0.6126195028680689, "grad_norm": 1.942797303199768, "learning_rate": 5e-06, "loss": 0.179, "step": 3204 }, { "epoch": 0.612810707456979, "grad_norm": 1.344512701034546, "learning_rate": 5e-06, "loss": 0.081, "step": 3205 }, { "epoch": 0.613001912045889, "grad_norm": 1.1535552740097046, "learning_rate": 5e-06, "loss": 0.0581, "step": 3206 }, { "epoch": 0.6131931166347993, "grad_norm": 1.867598056793213, "learning_rate": 5e-06, "loss": 0.1903, "step": 3207 }, { "epoch": 0.6133843212237093, "grad_norm": 2.3806843757629395, "learning_rate": 5e-06, "loss": 0.3296, "step": 3208 }, { "epoch": 0.6135755258126195, "grad_norm": 1.6405065059661865, "learning_rate": 5e-06, "loss": 0.161, "step": 3209 }, { "epoch": 0.6137667304015296, "grad_norm": 2.7260189056396484, "learning_rate": 5e-06, "loss": 0.3575, "step": 3210 }, { "epoch": 0.6139579349904398, "grad_norm": 2.125488519668579, "learning_rate": 5e-06, "loss": 0.339, "step": 3211 }, { "epoch": 0.6141491395793499, "grad_norm": 2.283906936645508, "learning_rate": 5e-06, "loss": 0.1835, "step": 3212 }, { "epoch": 0.61434034416826, "grad_norm": 2.345285654067993, "learning_rate": 5e-06, "loss": 0.1712, "step": 3213 }, { "epoch": 0.6145315487571702, "grad_norm": 2.2793378829956055, "learning_rate": 5e-06, "loss": 0.3886, "step": 3214 }, { "epoch": 0.6147227533460803, "grad_norm": 2.4484992027282715, "learning_rate": 5e-06, "loss": 0.4042, "step": 3215 }, { "epoch": 0.6149139579349905, "grad_norm": 2.2618656158447266, "learning_rate": 5e-06, "loss": 0.2397, "step": 3216 }, { "epoch": 0.6151051625239006, "grad_norm": 2.3636066913604736, "learning_rate": 5e-06, "loss": 0.1085, "step": 3217 }, { "epoch": 0.6152963671128107, "grad_norm": 1.7464470863342285, "learning_rate": 5e-06, "loss": 0.0766, "step": 3218 }, { "epoch": 0.6154875717017209, "grad_norm": 3.1975414752960205, "learning_rate": 5e-06, "loss": 0.1565, "step": 3219 }, { "epoch": 0.615678776290631, "grad_norm": 2.530433416366577, "learning_rate": 5e-06, "loss": 0.3814, "step": 3220 }, { "epoch": 0.6158699808795411, "grad_norm": 1.9266382455825806, "learning_rate": 5e-06, "loss": 0.253, "step": 3221 }, { "epoch": 0.6160611854684512, "grad_norm": 2.1497132778167725, "learning_rate": 5e-06, "loss": 0.2549, "step": 3222 }, { "epoch": 0.6162523900573614, "grad_norm": 1.5742179155349731, "learning_rate": 5e-06, "loss": 0.1135, "step": 3223 }, { "epoch": 0.6164435946462715, "grad_norm": 1.3919093608856201, "learning_rate": 5e-06, "loss": 0.0582, "step": 3224 }, { "epoch": 0.6166347992351816, "grad_norm": 2.1460893154144287, "learning_rate": 5e-06, "loss": 0.1063, "step": 3225 }, { "epoch": 0.6168260038240918, "grad_norm": 3.082763671875, "learning_rate": 5e-06, "loss": 0.3345, "step": 3226 }, { "epoch": 0.6170172084130019, "grad_norm": 2.1403768062591553, "learning_rate": 5e-06, "loss": 0.1945, "step": 3227 }, { "epoch": 0.6172084130019121, "grad_norm": 2.1468288898468018, "learning_rate": 5e-06, "loss": 0.1464, "step": 3228 }, { "epoch": 0.6173996175908222, "grad_norm": 1.821694254875183, "learning_rate": 5e-06, "loss": 0.1214, "step": 3229 }, { "epoch": 0.6175908221797323, "grad_norm": 0.9342910051345825, "learning_rate": 5e-06, "loss": 0.0562, "step": 3230 }, { "epoch": 0.6177820267686425, "grad_norm": 1.4590644836425781, "learning_rate": 5e-06, "loss": 0.0798, "step": 3231 }, { "epoch": 0.6179732313575526, "grad_norm": 2.7206311225891113, "learning_rate": 5e-06, "loss": 0.3394, "step": 3232 }, { "epoch": 0.6181644359464628, "grad_norm": 2.2347638607025146, "learning_rate": 5e-06, "loss": 0.152, "step": 3233 }, { "epoch": 0.6183556405353728, "grad_norm": 2.2184712886810303, "learning_rate": 5e-06, "loss": 0.3338, "step": 3234 }, { "epoch": 0.6185468451242829, "grad_norm": 1.4767361879348755, "learning_rate": 5e-06, "loss": 0.1558, "step": 3235 }, { "epoch": 0.6187380497131931, "grad_norm": 1.7414273023605347, "learning_rate": 5e-06, "loss": 0.108, "step": 3236 }, { "epoch": 0.6189292543021032, "grad_norm": 2.4318485260009766, "learning_rate": 5e-06, "loss": 0.148, "step": 3237 }, { "epoch": 0.6191204588910134, "grad_norm": 2.138657808303833, "learning_rate": 5e-06, "loss": 0.2567, "step": 3238 }, { "epoch": 0.6193116634799235, "grad_norm": 1.6914198398590088, "learning_rate": 5e-06, "loss": 0.1906, "step": 3239 }, { "epoch": 0.6195028680688337, "grad_norm": 2.306057929992676, "learning_rate": 5e-06, "loss": 0.1104, "step": 3240 }, { "epoch": 0.6196940726577438, "grad_norm": 2.9438154697418213, "learning_rate": 5e-06, "loss": 0.2681, "step": 3241 }, { "epoch": 0.6198852772466539, "grad_norm": 0.8288871049880981, "learning_rate": 5e-06, "loss": 0.043, "step": 3242 }, { "epoch": 0.6200764818355641, "grad_norm": 2.2429580688476562, "learning_rate": 5e-06, "loss": 0.2368, "step": 3243 }, { "epoch": 0.6202676864244742, "grad_norm": 2.557755708694458, "learning_rate": 5e-06, "loss": 0.3005, "step": 3244 }, { "epoch": 0.6204588910133844, "grad_norm": 1.9078497886657715, "learning_rate": 5e-06, "loss": 0.1692, "step": 3245 }, { "epoch": 0.6206500956022944, "grad_norm": 2.475344657897949, "learning_rate": 5e-06, "loss": 0.2898, "step": 3246 }, { "epoch": 0.6208413001912045, "grad_norm": 2.7360498905181885, "learning_rate": 5e-06, "loss": 0.283, "step": 3247 }, { "epoch": 0.6210325047801147, "grad_norm": 1.3334991931915283, "learning_rate": 5e-06, "loss": 0.1089, "step": 3248 }, { "epoch": 0.6212237093690248, "grad_norm": 1.7204349040985107, "learning_rate": 5e-06, "loss": 0.1098, "step": 3249 }, { "epoch": 0.621414913957935, "grad_norm": 4.411588191986084, "learning_rate": 5e-06, "loss": 0.1599, "step": 3250 }, { "epoch": 0.6216061185468451, "grad_norm": 1.615220308303833, "learning_rate": 5e-06, "loss": 0.128, "step": 3251 }, { "epoch": 0.6217973231357553, "grad_norm": 2.4545962810516357, "learning_rate": 5e-06, "loss": 0.3416, "step": 3252 }, { "epoch": 0.6219885277246654, "grad_norm": 1.9672613143920898, "learning_rate": 5e-06, "loss": 0.1455, "step": 3253 }, { "epoch": 0.6221797323135755, "grad_norm": 2.6472153663635254, "learning_rate": 5e-06, "loss": 0.3045, "step": 3254 }, { "epoch": 0.6223709369024857, "grad_norm": 1.7558777332305908, "learning_rate": 5e-06, "loss": 0.1715, "step": 3255 }, { "epoch": 0.6225621414913958, "grad_norm": 1.1533193588256836, "learning_rate": 5e-06, "loss": 0.0723, "step": 3256 }, { "epoch": 0.622753346080306, "grad_norm": 2.2439403533935547, "learning_rate": 5e-06, "loss": 0.1156, "step": 3257 }, { "epoch": 0.6229445506692161, "grad_norm": 2.317619800567627, "learning_rate": 5e-06, "loss": 0.2825, "step": 3258 }, { "epoch": 0.6231357552581261, "grad_norm": 1.0267956256866455, "learning_rate": 5e-06, "loss": 0.0867, "step": 3259 }, { "epoch": 0.6233269598470363, "grad_norm": 1.7949508428573608, "learning_rate": 5e-06, "loss": 0.1758, "step": 3260 }, { "epoch": 0.6235181644359464, "grad_norm": 0.780097246170044, "learning_rate": 5e-06, "loss": 0.0627, "step": 3261 }, { "epoch": 0.6237093690248566, "grad_norm": 3.0155014991760254, "learning_rate": 5e-06, "loss": 0.125, "step": 3262 }, { "epoch": 0.6239005736137667, "grad_norm": 1.4573692083358765, "learning_rate": 5e-06, "loss": 0.1154, "step": 3263 }, { "epoch": 0.6240917782026769, "grad_norm": 2.20967173576355, "learning_rate": 5e-06, "loss": 0.265, "step": 3264 }, { "epoch": 0.624282982791587, "grad_norm": 2.5910797119140625, "learning_rate": 5e-06, "loss": 0.2157, "step": 3265 }, { "epoch": 0.6244741873804971, "grad_norm": 1.6436420679092407, "learning_rate": 5e-06, "loss": 0.1606, "step": 3266 }, { "epoch": 0.6246653919694073, "grad_norm": 1.099359154701233, "learning_rate": 5e-06, "loss": 0.0784, "step": 3267 }, { "epoch": 0.6248565965583174, "grad_norm": 2.09850811958313, "learning_rate": 5e-06, "loss": 0.1143, "step": 3268 }, { "epoch": 0.6250478011472276, "grad_norm": 2.030501365661621, "learning_rate": 5e-06, "loss": 0.1151, "step": 3269 }, { "epoch": 0.6252390057361377, "grad_norm": 1.77583646774292, "learning_rate": 5e-06, "loss": 0.2257, "step": 3270 }, { "epoch": 0.6254302103250478, "grad_norm": 2.196660280227661, "learning_rate": 5e-06, "loss": 0.2526, "step": 3271 }, { "epoch": 0.625621414913958, "grad_norm": 2.8073387145996094, "learning_rate": 5e-06, "loss": 0.1847, "step": 3272 }, { "epoch": 0.625812619502868, "grad_norm": 1.610040307044983, "learning_rate": 5e-06, "loss": 0.0844, "step": 3273 }, { "epoch": 0.6260038240917782, "grad_norm": 4.642181396484375, "learning_rate": 5e-06, "loss": 0.1936, "step": 3274 }, { "epoch": 0.6261950286806883, "grad_norm": 1.8837900161743164, "learning_rate": 5e-06, "loss": 0.1336, "step": 3275 }, { "epoch": 0.6263862332695985, "grad_norm": 2.7366063594818115, "learning_rate": 5e-06, "loss": 0.3408, "step": 3276 }, { "epoch": 0.6265774378585086, "grad_norm": 2.5217320919036865, "learning_rate": 5e-06, "loss": 0.3369, "step": 3277 }, { "epoch": 0.6267686424474187, "grad_norm": 2.0307939052581787, "learning_rate": 5e-06, "loss": 0.2868, "step": 3278 }, { "epoch": 0.6269598470363289, "grad_norm": 2.022230386734009, "learning_rate": 5e-06, "loss": 0.2864, "step": 3279 }, { "epoch": 0.627151051625239, "grad_norm": 1.771817684173584, "learning_rate": 5e-06, "loss": 0.086, "step": 3280 }, { "epoch": 0.6273422562141492, "grad_norm": 1.1006555557250977, "learning_rate": 5e-06, "loss": 0.036, "step": 3281 }, { "epoch": 0.6275334608030593, "grad_norm": 1.6326366662979126, "learning_rate": 5e-06, "loss": 0.1208, "step": 3282 }, { "epoch": 0.6277246653919694, "grad_norm": 2.166058301925659, "learning_rate": 5e-06, "loss": 0.3915, "step": 3283 }, { "epoch": 0.6279158699808796, "grad_norm": 1.752289056777954, "learning_rate": 5e-06, "loss": 0.1433, "step": 3284 }, { "epoch": 0.6281070745697896, "grad_norm": 1.6301438808441162, "learning_rate": 5e-06, "loss": 0.1176, "step": 3285 }, { "epoch": 0.6282982791586998, "grad_norm": 3.006061553955078, "learning_rate": 5e-06, "loss": 0.0893, "step": 3286 }, { "epoch": 0.6284894837476099, "grad_norm": 1.6843498945236206, "learning_rate": 5e-06, "loss": 0.0768, "step": 3287 }, { "epoch": 0.62868068833652, "grad_norm": 2.755248546600342, "learning_rate": 5e-06, "loss": 0.4565, "step": 3288 }, { "epoch": 0.6288718929254302, "grad_norm": 1.122692584991455, "learning_rate": 5e-06, "loss": 0.1002, "step": 3289 }, { "epoch": 0.6290630975143403, "grad_norm": 1.585771918296814, "learning_rate": 5e-06, "loss": 0.1586, "step": 3290 }, { "epoch": 0.6292543021032505, "grad_norm": 1.6943069696426392, "learning_rate": 5e-06, "loss": 0.0915, "step": 3291 }, { "epoch": 0.6294455066921606, "grad_norm": 1.3940718173980713, "learning_rate": 5e-06, "loss": 0.101, "step": 3292 }, { "epoch": 0.6296367112810708, "grad_norm": 1.1676661968231201, "learning_rate": 5e-06, "loss": 0.0367, "step": 3293 }, { "epoch": 0.6298279158699809, "grad_norm": 2.5931499004364014, "learning_rate": 5e-06, "loss": 0.2025, "step": 3294 }, { "epoch": 0.630019120458891, "grad_norm": 1.825156331062317, "learning_rate": 5e-06, "loss": 0.1334, "step": 3295 }, { "epoch": 0.6302103250478012, "grad_norm": 0.9374414086341858, "learning_rate": 5e-06, "loss": 0.0959, "step": 3296 }, { "epoch": 0.6304015296367113, "grad_norm": 1.9733580350875854, "learning_rate": 5e-06, "loss": 0.1064, "step": 3297 }, { "epoch": 0.6305927342256215, "grad_norm": 1.1101990938186646, "learning_rate": 5e-06, "loss": 0.0681, "step": 3298 }, { "epoch": 0.6307839388145315, "grad_norm": 1.4817825555801392, "learning_rate": 5e-06, "loss": 0.0942, "step": 3299 }, { "epoch": 0.6309751434034416, "grad_norm": 1.6432628631591797, "learning_rate": 5e-06, "loss": 0.0721, "step": 3300 }, { "epoch": 0.6311663479923518, "grad_norm": 3.0500617027282715, "learning_rate": 5e-06, "loss": 0.4934, "step": 3301 }, { "epoch": 0.6313575525812619, "grad_norm": 1.2397555112838745, "learning_rate": 5e-06, "loss": 0.1243, "step": 3302 }, { "epoch": 0.6315487571701721, "grad_norm": 1.4868794679641724, "learning_rate": 5e-06, "loss": 0.251, "step": 3303 }, { "epoch": 0.6317399617590822, "grad_norm": 1.124295711517334, "learning_rate": 5e-06, "loss": 0.1263, "step": 3304 }, { "epoch": 0.6319311663479924, "grad_norm": 1.9603227376937866, "learning_rate": 5e-06, "loss": 0.2923, "step": 3305 }, { "epoch": 0.6321223709369025, "grad_norm": 1.851251482963562, "learning_rate": 5e-06, "loss": 0.1242, "step": 3306 }, { "epoch": 0.6323135755258126, "grad_norm": 2.5106050968170166, "learning_rate": 5e-06, "loss": 0.2472, "step": 3307 }, { "epoch": 0.6325047801147228, "grad_norm": 1.8182765245437622, "learning_rate": 5e-06, "loss": 0.221, "step": 3308 }, { "epoch": 0.6326959847036329, "grad_norm": 1.9880139827728271, "learning_rate": 5e-06, "loss": 0.075, "step": 3309 }, { "epoch": 0.6328871892925431, "grad_norm": 1.8051013946533203, "learning_rate": 5e-06, "loss": 0.1106, "step": 3310 }, { "epoch": 0.6330783938814531, "grad_norm": 1.6434516906738281, "learning_rate": 5e-06, "loss": 0.1318, "step": 3311 }, { "epoch": 0.6332695984703632, "grad_norm": 1.805076241493225, "learning_rate": 5e-06, "loss": 0.0783, "step": 3312 }, { "epoch": 0.6334608030592734, "grad_norm": 2.02512788772583, "learning_rate": 5e-06, "loss": 0.1746, "step": 3313 }, { "epoch": 0.6336520076481835, "grad_norm": 1.037001132965088, "learning_rate": 5e-06, "loss": 0.0982, "step": 3314 }, { "epoch": 0.6338432122370937, "grad_norm": 1.426772117614746, "learning_rate": 5e-06, "loss": 0.1482, "step": 3315 }, { "epoch": 0.6340344168260038, "grad_norm": 1.1725845336914062, "learning_rate": 5e-06, "loss": 0.0841, "step": 3316 }, { "epoch": 0.634225621414914, "grad_norm": 1.404199242591858, "learning_rate": 5e-06, "loss": 0.0808, "step": 3317 }, { "epoch": 0.6344168260038241, "grad_norm": 1.5793739557266235, "learning_rate": 5e-06, "loss": 0.1145, "step": 3318 }, { "epoch": 0.6346080305927342, "grad_norm": 2.362203359603882, "learning_rate": 5e-06, "loss": 0.1639, "step": 3319 }, { "epoch": 0.6347992351816444, "grad_norm": 2.5650269985198975, "learning_rate": 5e-06, "loss": 0.3538, "step": 3320 }, { "epoch": 0.6349904397705545, "grad_norm": 1.7669709920883179, "learning_rate": 5e-06, "loss": 0.1892, "step": 3321 }, { "epoch": 0.6351816443594647, "grad_norm": 3.3054776191711426, "learning_rate": 5e-06, "loss": 0.5275, "step": 3322 }, { "epoch": 0.6353728489483748, "grad_norm": 1.7212293148040771, "learning_rate": 5e-06, "loss": 0.1369, "step": 3323 }, { "epoch": 0.6355640535372848, "grad_norm": 2.0429680347442627, "learning_rate": 5e-06, "loss": 0.0866, "step": 3324 }, { "epoch": 0.635755258126195, "grad_norm": 1.428225040435791, "learning_rate": 5e-06, "loss": 0.0705, "step": 3325 }, { "epoch": 0.6359464627151051, "grad_norm": 2.1418018341064453, "learning_rate": 5e-06, "loss": 0.3357, "step": 3326 }, { "epoch": 0.6361376673040153, "grad_norm": 1.53099524974823, "learning_rate": 5e-06, "loss": 0.1017, "step": 3327 }, { "epoch": 0.6363288718929254, "grad_norm": 1.5691248178482056, "learning_rate": 5e-06, "loss": 0.1724, "step": 3328 }, { "epoch": 0.6365200764818356, "grad_norm": 2.0633819103240967, "learning_rate": 5e-06, "loss": 0.2149, "step": 3329 }, { "epoch": 0.6367112810707457, "grad_norm": 2.932884693145752, "learning_rate": 5e-06, "loss": 0.3623, "step": 3330 }, { "epoch": 0.6369024856596558, "grad_norm": 1.8070765733718872, "learning_rate": 5e-06, "loss": 0.097, "step": 3331 }, { "epoch": 0.637093690248566, "grad_norm": 2.7510499954223633, "learning_rate": 5e-06, "loss": 0.3189, "step": 3332 }, { "epoch": 0.6372848948374761, "grad_norm": 2.5223939418792725, "learning_rate": 5e-06, "loss": 0.4771, "step": 3333 }, { "epoch": 0.6374760994263863, "grad_norm": 2.9238736629486084, "learning_rate": 5e-06, "loss": 0.2895, "step": 3334 }, { "epoch": 0.6376673040152964, "grad_norm": 0.8285096883773804, "learning_rate": 5e-06, "loss": 0.0522, "step": 3335 }, { "epoch": 0.6378585086042065, "grad_norm": 1.8238945007324219, "learning_rate": 5e-06, "loss": 0.1194, "step": 3336 }, { "epoch": 0.6380497131931167, "grad_norm": 2.4051716327667236, "learning_rate": 5e-06, "loss": 0.1254, "step": 3337 }, { "epoch": 0.6382409177820267, "grad_norm": 3.326117515563965, "learning_rate": 5e-06, "loss": 0.5696, "step": 3338 }, { "epoch": 0.6384321223709369, "grad_norm": 2.6902754306793213, "learning_rate": 5e-06, "loss": 0.211, "step": 3339 }, { "epoch": 0.638623326959847, "grad_norm": 2.737630605697632, "learning_rate": 5e-06, "loss": 0.3434, "step": 3340 }, { "epoch": 0.6388145315487572, "grad_norm": 1.2848268747329712, "learning_rate": 5e-06, "loss": 0.0847, "step": 3341 }, { "epoch": 0.6390057361376673, "grad_norm": 2.3309013843536377, "learning_rate": 5e-06, "loss": 0.1437, "step": 3342 }, { "epoch": 0.6391969407265774, "grad_norm": 2.1647744178771973, "learning_rate": 5e-06, "loss": 0.0887, "step": 3343 }, { "epoch": 0.6393881453154876, "grad_norm": 1.908402681350708, "learning_rate": 5e-06, "loss": 0.1112, "step": 3344 }, { "epoch": 0.6395793499043977, "grad_norm": 1.631221055984497, "learning_rate": 5e-06, "loss": 0.1566, "step": 3345 }, { "epoch": 0.6397705544933079, "grad_norm": 3.7973790168762207, "learning_rate": 5e-06, "loss": 0.5618, "step": 3346 }, { "epoch": 0.639961759082218, "grad_norm": 1.2875862121582031, "learning_rate": 5e-06, "loss": 0.0914, "step": 3347 }, { "epoch": 0.6401529636711281, "grad_norm": 1.294562578201294, "learning_rate": 5e-06, "loss": 0.0698, "step": 3348 }, { "epoch": 0.6403441682600383, "grad_norm": 2.602620840072632, "learning_rate": 5e-06, "loss": 0.1843, "step": 3349 }, { "epoch": 0.6405353728489483, "grad_norm": 2.0691466331481934, "learning_rate": 5e-06, "loss": 0.1112, "step": 3350 }, { "epoch": 0.6407265774378585, "grad_norm": 1.8351796865463257, "learning_rate": 5e-06, "loss": 0.2551, "step": 3351 }, { "epoch": 0.6409177820267686, "grad_norm": 2.922478199005127, "learning_rate": 5e-06, "loss": 0.3711, "step": 3352 }, { "epoch": 0.6411089866156787, "grad_norm": 1.2165088653564453, "learning_rate": 5e-06, "loss": 0.0966, "step": 3353 }, { "epoch": 0.6413001912045889, "grad_norm": 1.7305443286895752, "learning_rate": 5e-06, "loss": 0.0971, "step": 3354 }, { "epoch": 0.641491395793499, "grad_norm": 2.2913918495178223, "learning_rate": 5e-06, "loss": 0.0785, "step": 3355 }, { "epoch": 0.6416826003824092, "grad_norm": 2.2279272079467773, "learning_rate": 5e-06, "loss": 0.143, "step": 3356 }, { "epoch": 0.6418738049713193, "grad_norm": 2.924514055252075, "learning_rate": 5e-06, "loss": 0.4447, "step": 3357 }, { "epoch": 0.6420650095602295, "grad_norm": 2.028049945831299, "learning_rate": 5e-06, "loss": 0.2733, "step": 3358 }, { "epoch": 0.6422562141491396, "grad_norm": 1.2572370767593384, "learning_rate": 5e-06, "loss": 0.1236, "step": 3359 }, { "epoch": 0.6424474187380497, "grad_norm": 1.216548204421997, "learning_rate": 5e-06, "loss": 0.0975, "step": 3360 }, { "epoch": 0.6426386233269599, "grad_norm": 1.6940453052520752, "learning_rate": 5e-06, "loss": 0.0758, "step": 3361 }, { "epoch": 0.64282982791587, "grad_norm": 1.7871954441070557, "learning_rate": 5e-06, "loss": 0.0795, "step": 3362 }, { "epoch": 0.6430210325047802, "grad_norm": 1.6503570079803467, "learning_rate": 5e-06, "loss": 0.1087, "step": 3363 }, { "epoch": 0.6432122370936902, "grad_norm": 2.5519793033599854, "learning_rate": 5e-06, "loss": 0.4499, "step": 3364 }, { "epoch": 0.6434034416826003, "grad_norm": 1.5863988399505615, "learning_rate": 5e-06, "loss": 0.085, "step": 3365 }, { "epoch": 0.6435946462715105, "grad_norm": 0.7910031080245972, "learning_rate": 5e-06, "loss": 0.068, "step": 3366 }, { "epoch": 0.6437858508604206, "grad_norm": 1.732580304145813, "learning_rate": 5e-06, "loss": 0.1334, "step": 3367 }, { "epoch": 0.6439770554493308, "grad_norm": 2.4041748046875, "learning_rate": 5e-06, "loss": 0.3406, "step": 3368 }, { "epoch": 0.6441682600382409, "grad_norm": 1.4867603778839111, "learning_rate": 5e-06, "loss": 0.0677, "step": 3369 }, { "epoch": 0.6443594646271511, "grad_norm": 1.6989647150039673, "learning_rate": 5e-06, "loss": 0.1288, "step": 3370 }, { "epoch": 0.6445506692160612, "grad_norm": 1.4293967485427856, "learning_rate": 5e-06, "loss": 0.1531, "step": 3371 }, { "epoch": 0.6447418738049713, "grad_norm": 1.3375941514968872, "learning_rate": 5e-06, "loss": 0.1033, "step": 3372 }, { "epoch": 0.6449330783938815, "grad_norm": 0.9712766408920288, "learning_rate": 5e-06, "loss": 0.0382, "step": 3373 }, { "epoch": 0.6451242829827916, "grad_norm": 2.1650278568267822, "learning_rate": 5e-06, "loss": 0.2237, "step": 3374 }, { "epoch": 0.6453154875717018, "grad_norm": 1.0781400203704834, "learning_rate": 5e-06, "loss": 0.0441, "step": 3375 }, { "epoch": 0.6455066921606119, "grad_norm": 2.823831081390381, "learning_rate": 5e-06, "loss": 0.3515, "step": 3376 }, { "epoch": 0.6456978967495219, "grad_norm": 1.8595666885375977, "learning_rate": 5e-06, "loss": 0.3496, "step": 3377 }, { "epoch": 0.6458891013384321, "grad_norm": 2.762394905090332, "learning_rate": 5e-06, "loss": 0.4007, "step": 3378 }, { "epoch": 0.6460803059273422, "grad_norm": 1.8678456544876099, "learning_rate": 5e-06, "loss": 0.2212, "step": 3379 }, { "epoch": 0.6462715105162524, "grad_norm": 1.9255629777908325, "learning_rate": 5e-06, "loss": 0.2845, "step": 3380 }, { "epoch": 0.6464627151051625, "grad_norm": 2.22768497467041, "learning_rate": 5e-06, "loss": 0.0916, "step": 3381 }, { "epoch": 0.6466539196940727, "grad_norm": 2.3426296710968018, "learning_rate": 5e-06, "loss": 0.4442, "step": 3382 }, { "epoch": 0.6468451242829828, "grad_norm": 2.8555681705474854, "learning_rate": 5e-06, "loss": 0.1754, "step": 3383 }, { "epoch": 0.6470363288718929, "grad_norm": 2.99185848236084, "learning_rate": 5e-06, "loss": 0.2644, "step": 3384 }, { "epoch": 0.6472275334608031, "grad_norm": 2.5336363315582275, "learning_rate": 5e-06, "loss": 0.4541, "step": 3385 }, { "epoch": 0.6474187380497132, "grad_norm": 2.059492826461792, "learning_rate": 5e-06, "loss": 0.2929, "step": 3386 }, { "epoch": 0.6476099426386234, "grad_norm": 1.113668441772461, "learning_rate": 5e-06, "loss": 0.1024, "step": 3387 }, { "epoch": 0.6478011472275335, "grad_norm": 2.365506649017334, "learning_rate": 5e-06, "loss": 0.3054, "step": 3388 }, { "epoch": 0.6479923518164435, "grad_norm": 2.7506308555603027, "learning_rate": 5e-06, "loss": 0.4685, "step": 3389 }, { "epoch": 0.6481835564053537, "grad_norm": 1.8486632108688354, "learning_rate": 5e-06, "loss": 0.1623, "step": 3390 }, { "epoch": 0.6483747609942638, "grad_norm": 3.001056432723999, "learning_rate": 5e-06, "loss": 0.1733, "step": 3391 }, { "epoch": 0.648565965583174, "grad_norm": 1.8992429971694946, "learning_rate": 5e-06, "loss": 0.1681, "step": 3392 }, { "epoch": 0.6487571701720841, "grad_norm": 0.8879466652870178, "learning_rate": 5e-06, "loss": 0.0552, "step": 3393 }, { "epoch": 0.6489483747609943, "grad_norm": 1.9001550674438477, "learning_rate": 5e-06, "loss": 0.1817, "step": 3394 }, { "epoch": 0.6491395793499044, "grad_norm": 2.546579122543335, "learning_rate": 5e-06, "loss": 0.2765, "step": 3395 }, { "epoch": 0.6493307839388145, "grad_norm": 1.5954774618148804, "learning_rate": 5e-06, "loss": 0.3091, "step": 3396 }, { "epoch": 0.6495219885277247, "grad_norm": 2.02475905418396, "learning_rate": 5e-06, "loss": 0.2654, "step": 3397 }, { "epoch": 0.6497131931166348, "grad_norm": 2.6336565017700195, "learning_rate": 5e-06, "loss": 0.1651, "step": 3398 }, { "epoch": 0.649904397705545, "grad_norm": 2.454435348510742, "learning_rate": 5e-06, "loss": 0.0972, "step": 3399 }, { "epoch": 0.6500956022944551, "grad_norm": 1.3472257852554321, "learning_rate": 5e-06, "loss": 0.0623, "step": 3400 }, { "epoch": 0.6502868068833652, "grad_norm": 2.588073492050171, "learning_rate": 5e-06, "loss": 0.3797, "step": 3401 }, { "epoch": 0.6504780114722754, "grad_norm": 1.4676856994628906, "learning_rate": 5e-06, "loss": 0.1556, "step": 3402 }, { "epoch": 0.6506692160611854, "grad_norm": 1.6890233755111694, "learning_rate": 5e-06, "loss": 0.1048, "step": 3403 }, { "epoch": 0.6508604206500956, "grad_norm": 2.6827495098114014, "learning_rate": 5e-06, "loss": 0.223, "step": 3404 }, { "epoch": 0.6510516252390057, "grad_norm": 2.6940207481384277, "learning_rate": 5e-06, "loss": 0.1186, "step": 3405 }, { "epoch": 0.6512428298279158, "grad_norm": 2.2096269130706787, "learning_rate": 5e-06, "loss": 0.1084, "step": 3406 }, { "epoch": 0.651434034416826, "grad_norm": 1.9896823167800903, "learning_rate": 5e-06, "loss": 0.3026, "step": 3407 }, { "epoch": 0.6516252390057361, "grad_norm": 2.7920219898223877, "learning_rate": 5e-06, "loss": 0.3761, "step": 3408 }, { "epoch": 0.6518164435946463, "grad_norm": 1.8184442520141602, "learning_rate": 5e-06, "loss": 0.126, "step": 3409 }, { "epoch": 0.6520076481835564, "grad_norm": 1.1374013423919678, "learning_rate": 5e-06, "loss": 0.0662, "step": 3410 }, { "epoch": 0.6521988527724666, "grad_norm": 2.6526119709014893, "learning_rate": 5e-06, "loss": 0.3993, "step": 3411 }, { "epoch": 0.6523900573613767, "grad_norm": 2.8433001041412354, "learning_rate": 5e-06, "loss": 0.1707, "step": 3412 }, { "epoch": 0.6525812619502868, "grad_norm": 0.9985430836677551, "learning_rate": 5e-06, "loss": 0.08, "step": 3413 }, { "epoch": 0.652772466539197, "grad_norm": 2.4228732585906982, "learning_rate": 5e-06, "loss": 0.3744, "step": 3414 }, { "epoch": 0.652963671128107, "grad_norm": 1.5509470701217651, "learning_rate": 5e-06, "loss": 0.2037, "step": 3415 }, { "epoch": 0.6531548757170172, "grad_norm": 1.4869673252105713, "learning_rate": 5e-06, "loss": 0.093, "step": 3416 }, { "epoch": 0.6533460803059273, "grad_norm": 0.611136257648468, "learning_rate": 5e-06, "loss": 0.0593, "step": 3417 }, { "epoch": 0.6535372848948374, "grad_norm": 1.8549314737319946, "learning_rate": 5e-06, "loss": 0.0853, "step": 3418 }, { "epoch": 0.6537284894837476, "grad_norm": 1.9247181415557861, "learning_rate": 5e-06, "loss": 0.2044, "step": 3419 }, { "epoch": 0.6539196940726577, "grad_norm": 2.686331033706665, "learning_rate": 5e-06, "loss": 0.18, "step": 3420 }, { "epoch": 0.6541108986615679, "grad_norm": 1.9724444150924683, "learning_rate": 5e-06, "loss": 0.2007, "step": 3421 }, { "epoch": 0.654302103250478, "grad_norm": 1.914251685142517, "learning_rate": 5e-06, "loss": 0.2638, "step": 3422 }, { "epoch": 0.6544933078393882, "grad_norm": 1.8022854328155518, "learning_rate": 5e-06, "loss": 0.1179, "step": 3423 }, { "epoch": 0.6546845124282983, "grad_norm": 2.4049770832061768, "learning_rate": 5e-06, "loss": 0.1338, "step": 3424 }, { "epoch": 0.6548757170172084, "grad_norm": 1.664724349975586, "learning_rate": 5e-06, "loss": 0.0854, "step": 3425 }, { "epoch": 0.6550669216061186, "grad_norm": 2.4440340995788574, "learning_rate": 5e-06, "loss": 0.2504, "step": 3426 }, { "epoch": 0.6552581261950287, "grad_norm": 1.7133796215057373, "learning_rate": 5e-06, "loss": 0.148, "step": 3427 }, { "epoch": 0.6554493307839389, "grad_norm": 2.668142080307007, "learning_rate": 5e-06, "loss": 0.2902, "step": 3428 }, { "epoch": 0.6556405353728489, "grad_norm": 2.5641353130340576, "learning_rate": 5e-06, "loss": 0.3005, "step": 3429 }, { "epoch": 0.655831739961759, "grad_norm": 2.3122782707214355, "learning_rate": 5e-06, "loss": 0.1071, "step": 3430 }, { "epoch": 0.6560229445506692, "grad_norm": 1.7354764938354492, "learning_rate": 5e-06, "loss": 0.0937, "step": 3431 }, { "epoch": 0.6562141491395793, "grad_norm": 2.0829017162323, "learning_rate": 5e-06, "loss": 0.3719, "step": 3432 }, { "epoch": 0.6564053537284895, "grad_norm": 1.4630341529846191, "learning_rate": 5e-06, "loss": 0.1413, "step": 3433 }, { "epoch": 0.6565965583173996, "grad_norm": 1.0866786241531372, "learning_rate": 5e-06, "loss": 0.0894, "step": 3434 }, { "epoch": 0.6567877629063098, "grad_norm": 2.1426541805267334, "learning_rate": 5e-06, "loss": 0.4141, "step": 3435 }, { "epoch": 0.6569789674952199, "grad_norm": 3.4362006187438965, "learning_rate": 5e-06, "loss": 0.2099, "step": 3436 }, { "epoch": 0.65717017208413, "grad_norm": 2.1101267337799072, "learning_rate": 5e-06, "loss": 0.1208, "step": 3437 }, { "epoch": 0.6573613766730402, "grad_norm": 1.7151868343353271, "learning_rate": 5e-06, "loss": 0.1146, "step": 3438 }, { "epoch": 0.6575525812619503, "grad_norm": 1.1048760414123535, "learning_rate": 5e-06, "loss": 0.1355, "step": 3439 }, { "epoch": 0.6577437858508605, "grad_norm": 2.037752628326416, "learning_rate": 5e-06, "loss": 0.2758, "step": 3440 }, { "epoch": 0.6579349904397706, "grad_norm": 1.7046453952789307, "learning_rate": 5e-06, "loss": 0.1341, "step": 3441 }, { "epoch": 0.6581261950286806, "grad_norm": 1.204085350036621, "learning_rate": 5e-06, "loss": 0.0877, "step": 3442 }, { "epoch": 0.6583173996175908, "grad_norm": 1.8280205726623535, "learning_rate": 5e-06, "loss": 0.1489, "step": 3443 }, { "epoch": 0.6585086042065009, "grad_norm": 1.589545726776123, "learning_rate": 5e-06, "loss": 0.0657, "step": 3444 }, { "epoch": 0.6586998087954111, "grad_norm": 2.3866069316864014, "learning_rate": 5e-06, "loss": 0.2623, "step": 3445 }, { "epoch": 0.6588910133843212, "grad_norm": 1.6342370510101318, "learning_rate": 5e-06, "loss": 0.336, "step": 3446 }, { "epoch": 0.6590822179732314, "grad_norm": 1.841517448425293, "learning_rate": 5e-06, "loss": 0.1794, "step": 3447 }, { "epoch": 0.6592734225621415, "grad_norm": 1.5720770359039307, "learning_rate": 5e-06, "loss": 0.0911, "step": 3448 }, { "epoch": 0.6594646271510516, "grad_norm": 3.1381449699401855, "learning_rate": 5e-06, "loss": 0.1462, "step": 3449 }, { "epoch": 0.6596558317399618, "grad_norm": 2.0292165279388428, "learning_rate": 5e-06, "loss": 0.1095, "step": 3450 }, { "epoch": 0.6598470363288719, "grad_norm": 2.1982815265655518, "learning_rate": 5e-06, "loss": 0.2026, "step": 3451 }, { "epoch": 0.6600382409177821, "grad_norm": 3.106961727142334, "learning_rate": 5e-06, "loss": 0.0886, "step": 3452 }, { "epoch": 0.6602294455066922, "grad_norm": 2.6791203022003174, "learning_rate": 5e-06, "loss": 0.1841, "step": 3453 }, { "epoch": 0.6604206500956022, "grad_norm": 0.9955388903617859, "learning_rate": 5e-06, "loss": 0.0829, "step": 3454 }, { "epoch": 0.6606118546845124, "grad_norm": 1.959532618522644, "learning_rate": 5e-06, "loss": 0.1379, "step": 3455 }, { "epoch": 0.6608030592734225, "grad_norm": 1.4918891191482544, "learning_rate": 5e-06, "loss": 0.0437, "step": 3456 }, { "epoch": 0.6609942638623327, "grad_norm": 2.387288808822632, "learning_rate": 5e-06, "loss": 0.2389, "step": 3457 }, { "epoch": 0.6611854684512428, "grad_norm": 2.7315800189971924, "learning_rate": 5e-06, "loss": 0.4122, "step": 3458 }, { "epoch": 0.661376673040153, "grad_norm": 3.008780002593994, "learning_rate": 5e-06, "loss": 0.5461, "step": 3459 }, { "epoch": 0.6615678776290631, "grad_norm": 1.4275813102722168, "learning_rate": 5e-06, "loss": 0.0968, "step": 3460 }, { "epoch": 0.6617590822179732, "grad_norm": 2.498218297958374, "learning_rate": 5e-06, "loss": 0.176, "step": 3461 }, { "epoch": 0.6619502868068834, "grad_norm": 2.3589062690734863, "learning_rate": 5e-06, "loss": 0.1207, "step": 3462 }, { "epoch": 0.6621414913957935, "grad_norm": 1.8128048181533813, "learning_rate": 5e-06, "loss": 0.1377, "step": 3463 }, { "epoch": 0.6623326959847037, "grad_norm": 2.6147453784942627, "learning_rate": 5e-06, "loss": 0.4276, "step": 3464 }, { "epoch": 0.6625239005736138, "grad_norm": 1.5070366859436035, "learning_rate": 5e-06, "loss": 0.1426, "step": 3465 }, { "epoch": 0.6627151051625239, "grad_norm": 1.6204683780670166, "learning_rate": 5e-06, "loss": 0.0838, "step": 3466 }, { "epoch": 0.662906309751434, "grad_norm": 0.9713101983070374, "learning_rate": 5e-06, "loss": 0.1332, "step": 3467 }, { "epoch": 0.6630975143403441, "grad_norm": 2.181532621383667, "learning_rate": 5e-06, "loss": 0.1285, "step": 3468 }, { "epoch": 0.6632887189292543, "grad_norm": 2.44734787940979, "learning_rate": 5e-06, "loss": 0.1991, "step": 3469 }, { "epoch": 0.6634799235181644, "grad_norm": 3.1234562397003174, "learning_rate": 5e-06, "loss": 0.6639, "step": 3470 }, { "epoch": 0.6636711281070745, "grad_norm": 1.2073078155517578, "learning_rate": 5e-06, "loss": 0.123, "step": 3471 }, { "epoch": 0.6638623326959847, "grad_norm": 2.172306537628174, "learning_rate": 5e-06, "loss": 0.2327, "step": 3472 }, { "epoch": 0.6640535372848948, "grad_norm": 1.4012967348098755, "learning_rate": 5e-06, "loss": 0.1017, "step": 3473 }, { "epoch": 0.664244741873805, "grad_norm": 1.9511357545852661, "learning_rate": 5e-06, "loss": 0.1336, "step": 3474 }, { "epoch": 0.6644359464627151, "grad_norm": 4.60706090927124, "learning_rate": 5e-06, "loss": 0.1754, "step": 3475 }, { "epoch": 0.6646271510516253, "grad_norm": 2.3865597248077393, "learning_rate": 5e-06, "loss": 0.3496, "step": 3476 }, { "epoch": 0.6648183556405354, "grad_norm": 2.6536009311676025, "learning_rate": 5e-06, "loss": 0.2316, "step": 3477 }, { "epoch": 0.6650095602294455, "grad_norm": 1.3826388120651245, "learning_rate": 5e-06, "loss": 0.159, "step": 3478 }, { "epoch": 0.6652007648183557, "grad_norm": 1.5564355850219727, "learning_rate": 5e-06, "loss": 0.1318, "step": 3479 }, { "epoch": 0.6653919694072657, "grad_norm": 1.9521403312683105, "learning_rate": 5e-06, "loss": 0.1619, "step": 3480 }, { "epoch": 0.665583173996176, "grad_norm": 1.7389386892318726, "learning_rate": 5e-06, "loss": 0.0573, "step": 3481 }, { "epoch": 0.665774378585086, "grad_norm": 1.4144505262374878, "learning_rate": 5e-06, "loss": 0.245, "step": 3482 }, { "epoch": 0.6659655831739961, "grad_norm": 1.4570914506912231, "learning_rate": 5e-06, "loss": 0.1495, "step": 3483 }, { "epoch": 0.6661567877629063, "grad_norm": 1.8888745307922363, "learning_rate": 5e-06, "loss": 0.2384, "step": 3484 }, { "epoch": 0.6663479923518164, "grad_norm": 2.382516860961914, "learning_rate": 5e-06, "loss": 0.2814, "step": 3485 }, { "epoch": 0.6665391969407266, "grad_norm": 1.7876882553100586, "learning_rate": 5e-06, "loss": 0.1326, "step": 3486 }, { "epoch": 0.6667304015296367, "grad_norm": 1.878159999847412, "learning_rate": 5e-06, "loss": 0.1178, "step": 3487 }, { "epoch": 0.6669216061185469, "grad_norm": 2.022584915161133, "learning_rate": 5e-06, "loss": 0.1486, "step": 3488 }, { "epoch": 0.667112810707457, "grad_norm": 2.123523473739624, "learning_rate": 5e-06, "loss": 0.1952, "step": 3489 }, { "epoch": 0.6673040152963671, "grad_norm": 2.8303143978118896, "learning_rate": 5e-06, "loss": 0.3464, "step": 3490 }, { "epoch": 0.6674952198852773, "grad_norm": 1.546257495880127, "learning_rate": 5e-06, "loss": 0.1826, "step": 3491 }, { "epoch": 0.6676864244741874, "grad_norm": 1.338302731513977, "learning_rate": 5e-06, "loss": 0.109, "step": 3492 }, { "epoch": 0.6678776290630976, "grad_norm": 1.4451181888580322, "learning_rate": 5e-06, "loss": 0.1015, "step": 3493 }, { "epoch": 0.6680688336520076, "grad_norm": 1.242767095565796, "learning_rate": 5e-06, "loss": 0.0837, "step": 3494 }, { "epoch": 0.6682600382409177, "grad_norm": 2.2983875274658203, "learning_rate": 5e-06, "loss": 0.3422, "step": 3495 }, { "epoch": 0.6684512428298279, "grad_norm": 2.008904218673706, "learning_rate": 5e-06, "loss": 0.2969, "step": 3496 }, { "epoch": 0.668642447418738, "grad_norm": 2.3049376010894775, "learning_rate": 5e-06, "loss": 0.3266, "step": 3497 }, { "epoch": 0.6688336520076482, "grad_norm": 2.297036647796631, "learning_rate": 5e-06, "loss": 0.2258, "step": 3498 }, { "epoch": 0.6690248565965583, "grad_norm": 1.6770265102386475, "learning_rate": 5e-06, "loss": 0.0976, "step": 3499 }, { "epoch": 0.6692160611854685, "grad_norm": 0.9409367442131042, "learning_rate": 5e-06, "loss": 0.034, "step": 3500 }, { "epoch": 0.6692160611854685, "eval_runtime": 786.6312, "eval_samples_per_second": 1.95, "eval_steps_per_second": 0.244, "step": 3500 }, { "epoch": 0.6694072657743786, "grad_norm": 1.6677579879760742, "learning_rate": 5e-06, "loss": 0.1108, "step": 3501 }, { "epoch": 0.6695984703632887, "grad_norm": 1.5019170045852661, "learning_rate": 5e-06, "loss": 0.2266, "step": 3502 }, { "epoch": 0.6697896749521989, "grad_norm": 2.645740509033203, "learning_rate": 5e-06, "loss": 0.3297, "step": 3503 }, { "epoch": 0.669980879541109, "grad_norm": 2.4888534545898438, "learning_rate": 5e-06, "loss": 0.2745, "step": 3504 }, { "epoch": 0.6701720841300192, "grad_norm": 1.4102755784988403, "learning_rate": 5e-06, "loss": 0.0735, "step": 3505 }, { "epoch": 0.6703632887189293, "grad_norm": 11.278008460998535, "learning_rate": 5e-06, "loss": 0.1773, "step": 3506 }, { "epoch": 0.6705544933078393, "grad_norm": 2.274052381515503, "learning_rate": 5e-06, "loss": 0.1174, "step": 3507 }, { "epoch": 0.6707456978967495, "grad_norm": 1.99003005027771, "learning_rate": 5e-06, "loss": 0.268, "step": 3508 }, { "epoch": 0.6709369024856596, "grad_norm": 1.6028283834457397, "learning_rate": 5e-06, "loss": 0.1354, "step": 3509 }, { "epoch": 0.6711281070745698, "grad_norm": 2.1967668533325195, "learning_rate": 5e-06, "loss": 0.1541, "step": 3510 }, { "epoch": 0.6713193116634799, "grad_norm": 2.1785519123077393, "learning_rate": 5e-06, "loss": 0.1173, "step": 3511 }, { "epoch": 0.6715105162523901, "grad_norm": 1.8150266408920288, "learning_rate": 5e-06, "loss": 0.0938, "step": 3512 }, { "epoch": 0.6717017208413002, "grad_norm": 1.4618399143218994, "learning_rate": 5e-06, "loss": 0.1253, "step": 3513 }, { "epoch": 0.6718929254302103, "grad_norm": 3.069847822189331, "learning_rate": 5e-06, "loss": 0.5254, "step": 3514 }, { "epoch": 0.6720841300191205, "grad_norm": 1.6214332580566406, "learning_rate": 5e-06, "loss": 0.1171, "step": 3515 }, { "epoch": 0.6722753346080306, "grad_norm": 2.7934930324554443, "learning_rate": 5e-06, "loss": 0.2811, "step": 3516 }, { "epoch": 0.6724665391969408, "grad_norm": 1.2103495597839355, "learning_rate": 5e-06, "loss": 0.075, "step": 3517 }, { "epoch": 0.6726577437858509, "grad_norm": 1.6577422618865967, "learning_rate": 5e-06, "loss": 0.1344, "step": 3518 }, { "epoch": 0.672848948374761, "grad_norm": 1.5630927085876465, "learning_rate": 5e-06, "loss": 0.1212, "step": 3519 }, { "epoch": 0.6730401529636711, "grad_norm": 3.0157723426818848, "learning_rate": 5e-06, "loss": 0.5516, "step": 3520 }, { "epoch": 0.6732313575525812, "grad_norm": 2.152554988861084, "learning_rate": 5e-06, "loss": 0.3016, "step": 3521 }, { "epoch": 0.6734225621414914, "grad_norm": 2.146681547164917, "learning_rate": 5e-06, "loss": 0.4707, "step": 3522 }, { "epoch": 0.6736137667304015, "grad_norm": 2.0331432819366455, "learning_rate": 5e-06, "loss": 0.1046, "step": 3523 }, { "epoch": 0.6738049713193117, "grad_norm": 1.3789559602737427, "learning_rate": 5e-06, "loss": 0.076, "step": 3524 }, { "epoch": 0.6739961759082218, "grad_norm": 0.8906503319740295, "learning_rate": 5e-06, "loss": 0.0667, "step": 3525 }, { "epoch": 0.6741873804971319, "grad_norm": 2.7111520767211914, "learning_rate": 5e-06, "loss": 0.3719, "step": 3526 }, { "epoch": 0.6743785850860421, "grad_norm": 2.096019744873047, "learning_rate": 5e-06, "loss": 0.3089, "step": 3527 }, { "epoch": 0.6745697896749522, "grad_norm": 1.231835961341858, "learning_rate": 5e-06, "loss": 0.0677, "step": 3528 }, { "epoch": 0.6747609942638624, "grad_norm": 1.7469233274459839, "learning_rate": 5e-06, "loss": 0.0908, "step": 3529 }, { "epoch": 0.6749521988527725, "grad_norm": 0.5927736163139343, "learning_rate": 5e-06, "loss": 0.0342, "step": 3530 }, { "epoch": 0.6751434034416826, "grad_norm": 3.6534321308135986, "learning_rate": 5e-06, "loss": 0.1764, "step": 3531 }, { "epoch": 0.6753346080305928, "grad_norm": 1.5184030532836914, "learning_rate": 5e-06, "loss": 0.1878, "step": 3532 }, { "epoch": 0.6755258126195028, "grad_norm": 1.760200023651123, "learning_rate": 5e-06, "loss": 0.1343, "step": 3533 }, { "epoch": 0.675717017208413, "grad_norm": 2.823066234588623, "learning_rate": 5e-06, "loss": 0.3365, "step": 3534 }, { "epoch": 0.6759082217973231, "grad_norm": 1.697743535041809, "learning_rate": 5e-06, "loss": 0.1053, "step": 3535 }, { "epoch": 0.6760994263862332, "grad_norm": 1.5451792478561401, "learning_rate": 5e-06, "loss": 0.1168, "step": 3536 }, { "epoch": 0.6762906309751434, "grad_norm": 2.006969690322876, "learning_rate": 5e-06, "loss": 0.1252, "step": 3537 }, { "epoch": 0.6764818355640535, "grad_norm": 2.5118765830993652, "learning_rate": 5e-06, "loss": 0.3674, "step": 3538 }, { "epoch": 0.6766730401529637, "grad_norm": 2.0223960876464844, "learning_rate": 5e-06, "loss": 0.2248, "step": 3539 }, { "epoch": 0.6768642447418738, "grad_norm": 1.5017013549804688, "learning_rate": 5e-06, "loss": 0.1731, "step": 3540 }, { "epoch": 0.677055449330784, "grad_norm": 1.1571253538131714, "learning_rate": 5e-06, "loss": 0.1285, "step": 3541 }, { "epoch": 0.6772466539196941, "grad_norm": 0.9846636652946472, "learning_rate": 5e-06, "loss": 0.092, "step": 3542 }, { "epoch": 0.6774378585086042, "grad_norm": 1.6082611083984375, "learning_rate": 5e-06, "loss": 0.0561, "step": 3543 }, { "epoch": 0.6776290630975144, "grad_norm": 2.4058167934417725, "learning_rate": 5e-06, "loss": 0.1535, "step": 3544 }, { "epoch": 0.6778202676864244, "grad_norm": 1.8551628589630127, "learning_rate": 5e-06, "loss": 0.227, "step": 3545 }, { "epoch": 0.6780114722753346, "grad_norm": 2.5418436527252197, "learning_rate": 5e-06, "loss": 0.4509, "step": 3546 }, { "epoch": 0.6782026768642447, "grad_norm": 1.8027793169021606, "learning_rate": 5e-06, "loss": 0.1219, "step": 3547 }, { "epoch": 0.6783938814531548, "grad_norm": 1.5673744678497314, "learning_rate": 5e-06, "loss": 0.2043, "step": 3548 }, { "epoch": 0.678585086042065, "grad_norm": 1.2510675191879272, "learning_rate": 5e-06, "loss": 0.0827, "step": 3549 }, { "epoch": 0.6787762906309751, "grad_norm": 1.4993951320648193, "learning_rate": 5e-06, "loss": 0.0514, "step": 3550 }, { "epoch": 0.6789674952198853, "grad_norm": 2.15187931060791, "learning_rate": 5e-06, "loss": 0.2552, "step": 3551 }, { "epoch": 0.6791586998087954, "grad_norm": 2.6083855628967285, "learning_rate": 5e-06, "loss": 0.4038, "step": 3552 }, { "epoch": 0.6793499043977056, "grad_norm": 2.16745924949646, "learning_rate": 5e-06, "loss": 0.2588, "step": 3553 }, { "epoch": 0.6795411089866157, "grad_norm": 1.6964585781097412, "learning_rate": 5e-06, "loss": 0.1775, "step": 3554 }, { "epoch": 0.6797323135755258, "grad_norm": 2.437232732772827, "learning_rate": 5e-06, "loss": 0.1336, "step": 3555 }, { "epoch": 0.679923518164436, "grad_norm": 1.8153268098831177, "learning_rate": 5e-06, "loss": 0.095, "step": 3556 }, { "epoch": 0.6801147227533461, "grad_norm": 2.4039676189422607, "learning_rate": 5e-06, "loss": 0.2803, "step": 3557 }, { "epoch": 0.6803059273422563, "grad_norm": 3.8345534801483154, "learning_rate": 5e-06, "loss": 0.2945, "step": 3558 }, { "epoch": 0.6804971319311663, "grad_norm": 2.759597063064575, "learning_rate": 5e-06, "loss": 0.2355, "step": 3559 }, { "epoch": 0.6806883365200764, "grad_norm": 1.6006584167480469, "learning_rate": 5e-06, "loss": 0.1212, "step": 3560 }, { "epoch": 0.6808795411089866, "grad_norm": 1.7173149585723877, "learning_rate": 5e-06, "loss": 0.1152, "step": 3561 }, { "epoch": 0.6810707456978967, "grad_norm": 1.0151087045669556, "learning_rate": 5e-06, "loss": 0.0712, "step": 3562 }, { "epoch": 0.6812619502868069, "grad_norm": 1.4937494993209839, "learning_rate": 5e-06, "loss": 0.0642, "step": 3563 }, { "epoch": 0.681453154875717, "grad_norm": 1.3477442264556885, "learning_rate": 5e-06, "loss": 0.1069, "step": 3564 }, { "epoch": 0.6816443594646272, "grad_norm": 1.9342656135559082, "learning_rate": 5e-06, "loss": 0.4498, "step": 3565 }, { "epoch": 0.6818355640535373, "grad_norm": 2.319324254989624, "learning_rate": 5e-06, "loss": 0.3148, "step": 3566 }, { "epoch": 0.6820267686424474, "grad_norm": 2.8179759979248047, "learning_rate": 5e-06, "loss": 0.362, "step": 3567 }, { "epoch": 0.6822179732313576, "grad_norm": 2.6950948238372803, "learning_rate": 5e-06, "loss": 0.0819, "step": 3568 }, { "epoch": 0.6824091778202677, "grad_norm": 2.1264123916625977, "learning_rate": 5e-06, "loss": 0.1911, "step": 3569 }, { "epoch": 0.6826003824091779, "grad_norm": 2.7540876865386963, "learning_rate": 5e-06, "loss": 0.6599, "step": 3570 }, { "epoch": 0.682791586998088, "grad_norm": 2.1270487308502197, "learning_rate": 5e-06, "loss": 0.3011, "step": 3571 }, { "epoch": 0.682982791586998, "grad_norm": 2.540781021118164, "learning_rate": 5e-06, "loss": 0.4374, "step": 3572 }, { "epoch": 0.6831739961759082, "grad_norm": 1.4416691064834595, "learning_rate": 5e-06, "loss": 0.118, "step": 3573 }, { "epoch": 0.6833652007648183, "grad_norm": 1.5677932500839233, "learning_rate": 5e-06, "loss": 0.0783, "step": 3574 }, { "epoch": 0.6835564053537285, "grad_norm": 2.3644917011260986, "learning_rate": 5e-06, "loss": 0.0778, "step": 3575 }, { "epoch": 0.6837476099426386, "grad_norm": 2.1362829208374023, "learning_rate": 5e-06, "loss": 0.2909, "step": 3576 }, { "epoch": 0.6839388145315488, "grad_norm": 3.296583890914917, "learning_rate": 5e-06, "loss": 0.4958, "step": 3577 }, { "epoch": 0.6841300191204589, "grad_norm": 1.5384232997894287, "learning_rate": 5e-06, "loss": 0.1698, "step": 3578 }, { "epoch": 0.684321223709369, "grad_norm": 2.5473268032073975, "learning_rate": 5e-06, "loss": 0.2099, "step": 3579 }, { "epoch": 0.6845124282982792, "grad_norm": 1.863061547279358, "learning_rate": 5e-06, "loss": 0.1071, "step": 3580 }, { "epoch": 0.6847036328871893, "grad_norm": 2.5463814735412598, "learning_rate": 5e-06, "loss": 0.0986, "step": 3581 }, { "epoch": 0.6848948374760995, "grad_norm": 1.9547677040100098, "learning_rate": 5e-06, "loss": 0.2291, "step": 3582 }, { "epoch": 0.6850860420650096, "grad_norm": 1.2805721759796143, "learning_rate": 5e-06, "loss": 0.1025, "step": 3583 }, { "epoch": 0.6852772466539196, "grad_norm": 2.379103183746338, "learning_rate": 5e-06, "loss": 0.3174, "step": 3584 }, { "epoch": 0.6854684512428298, "grad_norm": 1.308780312538147, "learning_rate": 5e-06, "loss": 0.1225, "step": 3585 }, { "epoch": 0.6856596558317399, "grad_norm": 2.074080228805542, "learning_rate": 5e-06, "loss": 0.1967, "step": 3586 }, { "epoch": 0.6858508604206501, "grad_norm": 1.9603567123413086, "learning_rate": 5e-06, "loss": 0.1479, "step": 3587 }, { "epoch": 0.6860420650095602, "grad_norm": 1.7865123748779297, "learning_rate": 5e-06, "loss": 0.1054, "step": 3588 }, { "epoch": 0.6862332695984703, "grad_norm": 3.6250503063201904, "learning_rate": 5e-06, "loss": 0.4375, "step": 3589 }, { "epoch": 0.6864244741873805, "grad_norm": 2.0481324195861816, "learning_rate": 5e-06, "loss": 0.1674, "step": 3590 }, { "epoch": 0.6866156787762906, "grad_norm": 2.7558176517486572, "learning_rate": 5e-06, "loss": 0.1392, "step": 3591 }, { "epoch": 0.6868068833652008, "grad_norm": 1.8074321746826172, "learning_rate": 5e-06, "loss": 0.1178, "step": 3592 }, { "epoch": 0.6869980879541109, "grad_norm": 1.997239351272583, "learning_rate": 5e-06, "loss": 0.1253, "step": 3593 }, { "epoch": 0.6871892925430211, "grad_norm": 2.378185987472534, "learning_rate": 5e-06, "loss": 0.3194, "step": 3594 }, { "epoch": 0.6873804971319312, "grad_norm": 2.6446828842163086, "learning_rate": 5e-06, "loss": 0.4638, "step": 3595 }, { "epoch": 0.6875717017208413, "grad_norm": 2.4682164192199707, "learning_rate": 5e-06, "loss": 0.3686, "step": 3596 }, { "epoch": 0.6877629063097515, "grad_norm": 1.9417331218719482, "learning_rate": 5e-06, "loss": 0.2668, "step": 3597 }, { "epoch": 0.6879541108986615, "grad_norm": 1.9014066457748413, "learning_rate": 5e-06, "loss": 0.1559, "step": 3598 }, { "epoch": 0.6881453154875717, "grad_norm": 1.8162672519683838, "learning_rate": 5e-06, "loss": 0.1298, "step": 3599 }, { "epoch": 0.6883365200764818, "grad_norm": 1.8843580484390259, "learning_rate": 5e-06, "loss": 0.1089, "step": 3600 }, { "epoch": 0.6885277246653919, "grad_norm": 1.4576982259750366, "learning_rate": 5e-06, "loss": 0.1632, "step": 3601 }, { "epoch": 0.6887189292543021, "grad_norm": 3.355142831802368, "learning_rate": 5e-06, "loss": 0.2791, "step": 3602 }, { "epoch": 0.6889101338432122, "grad_norm": 1.7258144617080688, "learning_rate": 5e-06, "loss": 0.2368, "step": 3603 }, { "epoch": 0.6891013384321224, "grad_norm": 1.7668720483779907, "learning_rate": 5e-06, "loss": 0.132, "step": 3604 }, { "epoch": 0.6892925430210325, "grad_norm": 1.8233599662780762, "learning_rate": 5e-06, "loss": 0.0762, "step": 3605 }, { "epoch": 0.6894837476099427, "grad_norm": 2.0434274673461914, "learning_rate": 5e-06, "loss": 0.071, "step": 3606 }, { "epoch": 0.6896749521988528, "grad_norm": 2.575824022293091, "learning_rate": 5e-06, "loss": 0.4385, "step": 3607 }, { "epoch": 0.6898661567877629, "grad_norm": 3.179419755935669, "learning_rate": 5e-06, "loss": 0.4052, "step": 3608 }, { "epoch": 0.6900573613766731, "grad_norm": 2.368760347366333, "learning_rate": 5e-06, "loss": 0.4097, "step": 3609 }, { "epoch": 0.6902485659655831, "grad_norm": 1.6637201309204102, "learning_rate": 5e-06, "loss": 0.1209, "step": 3610 }, { "epoch": 0.6904397705544933, "grad_norm": 1.4172734022140503, "learning_rate": 5e-06, "loss": 0.1386, "step": 3611 }, { "epoch": 0.6906309751434034, "grad_norm": 2.328533411026001, "learning_rate": 5e-06, "loss": 0.2946, "step": 3612 }, { "epoch": 0.6908221797323135, "grad_norm": 1.887550711631775, "learning_rate": 5e-06, "loss": 0.1478, "step": 3613 }, { "epoch": 0.6910133843212237, "grad_norm": 1.8322516679763794, "learning_rate": 5e-06, "loss": 0.1951, "step": 3614 }, { "epoch": 0.6912045889101338, "grad_norm": 1.0440239906311035, "learning_rate": 5e-06, "loss": 0.1293, "step": 3615 }, { "epoch": 0.691395793499044, "grad_norm": 2.5914018154144287, "learning_rate": 5e-06, "loss": 0.3502, "step": 3616 }, { "epoch": 0.6915869980879541, "grad_norm": 2.4446136951446533, "learning_rate": 5e-06, "loss": 0.1806, "step": 3617 }, { "epoch": 0.6917782026768643, "grad_norm": 1.4871900081634521, "learning_rate": 5e-06, "loss": 0.0797, "step": 3618 }, { "epoch": 0.6919694072657744, "grad_norm": 2.3120055198669434, "learning_rate": 5e-06, "loss": 0.1075, "step": 3619 }, { "epoch": 0.6921606118546845, "grad_norm": 1.4232051372528076, "learning_rate": 5e-06, "loss": 0.1484, "step": 3620 }, { "epoch": 0.6923518164435947, "grad_norm": 3.1616592407226562, "learning_rate": 5e-06, "loss": 0.5661, "step": 3621 }, { "epoch": 0.6925430210325048, "grad_norm": 0.8112595081329346, "learning_rate": 5e-06, "loss": 0.0917, "step": 3622 }, { "epoch": 0.692734225621415, "grad_norm": 1.0516867637634277, "learning_rate": 5e-06, "loss": 0.0633, "step": 3623 }, { "epoch": 0.692925430210325, "grad_norm": 1.4513195753097534, "learning_rate": 5e-06, "loss": 0.0825, "step": 3624 }, { "epoch": 0.6931166347992351, "grad_norm": 2.015878200531006, "learning_rate": 5e-06, "loss": 0.1085, "step": 3625 }, { "epoch": 0.6933078393881453, "grad_norm": 2.6140496730804443, "learning_rate": 5e-06, "loss": 0.5673, "step": 3626 }, { "epoch": 0.6934990439770554, "grad_norm": 2.6710994243621826, "learning_rate": 5e-06, "loss": 0.1457, "step": 3627 }, { "epoch": 0.6936902485659656, "grad_norm": 1.289347529411316, "learning_rate": 5e-06, "loss": 0.0971, "step": 3628 }, { "epoch": 0.6938814531548757, "grad_norm": 1.6329401731491089, "learning_rate": 5e-06, "loss": 0.1046, "step": 3629 }, { "epoch": 0.6940726577437859, "grad_norm": 1.3376904726028442, "learning_rate": 5e-06, "loss": 0.0794, "step": 3630 }, { "epoch": 0.694263862332696, "grad_norm": 2.8650128841400146, "learning_rate": 5e-06, "loss": 0.1426, "step": 3631 }, { "epoch": 0.6944550669216061, "grad_norm": 3.3484082221984863, "learning_rate": 5e-06, "loss": 0.5369, "step": 3632 }, { "epoch": 0.6946462715105163, "grad_norm": 2.710831880569458, "learning_rate": 5e-06, "loss": 0.5368, "step": 3633 }, { "epoch": 0.6948374760994264, "grad_norm": 1.1810380220413208, "learning_rate": 5e-06, "loss": 0.1185, "step": 3634 }, { "epoch": 0.6950286806883366, "grad_norm": 0.5851054787635803, "learning_rate": 5e-06, "loss": 0.0376, "step": 3635 }, { "epoch": 0.6952198852772467, "grad_norm": 3.097205400466919, "learning_rate": 5e-06, "loss": 0.2126, "step": 3636 }, { "epoch": 0.6954110898661567, "grad_norm": 1.8246523141860962, "learning_rate": 5e-06, "loss": 0.1213, "step": 3637 }, { "epoch": 0.6956022944550669, "grad_norm": 1.682036280632019, "learning_rate": 5e-06, "loss": 0.0875, "step": 3638 }, { "epoch": 0.695793499043977, "grad_norm": 3.0946667194366455, "learning_rate": 5e-06, "loss": 0.6476, "step": 3639 }, { "epoch": 0.6959847036328872, "grad_norm": 1.0556777715682983, "learning_rate": 5e-06, "loss": 0.0874, "step": 3640 }, { "epoch": 0.6961759082217973, "grad_norm": 1.5672385692596436, "learning_rate": 5e-06, "loss": 0.1432, "step": 3641 }, { "epoch": 0.6963671128107075, "grad_norm": 1.1081948280334473, "learning_rate": 5e-06, "loss": 0.1039, "step": 3642 }, { "epoch": 0.6965583173996176, "grad_norm": 3.239100694656372, "learning_rate": 5e-06, "loss": 0.3624, "step": 3643 }, { "epoch": 0.6967495219885277, "grad_norm": 1.96116304397583, "learning_rate": 5e-06, "loss": 0.1417, "step": 3644 }, { "epoch": 0.6969407265774379, "grad_norm": 2.3989415168762207, "learning_rate": 5e-06, "loss": 0.3667, "step": 3645 }, { "epoch": 0.697131931166348, "grad_norm": 1.7304235696792603, "learning_rate": 5e-06, "loss": 0.3046, "step": 3646 }, { "epoch": 0.6973231357552582, "grad_norm": 1.469014286994934, "learning_rate": 5e-06, "loss": 0.1986, "step": 3647 }, { "epoch": 0.6975143403441683, "grad_norm": 2.6511895656585693, "learning_rate": 5e-06, "loss": 0.1332, "step": 3648 }, { "epoch": 0.6977055449330783, "grad_norm": 4.76706600189209, "learning_rate": 5e-06, "loss": 0.3297, "step": 3649 }, { "epoch": 0.6978967495219885, "grad_norm": 1.9821535348892212, "learning_rate": 5e-06, "loss": 0.0866, "step": 3650 }, { "epoch": 0.6980879541108986, "grad_norm": 1.3378146886825562, "learning_rate": 5e-06, "loss": 0.1156, "step": 3651 }, { "epoch": 0.6982791586998088, "grad_norm": 1.85330069065094, "learning_rate": 5e-06, "loss": 0.2699, "step": 3652 }, { "epoch": 0.6984703632887189, "grad_norm": 2.159931182861328, "learning_rate": 5e-06, "loss": 0.3349, "step": 3653 }, { "epoch": 0.698661567877629, "grad_norm": 2.206817150115967, "learning_rate": 5e-06, "loss": 0.1541, "step": 3654 }, { "epoch": 0.6988527724665392, "grad_norm": 2.111424446105957, "learning_rate": 5e-06, "loss": 0.2197, "step": 3655 }, { "epoch": 0.6990439770554493, "grad_norm": 2.3077478408813477, "learning_rate": 5e-06, "loss": 0.3232, "step": 3656 }, { "epoch": 0.6992351816443595, "grad_norm": 2.5184547901153564, "learning_rate": 5e-06, "loss": 0.1036, "step": 3657 }, { "epoch": 0.6994263862332696, "grad_norm": 2.8431849479675293, "learning_rate": 5e-06, "loss": 0.5781, "step": 3658 }, { "epoch": 0.6996175908221798, "grad_norm": 2.026881694793701, "learning_rate": 5e-06, "loss": 0.3044, "step": 3659 }, { "epoch": 0.6998087954110899, "grad_norm": 1.5575225353240967, "learning_rate": 5e-06, "loss": 0.0619, "step": 3660 }, { "epoch": 0.7, "grad_norm": 1.6749697923660278, "learning_rate": 5e-06, "loss": 0.1297, "step": 3661 }, { "epoch": 0.7001912045889102, "grad_norm": 1.1908769607543945, "learning_rate": 5e-06, "loss": 0.0485, "step": 3662 }, { "epoch": 0.7003824091778202, "grad_norm": 2.0859763622283936, "learning_rate": 5e-06, "loss": 0.2337, "step": 3663 }, { "epoch": 0.7005736137667304, "grad_norm": 1.7970099449157715, "learning_rate": 5e-06, "loss": 0.2513, "step": 3664 }, { "epoch": 0.7007648183556405, "grad_norm": 2.7064900398254395, "learning_rate": 5e-06, "loss": 0.3637, "step": 3665 }, { "epoch": 0.7009560229445506, "grad_norm": 3.3071577548980713, "learning_rate": 5e-06, "loss": 0.3557, "step": 3666 }, { "epoch": 0.7011472275334608, "grad_norm": 1.6861001253128052, "learning_rate": 5e-06, "loss": 0.1279, "step": 3667 }, { "epoch": 0.7013384321223709, "grad_norm": 1.3847708702087402, "learning_rate": 5e-06, "loss": 0.0656, "step": 3668 }, { "epoch": 0.7015296367112811, "grad_norm": 3.5664327144622803, "learning_rate": 5e-06, "loss": 0.1027, "step": 3669 }, { "epoch": 0.7017208413001912, "grad_norm": 3.75789213180542, "learning_rate": 5e-06, "loss": 0.5611, "step": 3670 }, { "epoch": 0.7019120458891014, "grad_norm": 2.001065254211426, "learning_rate": 5e-06, "loss": 0.2467, "step": 3671 }, { "epoch": 0.7021032504780115, "grad_norm": 2.242720603942871, "learning_rate": 5e-06, "loss": 0.1748, "step": 3672 }, { "epoch": 0.7022944550669216, "grad_norm": 1.567443609237671, "learning_rate": 5e-06, "loss": 0.1055, "step": 3673 }, { "epoch": 0.7024856596558318, "grad_norm": 1.640781044960022, "learning_rate": 5e-06, "loss": 0.0745, "step": 3674 }, { "epoch": 0.7026768642447419, "grad_norm": 1.6443709135055542, "learning_rate": 5e-06, "loss": 0.1, "step": 3675 }, { "epoch": 0.702868068833652, "grad_norm": 2.157893419265747, "learning_rate": 5e-06, "loss": 0.2708, "step": 3676 }, { "epoch": 0.7030592734225621, "grad_norm": 1.9968886375427246, "learning_rate": 5e-06, "loss": 0.1628, "step": 3677 }, { "epoch": 0.7032504780114722, "grad_norm": 1.2432177066802979, "learning_rate": 5e-06, "loss": 0.0915, "step": 3678 }, { "epoch": 0.7034416826003824, "grad_norm": 1.2381335496902466, "learning_rate": 5e-06, "loss": 0.1092, "step": 3679 }, { "epoch": 0.7036328871892925, "grad_norm": 1.1749067306518555, "learning_rate": 5e-06, "loss": 0.101, "step": 3680 }, { "epoch": 0.7038240917782027, "grad_norm": 3.458665370941162, "learning_rate": 5e-06, "loss": 0.1514, "step": 3681 }, { "epoch": 0.7040152963671128, "grad_norm": 2.4206576347351074, "learning_rate": 5e-06, "loss": 0.2704, "step": 3682 }, { "epoch": 0.704206500956023, "grad_norm": 1.8550755977630615, "learning_rate": 5e-06, "loss": 0.2404, "step": 3683 }, { "epoch": 0.7043977055449331, "grad_norm": 1.5968091487884521, "learning_rate": 5e-06, "loss": 0.0821, "step": 3684 }, { "epoch": 0.7045889101338432, "grad_norm": 2.3588109016418457, "learning_rate": 5e-06, "loss": 0.1878, "step": 3685 }, { "epoch": 0.7047801147227534, "grad_norm": 1.6067357063293457, "learning_rate": 5e-06, "loss": 0.0753, "step": 3686 }, { "epoch": 0.7049713193116635, "grad_norm": 1.9464937448501587, "learning_rate": 5e-06, "loss": 0.0893, "step": 3687 }, { "epoch": 0.7051625239005737, "grad_norm": 2.355674982070923, "learning_rate": 5e-06, "loss": 0.1453, "step": 3688 }, { "epoch": 0.7053537284894837, "grad_norm": 3.20164155960083, "learning_rate": 5e-06, "loss": 0.4407, "step": 3689 }, { "epoch": 0.7055449330783938, "grad_norm": 2.1521313190460205, "learning_rate": 5e-06, "loss": 0.1517, "step": 3690 }, { "epoch": 0.705736137667304, "grad_norm": 1.2731857299804688, "learning_rate": 5e-06, "loss": 0.1149, "step": 3691 }, { "epoch": 0.7059273422562141, "grad_norm": 1.5699528455734253, "learning_rate": 5e-06, "loss": 0.0717, "step": 3692 }, { "epoch": 0.7061185468451243, "grad_norm": 1.5166230201721191, "learning_rate": 5e-06, "loss": 0.0978, "step": 3693 }, { "epoch": 0.7063097514340344, "grad_norm": 1.5976234674453735, "learning_rate": 5e-06, "loss": 0.118, "step": 3694 }, { "epoch": 0.7065009560229446, "grad_norm": 2.420431137084961, "learning_rate": 5e-06, "loss": 0.3126, "step": 3695 }, { "epoch": 0.7066921606118547, "grad_norm": 2.7159039974212646, "learning_rate": 5e-06, "loss": 0.5264, "step": 3696 }, { "epoch": 0.7068833652007648, "grad_norm": 2.4235072135925293, "learning_rate": 5e-06, "loss": 0.4054, "step": 3697 }, { "epoch": 0.707074569789675, "grad_norm": 1.8437212705612183, "learning_rate": 5e-06, "loss": 0.2117, "step": 3698 }, { "epoch": 0.7072657743785851, "grad_norm": 1.3941651582717896, "learning_rate": 5e-06, "loss": 0.1185, "step": 3699 }, { "epoch": 0.7074569789674953, "grad_norm": 1.5819287300109863, "learning_rate": 5e-06, "loss": 0.1037, "step": 3700 }, { "epoch": 0.7076481835564054, "grad_norm": 3.036449670791626, "learning_rate": 5e-06, "loss": 0.4601, "step": 3701 }, { "epoch": 0.7078393881453154, "grad_norm": 1.64691960811615, "learning_rate": 5e-06, "loss": 0.2143, "step": 3702 }, { "epoch": 0.7080305927342256, "grad_norm": 1.3194284439086914, "learning_rate": 5e-06, "loss": 0.1746, "step": 3703 }, { "epoch": 0.7082217973231357, "grad_norm": 1.3173198699951172, "learning_rate": 5e-06, "loss": 0.1449, "step": 3704 }, { "epoch": 0.7084130019120459, "grad_norm": 2.3822851181030273, "learning_rate": 5e-06, "loss": 0.2857, "step": 3705 }, { "epoch": 0.708604206500956, "grad_norm": 0.9149222373962402, "learning_rate": 5e-06, "loss": 0.0497, "step": 3706 }, { "epoch": 0.7087954110898662, "grad_norm": 2.2636375427246094, "learning_rate": 5e-06, "loss": 0.2972, "step": 3707 }, { "epoch": 0.7089866156787763, "grad_norm": 1.518246054649353, "learning_rate": 5e-06, "loss": 0.1799, "step": 3708 }, { "epoch": 0.7091778202676864, "grad_norm": 1.2613885402679443, "learning_rate": 5e-06, "loss": 0.1001, "step": 3709 }, { "epoch": 0.7093690248565966, "grad_norm": 2.623847723007202, "learning_rate": 5e-06, "loss": 0.2827, "step": 3710 }, { "epoch": 0.7095602294455067, "grad_norm": 1.9074766635894775, "learning_rate": 5e-06, "loss": 0.0718, "step": 3711 }, { "epoch": 0.7097514340344169, "grad_norm": 2.9987568855285645, "learning_rate": 5e-06, "loss": 0.1303, "step": 3712 }, { "epoch": 0.709942638623327, "grad_norm": 1.0691415071487427, "learning_rate": 5e-06, "loss": 0.0659, "step": 3713 }, { "epoch": 0.710133843212237, "grad_norm": 3.7600460052490234, "learning_rate": 5e-06, "loss": 0.86, "step": 3714 }, { "epoch": 0.7103250478011472, "grad_norm": 1.6110256910324097, "learning_rate": 5e-06, "loss": 0.1318, "step": 3715 }, { "epoch": 0.7105162523900573, "grad_norm": 2.0216493606567383, "learning_rate": 5e-06, "loss": 0.2925, "step": 3716 }, { "epoch": 0.7107074569789675, "grad_norm": 1.7469826936721802, "learning_rate": 5e-06, "loss": 0.2517, "step": 3717 }, { "epoch": 0.7108986615678776, "grad_norm": 1.7995054721832275, "learning_rate": 5e-06, "loss": 0.1195, "step": 3718 }, { "epoch": 0.7110898661567877, "grad_norm": 2.31731915473938, "learning_rate": 5e-06, "loss": 0.1143, "step": 3719 }, { "epoch": 0.7112810707456979, "grad_norm": 3.077112913131714, "learning_rate": 5e-06, "loss": 0.7406, "step": 3720 }, { "epoch": 0.711472275334608, "grad_norm": 3.16402530670166, "learning_rate": 5e-06, "loss": 0.465, "step": 3721 }, { "epoch": 0.7116634799235182, "grad_norm": 2.755120038986206, "learning_rate": 5e-06, "loss": 0.2713, "step": 3722 }, { "epoch": 0.7118546845124283, "grad_norm": 1.240608811378479, "learning_rate": 5e-06, "loss": 0.0644, "step": 3723 }, { "epoch": 0.7120458891013385, "grad_norm": 1.4552818536758423, "learning_rate": 5e-06, "loss": 0.1006, "step": 3724 }, { "epoch": 0.7122370936902486, "grad_norm": 1.733351707458496, "learning_rate": 5e-06, "loss": 0.1208, "step": 3725 }, { "epoch": 0.7124282982791587, "grad_norm": 2.0551533699035645, "learning_rate": 5e-06, "loss": 0.3772, "step": 3726 }, { "epoch": 0.7126195028680689, "grad_norm": 3.2883100509643555, "learning_rate": 5e-06, "loss": 0.5013, "step": 3727 }, { "epoch": 0.7128107074569789, "grad_norm": 1.076749563217163, "learning_rate": 5e-06, "loss": 0.1152, "step": 3728 }, { "epoch": 0.7130019120458891, "grad_norm": 2.451246976852417, "learning_rate": 5e-06, "loss": 0.2865, "step": 3729 }, { "epoch": 0.7131931166347992, "grad_norm": 1.9702866077423096, "learning_rate": 5e-06, "loss": 0.2526, "step": 3730 }, { "epoch": 0.7133843212237093, "grad_norm": 2.362886667251587, "learning_rate": 5e-06, "loss": 0.1457, "step": 3731 }, { "epoch": 0.7135755258126195, "grad_norm": 2.858502149581909, "learning_rate": 5e-06, "loss": 0.4665, "step": 3732 }, { "epoch": 0.7137667304015296, "grad_norm": 1.5832735300064087, "learning_rate": 5e-06, "loss": 0.1224, "step": 3733 }, { "epoch": 0.7139579349904398, "grad_norm": 1.0978338718414307, "learning_rate": 5e-06, "loss": 0.1259, "step": 3734 }, { "epoch": 0.7141491395793499, "grad_norm": 2.182143211364746, "learning_rate": 5e-06, "loss": 0.2562, "step": 3735 }, { "epoch": 0.7143403441682601, "grad_norm": 2.5188608169555664, "learning_rate": 5e-06, "loss": 0.2833, "step": 3736 }, { "epoch": 0.7145315487571702, "grad_norm": 2.875762701034546, "learning_rate": 5e-06, "loss": 0.163, "step": 3737 }, { "epoch": 0.7147227533460803, "grad_norm": 1.844509243965149, "learning_rate": 5e-06, "loss": 0.123, "step": 3738 }, { "epoch": 0.7149139579349905, "grad_norm": 2.9208295345306396, "learning_rate": 5e-06, "loss": 0.3887, "step": 3739 }, { "epoch": 0.7151051625239006, "grad_norm": 2.217679977416992, "learning_rate": 5e-06, "loss": 0.2439, "step": 3740 }, { "epoch": 0.7152963671128107, "grad_norm": 2.5245535373687744, "learning_rate": 5e-06, "loss": 0.3163, "step": 3741 }, { "epoch": 0.7154875717017208, "grad_norm": 1.9459753036499023, "learning_rate": 5e-06, "loss": 0.1344, "step": 3742 }, { "epoch": 0.7156787762906309, "grad_norm": 2.5548386573791504, "learning_rate": 5e-06, "loss": 0.0823, "step": 3743 }, { "epoch": 0.7158699808795411, "grad_norm": 1.1367732286453247, "learning_rate": 5e-06, "loss": 0.0603, "step": 3744 }, { "epoch": 0.7160611854684512, "grad_norm": 1.7649719715118408, "learning_rate": 5e-06, "loss": 0.2555, "step": 3745 }, { "epoch": 0.7162523900573614, "grad_norm": 2.476152181625366, "learning_rate": 5e-06, "loss": 0.2607, "step": 3746 }, { "epoch": 0.7164435946462715, "grad_norm": 1.505000114440918, "learning_rate": 5e-06, "loss": 0.1498, "step": 3747 }, { "epoch": 0.7166347992351817, "grad_norm": 1.3874759674072266, "learning_rate": 5e-06, "loss": 0.0955, "step": 3748 }, { "epoch": 0.7168260038240918, "grad_norm": 1.742844581604004, "learning_rate": 5e-06, "loss": 0.1141, "step": 3749 }, { "epoch": 0.7170172084130019, "grad_norm": 2.4118449687957764, "learning_rate": 5e-06, "loss": 0.176, "step": 3750 }, { "epoch": 0.7172084130019121, "grad_norm": 1.4940874576568604, "learning_rate": 5e-06, "loss": 0.1553, "step": 3751 }, { "epoch": 0.7173996175908222, "grad_norm": 1.963732361793518, "learning_rate": 5e-06, "loss": 0.1711, "step": 3752 }, { "epoch": 0.7175908221797324, "grad_norm": 1.9962869882583618, "learning_rate": 5e-06, "loss": 0.2072, "step": 3753 }, { "epoch": 0.7177820267686424, "grad_norm": 1.2074509859085083, "learning_rate": 5e-06, "loss": 0.1099, "step": 3754 }, { "epoch": 0.7179732313575525, "grad_norm": 1.593335509300232, "learning_rate": 5e-06, "loss": 0.1583, "step": 3755 }, { "epoch": 0.7181644359464627, "grad_norm": 1.2521311044692993, "learning_rate": 5e-06, "loss": 0.0686, "step": 3756 }, { "epoch": 0.7183556405353728, "grad_norm": 2.19175124168396, "learning_rate": 5e-06, "loss": 0.3097, "step": 3757 }, { "epoch": 0.718546845124283, "grad_norm": 1.7197211980819702, "learning_rate": 5e-06, "loss": 0.1602, "step": 3758 }, { "epoch": 0.7187380497131931, "grad_norm": 2.827324867248535, "learning_rate": 5e-06, "loss": 0.5046, "step": 3759 }, { "epoch": 0.7189292543021033, "grad_norm": 2.1176669597625732, "learning_rate": 5e-06, "loss": 0.0945, "step": 3760 }, { "epoch": 0.7191204588910134, "grad_norm": 1.07952880859375, "learning_rate": 5e-06, "loss": 0.0495, "step": 3761 }, { "epoch": 0.7193116634799235, "grad_norm": 2.3675689697265625, "learning_rate": 5e-06, "loss": 0.2094, "step": 3762 }, { "epoch": 0.7195028680688337, "grad_norm": 2.8417303562164307, "learning_rate": 5e-06, "loss": 0.4245, "step": 3763 }, { "epoch": 0.7196940726577438, "grad_norm": 3.0691707134246826, "learning_rate": 5e-06, "loss": 0.5019, "step": 3764 }, { "epoch": 0.719885277246654, "grad_norm": 1.418975591659546, "learning_rate": 5e-06, "loss": 0.1174, "step": 3765 }, { "epoch": 0.720076481835564, "grad_norm": 2.2532196044921875, "learning_rate": 5e-06, "loss": 0.1952, "step": 3766 }, { "epoch": 0.7202676864244741, "grad_norm": 1.7203007936477661, "learning_rate": 5e-06, "loss": 0.1286, "step": 3767 }, { "epoch": 0.7204588910133843, "grad_norm": 2.12467885017395, "learning_rate": 5e-06, "loss": 0.155, "step": 3768 }, { "epoch": 0.7206500956022944, "grad_norm": 1.4739885330200195, "learning_rate": 5e-06, "loss": 0.0862, "step": 3769 }, { "epoch": 0.7208413001912046, "grad_norm": 3.242621898651123, "learning_rate": 5e-06, "loss": 0.6335, "step": 3770 }, { "epoch": 0.7210325047801147, "grad_norm": 2.5275886058807373, "learning_rate": 5e-06, "loss": 0.4305, "step": 3771 }, { "epoch": 0.7212237093690248, "grad_norm": 1.7171810865402222, "learning_rate": 5e-06, "loss": 0.1955, "step": 3772 }, { "epoch": 0.721414913957935, "grad_norm": 1.231096625328064, "learning_rate": 5e-06, "loss": 0.092, "step": 3773 }, { "epoch": 0.7216061185468451, "grad_norm": 1.7485922574996948, "learning_rate": 5e-06, "loss": 0.1059, "step": 3774 }, { "epoch": 0.7217973231357553, "grad_norm": 2.117539882659912, "learning_rate": 5e-06, "loss": 0.1215, "step": 3775 }, { "epoch": 0.7219885277246654, "grad_norm": 3.3793411254882812, "learning_rate": 5e-06, "loss": 0.5776, "step": 3776 }, { "epoch": 0.7221797323135756, "grad_norm": 1.7460490465164185, "learning_rate": 5e-06, "loss": 0.1292, "step": 3777 }, { "epoch": 0.7223709369024857, "grad_norm": 2.388984441757202, "learning_rate": 5e-06, "loss": 0.3883, "step": 3778 }, { "epoch": 0.7225621414913957, "grad_norm": 2.096761703491211, "learning_rate": 5e-06, "loss": 0.1509, "step": 3779 }, { "epoch": 0.722753346080306, "grad_norm": 1.783759593963623, "learning_rate": 5e-06, "loss": 0.2602, "step": 3780 }, { "epoch": 0.722944550669216, "grad_norm": 1.4529781341552734, "learning_rate": 5e-06, "loss": 0.0799, "step": 3781 }, { "epoch": 0.7231357552581262, "grad_norm": 2.474672317504883, "learning_rate": 5e-06, "loss": 0.2724, "step": 3782 }, { "epoch": 0.7233269598470363, "grad_norm": 2.139212131500244, "learning_rate": 5e-06, "loss": 0.236, "step": 3783 }, { "epoch": 0.7235181644359464, "grad_norm": 2.131075859069824, "learning_rate": 5e-06, "loss": 0.3092, "step": 3784 }, { "epoch": 0.7237093690248566, "grad_norm": 1.3767306804656982, "learning_rate": 5e-06, "loss": 0.1263, "step": 3785 }, { "epoch": 0.7239005736137667, "grad_norm": 1.0231573581695557, "learning_rate": 5e-06, "loss": 0.0584, "step": 3786 }, { "epoch": 0.7240917782026769, "grad_norm": 1.745872139930725, "learning_rate": 5e-06, "loss": 0.132, "step": 3787 }, { "epoch": 0.724282982791587, "grad_norm": 2.5864417552948, "learning_rate": 5e-06, "loss": 0.4019, "step": 3788 }, { "epoch": 0.7244741873804972, "grad_norm": 1.4365683794021606, "learning_rate": 5e-06, "loss": 0.089, "step": 3789 }, { "epoch": 0.7246653919694073, "grad_norm": 2.468015193939209, "learning_rate": 5e-06, "loss": 0.3741, "step": 3790 }, { "epoch": 0.7248565965583174, "grad_norm": 3.0470778942108154, "learning_rate": 5e-06, "loss": 0.4877, "step": 3791 }, { "epoch": 0.7250478011472276, "grad_norm": 1.820553183555603, "learning_rate": 5e-06, "loss": 0.1056, "step": 3792 }, { "epoch": 0.7252390057361376, "grad_norm": 1.5439107418060303, "learning_rate": 5e-06, "loss": 0.0992, "step": 3793 }, { "epoch": 0.7254302103250478, "grad_norm": 2.6746387481689453, "learning_rate": 5e-06, "loss": 0.1281, "step": 3794 }, { "epoch": 0.7256214149139579, "grad_norm": 2.123269557952881, "learning_rate": 5e-06, "loss": 0.2376, "step": 3795 }, { "epoch": 0.725812619502868, "grad_norm": 1.2966785430908203, "learning_rate": 5e-06, "loss": 0.107, "step": 3796 }, { "epoch": 0.7260038240917782, "grad_norm": 1.3482623100280762, "learning_rate": 5e-06, "loss": 0.1186, "step": 3797 }, { "epoch": 0.7261950286806883, "grad_norm": 1.8395178318023682, "learning_rate": 5e-06, "loss": 0.2555, "step": 3798 }, { "epoch": 0.7263862332695985, "grad_norm": 1.2841962575912476, "learning_rate": 5e-06, "loss": 0.0857, "step": 3799 }, { "epoch": 0.7265774378585086, "grad_norm": 2.664163827896118, "learning_rate": 5e-06, "loss": 0.1759, "step": 3800 }, { "epoch": 0.7267686424474188, "grad_norm": 2.564634323120117, "learning_rate": 5e-06, "loss": 0.4485, "step": 3801 }, { "epoch": 0.7269598470363289, "grad_norm": 2.6707558631896973, "learning_rate": 5e-06, "loss": 0.4055, "step": 3802 }, { "epoch": 0.727151051625239, "grad_norm": 1.4090027809143066, "learning_rate": 5e-06, "loss": 0.1511, "step": 3803 }, { "epoch": 0.7273422562141492, "grad_norm": 1.7918213605880737, "learning_rate": 5e-06, "loss": 0.1137, "step": 3804 }, { "epoch": 0.7275334608030593, "grad_norm": 1.4990943670272827, "learning_rate": 5e-06, "loss": 0.166, "step": 3805 }, { "epoch": 0.7277246653919694, "grad_norm": 2.7204136848449707, "learning_rate": 5e-06, "loss": 0.1131, "step": 3806 }, { "epoch": 0.7279158699808795, "grad_norm": 2.5874335765838623, "learning_rate": 5e-06, "loss": 0.1842, "step": 3807 }, { "epoch": 0.7281070745697896, "grad_norm": 2.7016761302948, "learning_rate": 5e-06, "loss": 0.3025, "step": 3808 }, { "epoch": 0.7282982791586998, "grad_norm": 1.1666982173919678, "learning_rate": 5e-06, "loss": 0.1359, "step": 3809 }, { "epoch": 0.7284894837476099, "grad_norm": 1.7149765491485596, "learning_rate": 5e-06, "loss": 0.0882, "step": 3810 }, { "epoch": 0.7286806883365201, "grad_norm": 2.1318461894989014, "learning_rate": 5e-06, "loss": 0.1363, "step": 3811 }, { "epoch": 0.7288718929254302, "grad_norm": 1.7442179918289185, "learning_rate": 5e-06, "loss": 0.1217, "step": 3812 }, { "epoch": 0.7290630975143404, "grad_norm": 2.2777628898620605, "learning_rate": 5e-06, "loss": 0.1651, "step": 3813 }, { "epoch": 0.7292543021032505, "grad_norm": 6.601057052612305, "learning_rate": 5e-06, "loss": 0.5386, "step": 3814 }, { "epoch": 0.7294455066921606, "grad_norm": 1.6855504512786865, "learning_rate": 5e-06, "loss": 0.1608, "step": 3815 }, { "epoch": 0.7296367112810708, "grad_norm": 1.998500943183899, "learning_rate": 5e-06, "loss": 0.2681, "step": 3816 }, { "epoch": 0.7298279158699809, "grad_norm": 1.0050654411315918, "learning_rate": 5e-06, "loss": 0.1253, "step": 3817 }, { "epoch": 0.7300191204588911, "grad_norm": 2.510183811187744, "learning_rate": 5e-06, "loss": 0.2464, "step": 3818 }, { "epoch": 0.7302103250478011, "grad_norm": 1.5420417785644531, "learning_rate": 5e-06, "loss": 0.0933, "step": 3819 }, { "epoch": 0.7304015296367112, "grad_norm": 2.9417569637298584, "learning_rate": 5e-06, "loss": 0.5287, "step": 3820 }, { "epoch": 0.7305927342256214, "grad_norm": 2.2762291431427, "learning_rate": 5e-06, "loss": 0.2584, "step": 3821 }, { "epoch": 0.7307839388145315, "grad_norm": 3.564465045928955, "learning_rate": 5e-06, "loss": 0.2454, "step": 3822 }, { "epoch": 0.7309751434034417, "grad_norm": 1.3632512092590332, "learning_rate": 5e-06, "loss": 0.1078, "step": 3823 }, { "epoch": 0.7311663479923518, "grad_norm": 2.418616771697998, "learning_rate": 5e-06, "loss": 0.1328, "step": 3824 }, { "epoch": 0.731357552581262, "grad_norm": 1.0035635232925415, "learning_rate": 5e-06, "loss": 0.0504, "step": 3825 }, { "epoch": 0.7315487571701721, "grad_norm": 2.8982577323913574, "learning_rate": 5e-06, "loss": 0.5397, "step": 3826 }, { "epoch": 0.7317399617590822, "grad_norm": 2.096374273300171, "learning_rate": 5e-06, "loss": 0.3442, "step": 3827 }, { "epoch": 0.7319311663479924, "grad_norm": 2.2000863552093506, "learning_rate": 5e-06, "loss": 0.1408, "step": 3828 }, { "epoch": 0.7321223709369025, "grad_norm": 2.5556576251983643, "learning_rate": 5e-06, "loss": 0.2146, "step": 3829 }, { "epoch": 0.7323135755258127, "grad_norm": 1.023587942123413, "learning_rate": 5e-06, "loss": 0.0599, "step": 3830 }, { "epoch": 0.7325047801147228, "grad_norm": 2.018570899963379, "learning_rate": 5e-06, "loss": 0.1337, "step": 3831 }, { "epoch": 0.7326959847036328, "grad_norm": 1.6819159984588623, "learning_rate": 5e-06, "loss": 0.1293, "step": 3832 }, { "epoch": 0.732887189292543, "grad_norm": 10.953661918640137, "learning_rate": 5e-06, "loss": 0.1749, "step": 3833 }, { "epoch": 0.7330783938814531, "grad_norm": 2.3553245067596436, "learning_rate": 5e-06, "loss": 0.1587, "step": 3834 }, { "epoch": 0.7332695984703633, "grad_norm": 3.415651798248291, "learning_rate": 5e-06, "loss": 0.6602, "step": 3835 }, { "epoch": 0.7334608030592734, "grad_norm": 1.051983118057251, "learning_rate": 5e-06, "loss": 0.0589, "step": 3836 }, { "epoch": 0.7336520076481835, "grad_norm": 1.977689504623413, "learning_rate": 5e-06, "loss": 0.1378, "step": 3837 }, { "epoch": 0.7338432122370937, "grad_norm": 2.170513391494751, "learning_rate": 5e-06, "loss": 0.2598, "step": 3838 }, { "epoch": 0.7340344168260038, "grad_norm": 2.0110361576080322, "learning_rate": 5e-06, "loss": 0.3082, "step": 3839 }, { "epoch": 0.734225621414914, "grad_norm": 1.711444616317749, "learning_rate": 5e-06, "loss": 0.159, "step": 3840 }, { "epoch": 0.7344168260038241, "grad_norm": 1.1422127485275269, "learning_rate": 5e-06, "loss": 0.1196, "step": 3841 }, { "epoch": 0.7346080305927343, "grad_norm": 1.628873586654663, "learning_rate": 5e-06, "loss": 0.0867, "step": 3842 }, { "epoch": 0.7347992351816444, "grad_norm": 2.754671335220337, "learning_rate": 5e-06, "loss": 0.2112, "step": 3843 }, { "epoch": 0.7349904397705544, "grad_norm": 2.050431728363037, "learning_rate": 5e-06, "loss": 0.2173, "step": 3844 }, { "epoch": 0.7351816443594646, "grad_norm": 1.6508783102035522, "learning_rate": 5e-06, "loss": 0.1271, "step": 3845 }, { "epoch": 0.7353728489483747, "grad_norm": 1.9832112789154053, "learning_rate": 5e-06, "loss": 0.2658, "step": 3846 }, { "epoch": 0.7355640535372849, "grad_norm": 1.4163705110549927, "learning_rate": 5e-06, "loss": 0.0685, "step": 3847 }, { "epoch": 0.735755258126195, "grad_norm": 2.1025025844573975, "learning_rate": 5e-06, "loss": 0.0973, "step": 3848 }, { "epoch": 0.7359464627151051, "grad_norm": 1.321691632270813, "learning_rate": 5e-06, "loss": 0.0648, "step": 3849 }, { "epoch": 0.7361376673040153, "grad_norm": 1.1205233335494995, "learning_rate": 5e-06, "loss": 0.0491, "step": 3850 }, { "epoch": 0.7363288718929254, "grad_norm": 1.8860313892364502, "learning_rate": 5e-06, "loss": 0.246, "step": 3851 }, { "epoch": 0.7365200764818356, "grad_norm": 2.046706438064575, "learning_rate": 5e-06, "loss": 0.3516, "step": 3852 }, { "epoch": 0.7367112810707457, "grad_norm": 1.3845429420471191, "learning_rate": 5e-06, "loss": 0.115, "step": 3853 }, { "epoch": 0.7369024856596559, "grad_norm": 1.796648383140564, "learning_rate": 5e-06, "loss": 0.1873, "step": 3854 }, { "epoch": 0.737093690248566, "grad_norm": 1.5792717933654785, "learning_rate": 5e-06, "loss": 0.2649, "step": 3855 }, { "epoch": 0.7372848948374761, "grad_norm": 1.4286028146743774, "learning_rate": 5e-06, "loss": 0.0831, "step": 3856 }, { "epoch": 0.7374760994263863, "grad_norm": 1.319799780845642, "learning_rate": 5e-06, "loss": 0.1116, "step": 3857 }, { "epoch": 0.7376673040152963, "grad_norm": 1.7097197771072388, "learning_rate": 5e-06, "loss": 0.1823, "step": 3858 }, { "epoch": 0.7378585086042065, "grad_norm": 2.7568390369415283, "learning_rate": 5e-06, "loss": 0.1829, "step": 3859 }, { "epoch": 0.7380497131931166, "grad_norm": 2.267392873764038, "learning_rate": 5e-06, "loss": 0.3423, "step": 3860 }, { "epoch": 0.7382409177820267, "grad_norm": 1.5888205766677856, "learning_rate": 5e-06, "loss": 0.1232, "step": 3861 }, { "epoch": 0.7384321223709369, "grad_norm": 0.9303539991378784, "learning_rate": 5e-06, "loss": 0.0497, "step": 3862 }, { "epoch": 0.738623326959847, "grad_norm": 3.3895652294158936, "learning_rate": 5e-06, "loss": 0.1808, "step": 3863 }, { "epoch": 0.7388145315487572, "grad_norm": 3.354686975479126, "learning_rate": 5e-06, "loss": 0.3795, "step": 3864 }, { "epoch": 0.7390057361376673, "grad_norm": 2.8291988372802734, "learning_rate": 5e-06, "loss": 0.1875, "step": 3865 }, { "epoch": 0.7391969407265775, "grad_norm": 1.4242098331451416, "learning_rate": 5e-06, "loss": 0.1156, "step": 3866 }, { "epoch": 0.7393881453154876, "grad_norm": 0.8613669276237488, "learning_rate": 5e-06, "loss": 0.0874, "step": 3867 }, { "epoch": 0.7395793499043977, "grad_norm": 1.4547524452209473, "learning_rate": 5e-06, "loss": 0.1247, "step": 3868 }, { "epoch": 0.7397705544933079, "grad_norm": 1.1557694673538208, "learning_rate": 5e-06, "loss": 0.0494, "step": 3869 }, { "epoch": 0.739961759082218, "grad_norm": 2.8671276569366455, "learning_rate": 5e-06, "loss": 0.3717, "step": 3870 }, { "epoch": 0.7401529636711282, "grad_norm": 1.4811517000198364, "learning_rate": 5e-06, "loss": 0.1741, "step": 3871 }, { "epoch": 0.7403441682600382, "grad_norm": 1.3373982906341553, "learning_rate": 5e-06, "loss": 0.1384, "step": 3872 }, { "epoch": 0.7405353728489483, "grad_norm": 0.8785936832427979, "learning_rate": 5e-06, "loss": 0.1139, "step": 3873 }, { "epoch": 0.7407265774378585, "grad_norm": 0.9371583461761475, "learning_rate": 5e-06, "loss": 0.0445, "step": 3874 }, { "epoch": 0.7409177820267686, "grad_norm": 1.681098222732544, "learning_rate": 5e-06, "loss": 0.1076, "step": 3875 }, { "epoch": 0.7411089866156788, "grad_norm": 2.265328884124756, "learning_rate": 5e-06, "loss": 0.4473, "step": 3876 }, { "epoch": 0.7413001912045889, "grad_norm": 1.790353536605835, "learning_rate": 5e-06, "loss": 0.0827, "step": 3877 }, { "epoch": 0.7414913957934991, "grad_norm": 2.781074047088623, "learning_rate": 5e-06, "loss": 0.3687, "step": 3878 }, { "epoch": 0.7416826003824092, "grad_norm": 1.5163137912750244, "learning_rate": 5e-06, "loss": 0.0667, "step": 3879 }, { "epoch": 0.7418738049713193, "grad_norm": 1.391534686088562, "learning_rate": 5e-06, "loss": 0.0774, "step": 3880 }, { "epoch": 0.7420650095602295, "grad_norm": 1.9660178422927856, "learning_rate": 5e-06, "loss": 0.1352, "step": 3881 }, { "epoch": 0.7422562141491396, "grad_norm": 1.3986618518829346, "learning_rate": 5e-06, "loss": 0.1077, "step": 3882 }, { "epoch": 0.7424474187380498, "grad_norm": 2.422389507293701, "learning_rate": 5e-06, "loss": 0.3266, "step": 3883 }, { "epoch": 0.7426386233269598, "grad_norm": 1.2864700555801392, "learning_rate": 5e-06, "loss": 0.2191, "step": 3884 }, { "epoch": 0.7428298279158699, "grad_norm": 1.7133296728134155, "learning_rate": 5e-06, "loss": 0.1452, "step": 3885 }, { "epoch": 0.7430210325047801, "grad_norm": 2.243800163269043, "learning_rate": 5e-06, "loss": 0.1434, "step": 3886 }, { "epoch": 0.7432122370936902, "grad_norm": 2.1287524700164795, "learning_rate": 5e-06, "loss": 0.3237, "step": 3887 }, { "epoch": 0.7434034416826004, "grad_norm": 3.481367588043213, "learning_rate": 5e-06, "loss": 0.5053, "step": 3888 }, { "epoch": 0.7435946462715105, "grad_norm": 2.098524332046509, "learning_rate": 5e-06, "loss": 0.2776, "step": 3889 }, { "epoch": 0.7437858508604207, "grad_norm": 0.9846514463424683, "learning_rate": 5e-06, "loss": 0.0791, "step": 3890 }, { "epoch": 0.7439770554493308, "grad_norm": 1.8553352355957031, "learning_rate": 5e-06, "loss": 0.2831, "step": 3891 }, { "epoch": 0.7441682600382409, "grad_norm": 2.703209638595581, "learning_rate": 5e-06, "loss": 0.1956, "step": 3892 }, { "epoch": 0.7443594646271511, "grad_norm": 0.9680504202842712, "learning_rate": 5e-06, "loss": 0.0764, "step": 3893 }, { "epoch": 0.7445506692160612, "grad_norm": 1.8443779945373535, "learning_rate": 5e-06, "loss": 0.2195, "step": 3894 }, { "epoch": 0.7447418738049714, "grad_norm": 2.6461219787597656, "learning_rate": 5e-06, "loss": 0.4909, "step": 3895 }, { "epoch": 0.7449330783938815, "grad_norm": 1.3912622928619385, "learning_rate": 5e-06, "loss": 0.1268, "step": 3896 }, { "epoch": 0.7451242829827915, "grad_norm": 1.7276599407196045, "learning_rate": 5e-06, "loss": 0.226, "step": 3897 }, { "epoch": 0.7453154875717017, "grad_norm": 2.662609815597534, "learning_rate": 5e-06, "loss": 0.1942, "step": 3898 }, { "epoch": 0.7455066921606118, "grad_norm": 1.6807820796966553, "learning_rate": 5e-06, "loss": 0.1768, "step": 3899 }, { "epoch": 0.745697896749522, "grad_norm": 1.3572360277175903, "learning_rate": 5e-06, "loss": 0.0603, "step": 3900 }, { "epoch": 0.7458891013384321, "grad_norm": 1.2281373739242554, "learning_rate": 5e-06, "loss": 0.1086, "step": 3901 }, { "epoch": 0.7460803059273422, "grad_norm": 1.9232803583145142, "learning_rate": 5e-06, "loss": 0.2239, "step": 3902 }, { "epoch": 0.7462715105162524, "grad_norm": 2.2120046615600586, "learning_rate": 5e-06, "loss": 0.2476, "step": 3903 }, { "epoch": 0.7464627151051625, "grad_norm": 2.8236958980560303, "learning_rate": 5e-06, "loss": 0.5103, "step": 3904 }, { "epoch": 0.7466539196940727, "grad_norm": 2.782604694366455, "learning_rate": 5e-06, "loss": 0.3232, "step": 3905 }, { "epoch": 0.7468451242829828, "grad_norm": 1.3832602500915527, "learning_rate": 5e-06, "loss": 0.15, "step": 3906 }, { "epoch": 0.747036328871893, "grad_norm": 3.4751503467559814, "learning_rate": 5e-06, "loss": 0.3804, "step": 3907 }, { "epoch": 0.7472275334608031, "grad_norm": 1.4347681999206543, "learning_rate": 5e-06, "loss": 0.1287, "step": 3908 }, { "epoch": 0.7474187380497131, "grad_norm": 1.5916887521743774, "learning_rate": 5e-06, "loss": 0.2899, "step": 3909 }, { "epoch": 0.7476099426386233, "grad_norm": 1.5356292724609375, "learning_rate": 5e-06, "loss": 0.2278, "step": 3910 }, { "epoch": 0.7478011472275334, "grad_norm": 1.4358553886413574, "learning_rate": 5e-06, "loss": 0.0868, "step": 3911 }, { "epoch": 0.7479923518164436, "grad_norm": 2.5048086643218994, "learning_rate": 5e-06, "loss": 0.227, "step": 3912 }, { "epoch": 0.7481835564053537, "grad_norm": 1.7658495903015137, "learning_rate": 5e-06, "loss": 0.1685, "step": 3913 }, { "epoch": 0.7483747609942638, "grad_norm": 2.723329782485962, "learning_rate": 5e-06, "loss": 0.3353, "step": 3914 }, { "epoch": 0.748565965583174, "grad_norm": 2.999716281890869, "learning_rate": 5e-06, "loss": 0.3282, "step": 3915 }, { "epoch": 0.7487571701720841, "grad_norm": 3.776826858520508, "learning_rate": 5e-06, "loss": 0.2084, "step": 3916 }, { "epoch": 0.7489483747609943, "grad_norm": 1.5918197631835938, "learning_rate": 5e-06, "loss": 0.1376, "step": 3917 }, { "epoch": 0.7491395793499044, "grad_norm": 1.6553537845611572, "learning_rate": 5e-06, "loss": 0.1036, "step": 3918 }, { "epoch": 0.7493307839388146, "grad_norm": 1.9501301050186157, "learning_rate": 5e-06, "loss": 0.1137, "step": 3919 }, { "epoch": 0.7495219885277247, "grad_norm": 3.1001572608947754, "learning_rate": 5e-06, "loss": 0.7937, "step": 3920 }, { "epoch": 0.7497131931166348, "grad_norm": 1.8700470924377441, "learning_rate": 5e-06, "loss": 0.1598, "step": 3921 }, { "epoch": 0.749904397705545, "grad_norm": 1.2993032932281494, "learning_rate": 5e-06, "loss": 0.1429, "step": 3922 }, { "epoch": 0.750095602294455, "grad_norm": 2.387141466140747, "learning_rate": 5e-06, "loss": 0.1336, "step": 3923 }, { "epoch": 0.7502868068833652, "grad_norm": 1.0744256973266602, "learning_rate": 5e-06, "loss": 0.0685, "step": 3924 }, { "epoch": 0.7504780114722753, "grad_norm": 3.1105291843414307, "learning_rate": 5e-06, "loss": 0.2699, "step": 3925 }, { "epoch": 0.7506692160611854, "grad_norm": 3.3172290325164795, "learning_rate": 5e-06, "loss": 0.6928, "step": 3926 }, { "epoch": 0.7508604206500956, "grad_norm": 1.4145816564559937, "learning_rate": 5e-06, "loss": 0.1114, "step": 3927 }, { "epoch": 0.7510516252390057, "grad_norm": 2.017582893371582, "learning_rate": 5e-06, "loss": 0.3379, "step": 3928 }, { "epoch": 0.7512428298279159, "grad_norm": 2.438030481338501, "learning_rate": 5e-06, "loss": 0.273, "step": 3929 }, { "epoch": 0.751434034416826, "grad_norm": 2.810269832611084, "learning_rate": 5e-06, "loss": 0.1784, "step": 3930 }, { "epoch": 0.7516252390057362, "grad_norm": 2.189603328704834, "learning_rate": 5e-06, "loss": 0.1627, "step": 3931 }, { "epoch": 0.7518164435946463, "grad_norm": 1.3899329900741577, "learning_rate": 5e-06, "loss": 0.1443, "step": 3932 }, { "epoch": 0.7520076481835564, "grad_norm": 2.109938144683838, "learning_rate": 5e-06, "loss": 0.1432, "step": 3933 }, { "epoch": 0.7521988527724666, "grad_norm": 2.4114205837249756, "learning_rate": 5e-06, "loss": 0.3787, "step": 3934 }, { "epoch": 0.7523900573613767, "grad_norm": 1.5396156311035156, "learning_rate": 5e-06, "loss": 0.1456, "step": 3935 }, { "epoch": 0.7525812619502869, "grad_norm": 1.966928482055664, "learning_rate": 5e-06, "loss": 0.2534, "step": 3936 }, { "epoch": 0.7527724665391969, "grad_norm": 1.3008625507354736, "learning_rate": 5e-06, "loss": 0.0839, "step": 3937 }, { "epoch": 0.752963671128107, "grad_norm": 3.0656418800354004, "learning_rate": 5e-06, "loss": 0.3491, "step": 3938 }, { "epoch": 0.7531548757170172, "grad_norm": 2.63382887840271, "learning_rate": 5e-06, "loss": 0.3588, "step": 3939 }, { "epoch": 0.7533460803059273, "grad_norm": 1.6612306833267212, "learning_rate": 5e-06, "loss": 0.1236, "step": 3940 }, { "epoch": 0.7535372848948375, "grad_norm": 2.234178304672241, "learning_rate": 5e-06, "loss": 0.1589, "step": 3941 }, { "epoch": 0.7537284894837476, "grad_norm": 3.02970814704895, "learning_rate": 5e-06, "loss": 0.4124, "step": 3942 }, { "epoch": 0.7539196940726578, "grad_norm": 2.506859540939331, "learning_rate": 5e-06, "loss": 0.2223, "step": 3943 }, { "epoch": 0.7541108986615679, "grad_norm": 1.3613660335540771, "learning_rate": 5e-06, "loss": 0.1406, "step": 3944 }, { "epoch": 0.754302103250478, "grad_norm": 2.071458578109741, "learning_rate": 5e-06, "loss": 0.2519, "step": 3945 }, { "epoch": 0.7544933078393882, "grad_norm": 1.1499487161636353, "learning_rate": 5e-06, "loss": 0.1091, "step": 3946 }, { "epoch": 0.7546845124282983, "grad_norm": 2.7436764240264893, "learning_rate": 5e-06, "loss": 0.1509, "step": 3947 }, { "epoch": 0.7548757170172085, "grad_norm": 2.140192985534668, "learning_rate": 5e-06, "loss": 0.2268, "step": 3948 }, { "epoch": 0.7550669216061185, "grad_norm": 1.5796760320663452, "learning_rate": 5e-06, "loss": 0.113, "step": 3949 }, { "epoch": 0.7552581261950286, "grad_norm": 1.0383970737457275, "learning_rate": 5e-06, "loss": 0.0439, "step": 3950 }, { "epoch": 0.7554493307839388, "grad_norm": 2.50980281829834, "learning_rate": 5e-06, "loss": 0.4777, "step": 3951 }, { "epoch": 0.7556405353728489, "grad_norm": 1.7971466779708862, "learning_rate": 5e-06, "loss": 0.1112, "step": 3952 }, { "epoch": 0.7558317399617591, "grad_norm": 1.9320497512817383, "learning_rate": 5e-06, "loss": 0.1991, "step": 3953 }, { "epoch": 0.7560229445506692, "grad_norm": 1.246254324913025, "learning_rate": 5e-06, "loss": 0.0755, "step": 3954 }, { "epoch": 0.7562141491395793, "grad_norm": 1.7969180345535278, "learning_rate": 5e-06, "loss": 0.1124, "step": 3955 }, { "epoch": 0.7564053537284895, "grad_norm": 1.317395567893982, "learning_rate": 5e-06, "loss": 0.0401, "step": 3956 }, { "epoch": 0.7565965583173996, "grad_norm": 1.8792531490325928, "learning_rate": 5e-06, "loss": 0.308, "step": 3957 }, { "epoch": 0.7567877629063098, "grad_norm": 1.601319670677185, "learning_rate": 5e-06, "loss": 0.1683, "step": 3958 }, { "epoch": 0.7569789674952199, "grad_norm": 1.1691665649414062, "learning_rate": 5e-06, "loss": 0.0809, "step": 3959 }, { "epoch": 0.7571701720841301, "grad_norm": 1.8878777027130127, "learning_rate": 5e-06, "loss": 0.1181, "step": 3960 }, { "epoch": 0.7573613766730402, "grad_norm": 1.6922614574432373, "learning_rate": 5e-06, "loss": 0.1016, "step": 3961 }, { "epoch": 0.7575525812619502, "grad_norm": 2.844416379928589, "learning_rate": 5e-06, "loss": 0.1851, "step": 3962 }, { "epoch": 0.7577437858508604, "grad_norm": 2.265589952468872, "learning_rate": 5e-06, "loss": 0.2272, "step": 3963 }, { "epoch": 0.7579349904397705, "grad_norm": 2.574376344680786, "learning_rate": 5e-06, "loss": 0.4007, "step": 3964 }, { "epoch": 0.7581261950286807, "grad_norm": 2.0190422534942627, "learning_rate": 5e-06, "loss": 0.2131, "step": 3965 }, { "epoch": 0.7583173996175908, "grad_norm": 1.53504478931427, "learning_rate": 5e-06, "loss": 0.1167, "step": 3966 }, { "epoch": 0.7585086042065009, "grad_norm": 1.4824988842010498, "learning_rate": 5e-06, "loss": 0.16, "step": 3967 }, { "epoch": 0.7586998087954111, "grad_norm": 2.1277225017547607, "learning_rate": 5e-06, "loss": 0.15, "step": 3968 }, { "epoch": 0.7588910133843212, "grad_norm": 0.9543938636779785, "learning_rate": 5e-06, "loss": 0.0364, "step": 3969 }, { "epoch": 0.7590822179732314, "grad_norm": 3.176586151123047, "learning_rate": 5e-06, "loss": 0.5473, "step": 3970 }, { "epoch": 0.7592734225621415, "grad_norm": 1.5333970785140991, "learning_rate": 5e-06, "loss": 0.1819, "step": 3971 }, { "epoch": 0.7594646271510517, "grad_norm": 1.2855944633483887, "learning_rate": 5e-06, "loss": 0.1069, "step": 3972 }, { "epoch": 0.7596558317399618, "grad_norm": 1.5741631984710693, "learning_rate": 5e-06, "loss": 0.0947, "step": 3973 }, { "epoch": 0.7598470363288718, "grad_norm": 1.533542513847351, "learning_rate": 5e-06, "loss": 0.1809, "step": 3974 }, { "epoch": 0.760038240917782, "grad_norm": 2.86518931388855, "learning_rate": 5e-06, "loss": 0.2103, "step": 3975 }, { "epoch": 0.7602294455066921, "grad_norm": 2.7985222339630127, "learning_rate": 5e-06, "loss": 0.2571, "step": 3976 }, { "epoch": 0.7604206500956023, "grad_norm": 2.5995070934295654, "learning_rate": 5e-06, "loss": 0.4758, "step": 3977 }, { "epoch": 0.7606118546845124, "grad_norm": 1.5301446914672852, "learning_rate": 5e-06, "loss": 0.1074, "step": 3978 }, { "epoch": 0.7608030592734225, "grad_norm": 0.876580536365509, "learning_rate": 5e-06, "loss": 0.0367, "step": 3979 }, { "epoch": 0.7609942638623327, "grad_norm": 1.7435474395751953, "learning_rate": 5e-06, "loss": 0.1279, "step": 3980 }, { "epoch": 0.7611854684512428, "grad_norm": 1.7620370388031006, "learning_rate": 5e-06, "loss": 0.1008, "step": 3981 }, { "epoch": 0.761376673040153, "grad_norm": 1.7652002573013306, "learning_rate": 5e-06, "loss": 0.1108, "step": 3982 }, { "epoch": 0.7615678776290631, "grad_norm": 3.072636127471924, "learning_rate": 5e-06, "loss": 0.4287, "step": 3983 }, { "epoch": 0.7617590822179733, "grad_norm": 2.880697727203369, "learning_rate": 5e-06, "loss": 0.3342, "step": 3984 }, { "epoch": 0.7619502868068834, "grad_norm": 1.7110553979873657, "learning_rate": 5e-06, "loss": 0.1209, "step": 3985 }, { "epoch": 0.7621414913957935, "grad_norm": 1.7476038932800293, "learning_rate": 5e-06, "loss": 0.0955, "step": 3986 }, { "epoch": 0.7623326959847037, "grad_norm": 2.2482314109802246, "learning_rate": 5e-06, "loss": 0.1621, "step": 3987 }, { "epoch": 0.7625239005736137, "grad_norm": 1.754885196685791, "learning_rate": 5e-06, "loss": 0.1183, "step": 3988 }, { "epoch": 0.7627151051625239, "grad_norm": 2.0199167728424072, "learning_rate": 5e-06, "loss": 0.2003, "step": 3989 }, { "epoch": 0.762906309751434, "grad_norm": 2.1370720863342285, "learning_rate": 5e-06, "loss": 0.2559, "step": 3990 }, { "epoch": 0.7630975143403441, "grad_norm": 1.4818934202194214, "learning_rate": 5e-06, "loss": 0.119, "step": 3991 }, { "epoch": 0.7632887189292543, "grad_norm": 2.2542731761932373, "learning_rate": 5e-06, "loss": 0.1821, "step": 3992 }, { "epoch": 0.7634799235181644, "grad_norm": 2.6181223392486572, "learning_rate": 5e-06, "loss": 0.2197, "step": 3993 }, { "epoch": 0.7636711281070746, "grad_norm": 2.795283317565918, "learning_rate": 5e-06, "loss": 0.2661, "step": 3994 }, { "epoch": 0.7638623326959847, "grad_norm": 4.159665107727051, "learning_rate": 5e-06, "loss": 0.8165, "step": 3995 }, { "epoch": 0.7640535372848949, "grad_norm": 1.8209826946258545, "learning_rate": 5e-06, "loss": 0.221, "step": 3996 }, { "epoch": 0.764244741873805, "grad_norm": 1.490784764289856, "learning_rate": 5e-06, "loss": 0.1202, "step": 3997 }, { "epoch": 0.7644359464627151, "grad_norm": 1.8983914852142334, "learning_rate": 5e-06, "loss": 0.2311, "step": 3998 }, { "epoch": 0.7646271510516253, "grad_norm": 2.7288661003112793, "learning_rate": 5e-06, "loss": 0.1842, "step": 3999 }, { "epoch": 0.7648183556405354, "grad_norm": 1.5600886344909668, "learning_rate": 5e-06, "loss": 0.0708, "step": 4000 }, { "epoch": 0.7648183556405354, "eval_runtime": 831.7494, "eval_samples_per_second": 1.844, "eval_steps_per_second": 0.231, "step": 4000 }, { "epoch": 0.7650095602294456, "grad_norm": 1.5232917070388794, "learning_rate": 5e-06, "loss": 0.1157, "step": 4001 }, { "epoch": 0.7652007648183556, "grad_norm": 1.4118472337722778, "learning_rate": 5e-06, "loss": 0.1306, "step": 4002 }, { "epoch": 0.7653919694072657, "grad_norm": 2.754512310028076, "learning_rate": 5e-06, "loss": 0.3549, "step": 4003 }, { "epoch": 0.7655831739961759, "grad_norm": 1.406946063041687, "learning_rate": 5e-06, "loss": 0.136, "step": 4004 }, { "epoch": 0.765774378585086, "grad_norm": 2.2281954288482666, "learning_rate": 5e-06, "loss": 0.0896, "step": 4005 }, { "epoch": 0.7659655831739962, "grad_norm": 1.819032907485962, "learning_rate": 5e-06, "loss": 0.1981, "step": 4006 }, { "epoch": 0.7661567877629063, "grad_norm": 2.1007330417633057, "learning_rate": 5e-06, "loss": 0.3524, "step": 4007 }, { "epoch": 0.7663479923518165, "grad_norm": 2.4521172046661377, "learning_rate": 5e-06, "loss": 0.4039, "step": 4008 }, { "epoch": 0.7665391969407266, "grad_norm": 2.598778247833252, "learning_rate": 5e-06, "loss": 0.2918, "step": 4009 }, { "epoch": 0.7667304015296367, "grad_norm": 2.3652114868164062, "learning_rate": 5e-06, "loss": 0.2555, "step": 4010 }, { "epoch": 0.7669216061185469, "grad_norm": 1.7355620861053467, "learning_rate": 5e-06, "loss": 0.1323, "step": 4011 }, { "epoch": 0.767112810707457, "grad_norm": 3.2190678119659424, "learning_rate": 5e-06, "loss": 0.1842, "step": 4012 }, { "epoch": 0.7673040152963672, "grad_norm": 2.945258617401123, "learning_rate": 5e-06, "loss": 0.261, "step": 4013 }, { "epoch": 0.7674952198852772, "grad_norm": 4.270470142364502, "learning_rate": 5e-06, "loss": 0.7201, "step": 4014 }, { "epoch": 0.7676864244741873, "grad_norm": 1.7441529035568237, "learning_rate": 5e-06, "loss": 0.1405, "step": 4015 }, { "epoch": 0.7678776290630975, "grad_norm": 1.0526031255722046, "learning_rate": 5e-06, "loss": 0.1045, "step": 4016 }, { "epoch": 0.7680688336520076, "grad_norm": 2.0954020023345947, "learning_rate": 5e-06, "loss": 0.2734, "step": 4017 }, { "epoch": 0.7682600382409178, "grad_norm": 1.8673603534698486, "learning_rate": 5e-06, "loss": 0.1173, "step": 4018 }, { "epoch": 0.7684512428298279, "grad_norm": 1.6339843273162842, "learning_rate": 5e-06, "loss": 0.1095, "step": 4019 }, { "epoch": 0.768642447418738, "grad_norm": 1.8186370134353638, "learning_rate": 5e-06, "loss": 0.2133, "step": 4020 }, { "epoch": 0.7688336520076482, "grad_norm": 2.424779176712036, "learning_rate": 5e-06, "loss": 0.2374, "step": 4021 }, { "epoch": 0.7690248565965583, "grad_norm": 1.2409683465957642, "learning_rate": 5e-06, "loss": 0.0816, "step": 4022 }, { "epoch": 0.7692160611854685, "grad_norm": 2.5901854038238525, "learning_rate": 5e-06, "loss": 0.134, "step": 4023 }, { "epoch": 0.7694072657743786, "grad_norm": 1.4344134330749512, "learning_rate": 5e-06, "loss": 0.1287, "step": 4024 }, { "epoch": 0.7695984703632888, "grad_norm": 1.9438978433609009, "learning_rate": 5e-06, "loss": 0.0898, "step": 4025 }, { "epoch": 0.7697896749521989, "grad_norm": 3.3850362300872803, "learning_rate": 5e-06, "loss": 0.6324, "step": 4026 }, { "epoch": 0.7699808795411089, "grad_norm": 1.7402697801589966, "learning_rate": 5e-06, "loss": 0.2097, "step": 4027 }, { "epoch": 0.7701720841300191, "grad_norm": 1.9264459609985352, "learning_rate": 5e-06, "loss": 0.14, "step": 4028 }, { "epoch": 0.7703632887189292, "grad_norm": 1.2523527145385742, "learning_rate": 5e-06, "loss": 0.0865, "step": 4029 }, { "epoch": 0.7705544933078394, "grad_norm": 2.009114980697632, "learning_rate": 5e-06, "loss": 0.1727, "step": 4030 }, { "epoch": 0.7707456978967495, "grad_norm": 1.3284012079238892, "learning_rate": 5e-06, "loss": 0.0669, "step": 4031 }, { "epoch": 0.7709369024856596, "grad_norm": 3.460033893585205, "learning_rate": 5e-06, "loss": 0.653, "step": 4032 }, { "epoch": 0.7711281070745698, "grad_norm": 1.1250536441802979, "learning_rate": 5e-06, "loss": 0.1061, "step": 4033 }, { "epoch": 0.7713193116634799, "grad_norm": 2.0778872966766357, "learning_rate": 5e-06, "loss": 0.4068, "step": 4034 }, { "epoch": 0.7715105162523901, "grad_norm": 2.448514461517334, "learning_rate": 5e-06, "loss": 0.1578, "step": 4035 }, { "epoch": 0.7717017208413002, "grad_norm": 1.0364723205566406, "learning_rate": 5e-06, "loss": 0.0746, "step": 4036 }, { "epoch": 0.7718929254302104, "grad_norm": 4.103033065795898, "learning_rate": 5e-06, "loss": 0.3191, "step": 4037 }, { "epoch": 0.7720841300191205, "grad_norm": 2.208747148513794, "learning_rate": 5e-06, "loss": 0.2741, "step": 4038 }, { "epoch": 0.7722753346080306, "grad_norm": 3.062417984008789, "learning_rate": 5e-06, "loss": 0.6122, "step": 4039 }, { "epoch": 0.7724665391969407, "grad_norm": 1.5257611274719238, "learning_rate": 5e-06, "loss": 0.1512, "step": 4040 }, { "epoch": 0.7726577437858508, "grad_norm": 1.7130473852157593, "learning_rate": 5e-06, "loss": 0.1445, "step": 4041 }, { "epoch": 0.772848948374761, "grad_norm": 2.802297353744507, "learning_rate": 5e-06, "loss": 0.0933, "step": 4042 }, { "epoch": 0.7730401529636711, "grad_norm": 1.7665363550186157, "learning_rate": 5e-06, "loss": 0.1321, "step": 4043 }, { "epoch": 0.7732313575525812, "grad_norm": 1.023848533630371, "learning_rate": 5e-06, "loss": 0.0549, "step": 4044 }, { "epoch": 0.7734225621414914, "grad_norm": 2.230396270751953, "learning_rate": 5e-06, "loss": 0.3784, "step": 4045 }, { "epoch": 0.7736137667304015, "grad_norm": 1.3800081014633179, "learning_rate": 5e-06, "loss": 0.1241, "step": 4046 }, { "epoch": 0.7738049713193117, "grad_norm": 2.2208352088928223, "learning_rate": 5e-06, "loss": 0.2697, "step": 4047 }, { "epoch": 0.7739961759082218, "grad_norm": 2.5575742721557617, "learning_rate": 5e-06, "loss": 0.2564, "step": 4048 }, { "epoch": 0.774187380497132, "grad_norm": 1.4058862924575806, "learning_rate": 5e-06, "loss": 0.0901, "step": 4049 }, { "epoch": 0.7743785850860421, "grad_norm": 2.812058687210083, "learning_rate": 5e-06, "loss": 0.1837, "step": 4050 }, { "epoch": 0.7745697896749522, "grad_norm": 2.573519706726074, "learning_rate": 5e-06, "loss": 0.483, "step": 4051 }, { "epoch": 0.7747609942638624, "grad_norm": 3.118896961212158, "learning_rate": 5e-06, "loss": 0.3858, "step": 4052 }, { "epoch": 0.7749521988527724, "grad_norm": 1.3650435209274292, "learning_rate": 5e-06, "loss": 0.0917, "step": 4053 }, { "epoch": 0.7751434034416826, "grad_norm": 3.2142133712768555, "learning_rate": 5e-06, "loss": 0.1071, "step": 4054 }, { "epoch": 0.7753346080305927, "grad_norm": 1.4311920404434204, "learning_rate": 5e-06, "loss": 0.095, "step": 4055 }, { "epoch": 0.7755258126195028, "grad_norm": 1.5923914909362793, "learning_rate": 5e-06, "loss": 0.0758, "step": 4056 }, { "epoch": 0.775717017208413, "grad_norm": 1.7863245010375977, "learning_rate": 5e-06, "loss": 0.1589, "step": 4057 }, { "epoch": 0.7759082217973231, "grad_norm": 2.2520720958709717, "learning_rate": 5e-06, "loss": 0.2503, "step": 4058 }, { "epoch": 0.7760994263862333, "grad_norm": 3.211054563522339, "learning_rate": 5e-06, "loss": 0.6649, "step": 4059 }, { "epoch": 0.7762906309751434, "grad_norm": 0.8572031855583191, "learning_rate": 5e-06, "loss": 0.0593, "step": 4060 }, { "epoch": 0.7764818355640536, "grad_norm": 1.4232878684997559, "learning_rate": 5e-06, "loss": 0.0751, "step": 4061 }, { "epoch": 0.7766730401529637, "grad_norm": 1.295220971107483, "learning_rate": 5e-06, "loss": 0.0767, "step": 4062 }, { "epoch": 0.7768642447418738, "grad_norm": 2.2189230918884277, "learning_rate": 5e-06, "loss": 0.1053, "step": 4063 }, { "epoch": 0.777055449330784, "grad_norm": 2.4356725215911865, "learning_rate": 5e-06, "loss": 0.3199, "step": 4064 }, { "epoch": 0.777246653919694, "grad_norm": 1.6857635974884033, "learning_rate": 5e-06, "loss": 0.1233, "step": 4065 }, { "epoch": 0.7774378585086043, "grad_norm": 2.9625089168548584, "learning_rate": 5e-06, "loss": 0.3591, "step": 4066 }, { "epoch": 0.7776290630975143, "grad_norm": 1.5815578699111938, "learning_rate": 5e-06, "loss": 0.0995, "step": 4067 }, { "epoch": 0.7778202676864244, "grad_norm": 1.3760156631469727, "learning_rate": 5e-06, "loss": 0.1136, "step": 4068 }, { "epoch": 0.7780114722753346, "grad_norm": 3.329010486602783, "learning_rate": 5e-06, "loss": 0.167, "step": 4069 }, { "epoch": 0.7782026768642447, "grad_norm": 2.279073715209961, "learning_rate": 5e-06, "loss": 0.3059, "step": 4070 }, { "epoch": 0.7783938814531549, "grad_norm": 1.5212337970733643, "learning_rate": 5e-06, "loss": 0.1047, "step": 4071 }, { "epoch": 0.778585086042065, "grad_norm": 1.8684823513031006, "learning_rate": 5e-06, "loss": 0.1369, "step": 4072 }, { "epoch": 0.7787762906309752, "grad_norm": 1.487801432609558, "learning_rate": 5e-06, "loss": 0.0738, "step": 4073 }, { "epoch": 0.7789674952198853, "grad_norm": 2.3601162433624268, "learning_rate": 5e-06, "loss": 0.0753, "step": 4074 }, { "epoch": 0.7791586998087954, "grad_norm": 1.5325002670288086, "learning_rate": 5e-06, "loss": 0.0589, "step": 4075 }, { "epoch": 0.7793499043977056, "grad_norm": 2.376420497894287, "learning_rate": 5e-06, "loss": 0.3178, "step": 4076 }, { "epoch": 0.7795411089866157, "grad_norm": 1.3300342559814453, "learning_rate": 5e-06, "loss": 0.1267, "step": 4077 }, { "epoch": 0.7797323135755259, "grad_norm": 1.2323758602142334, "learning_rate": 5e-06, "loss": 0.1096, "step": 4078 }, { "epoch": 0.779923518164436, "grad_norm": 2.7862613201141357, "learning_rate": 5e-06, "loss": 0.4678, "step": 4079 }, { "epoch": 0.780114722753346, "grad_norm": 1.2939293384552002, "learning_rate": 5e-06, "loss": 0.0784, "step": 4080 }, { "epoch": 0.7803059273422562, "grad_norm": 2.031770944595337, "learning_rate": 5e-06, "loss": 0.076, "step": 4081 }, { "epoch": 0.7804971319311663, "grad_norm": 1.987504243850708, "learning_rate": 5e-06, "loss": 0.2646, "step": 4082 }, { "epoch": 0.7806883365200765, "grad_norm": 3.281132936477661, "learning_rate": 5e-06, "loss": 0.4443, "step": 4083 }, { "epoch": 0.7808795411089866, "grad_norm": 1.6435940265655518, "learning_rate": 5e-06, "loss": 0.1493, "step": 4084 }, { "epoch": 0.7810707456978967, "grad_norm": 2.0128467082977295, "learning_rate": 5e-06, "loss": 0.1378, "step": 4085 }, { "epoch": 0.7812619502868069, "grad_norm": 1.966070532798767, "learning_rate": 5e-06, "loss": 0.1004, "step": 4086 }, { "epoch": 0.781453154875717, "grad_norm": 2.714599370956421, "learning_rate": 5e-06, "loss": 0.1898, "step": 4087 }, { "epoch": 0.7816443594646272, "grad_norm": 2.605646848678589, "learning_rate": 5e-06, "loss": 0.5377, "step": 4088 }, { "epoch": 0.7818355640535373, "grad_norm": 2.5793659687042236, "learning_rate": 5e-06, "loss": 0.3054, "step": 4089 }, { "epoch": 0.7820267686424475, "grad_norm": 1.0576157569885254, "learning_rate": 5e-06, "loss": 0.1205, "step": 4090 }, { "epoch": 0.7822179732313576, "grad_norm": 2.084843397140503, "learning_rate": 5e-06, "loss": 0.1685, "step": 4091 }, { "epoch": 0.7824091778202676, "grad_norm": 1.4531314373016357, "learning_rate": 5e-06, "loss": 0.1244, "step": 4092 }, { "epoch": 0.7826003824091778, "grad_norm": 1.2267307043075562, "learning_rate": 5e-06, "loss": 0.0672, "step": 4093 }, { "epoch": 0.7827915869980879, "grad_norm": 2.5020968914031982, "learning_rate": 5e-06, "loss": 0.0903, "step": 4094 }, { "epoch": 0.7829827915869981, "grad_norm": 2.856722354888916, "learning_rate": 5e-06, "loss": 0.6113, "step": 4095 }, { "epoch": 0.7831739961759082, "grad_norm": 3.0301995277404785, "learning_rate": 5e-06, "loss": 0.3728, "step": 4096 }, { "epoch": 0.7833652007648183, "grad_norm": 2.254077911376953, "learning_rate": 5e-06, "loss": 0.3342, "step": 4097 }, { "epoch": 0.7835564053537285, "grad_norm": 1.5700175762176514, "learning_rate": 5e-06, "loss": 0.0846, "step": 4098 }, { "epoch": 0.7837476099426386, "grad_norm": 1.795477032661438, "learning_rate": 5e-06, "loss": 0.1586, "step": 4099 }, { "epoch": 0.7839388145315488, "grad_norm": 3.384065866470337, "learning_rate": 5e-06, "loss": 0.1139, "step": 4100 }, { "epoch": 0.7841300191204589, "grad_norm": 1.4682124853134155, "learning_rate": 5e-06, "loss": 0.185, "step": 4101 }, { "epoch": 0.7843212237093691, "grad_norm": 1.558057188987732, "learning_rate": 5e-06, "loss": 0.1074, "step": 4102 }, { "epoch": 0.7845124282982792, "grad_norm": 1.5690349340438843, "learning_rate": 5e-06, "loss": 0.1275, "step": 4103 }, { "epoch": 0.7847036328871893, "grad_norm": 1.0313045978546143, "learning_rate": 5e-06, "loss": 0.0571, "step": 4104 }, { "epoch": 0.7848948374760994, "grad_norm": 1.4557856321334839, "learning_rate": 5e-06, "loss": 0.0803, "step": 4105 }, { "epoch": 0.7850860420650095, "grad_norm": 1.7461309432983398, "learning_rate": 5e-06, "loss": 0.0582, "step": 4106 }, { "epoch": 0.7852772466539197, "grad_norm": 3.133112668991089, "learning_rate": 5e-06, "loss": 0.4884, "step": 4107 }, { "epoch": 0.7854684512428298, "grad_norm": 2.038489580154419, "learning_rate": 5e-06, "loss": 0.2294, "step": 4108 }, { "epoch": 0.7856596558317399, "grad_norm": 1.714369535446167, "learning_rate": 5e-06, "loss": 0.1776, "step": 4109 }, { "epoch": 0.7858508604206501, "grad_norm": 2.173279047012329, "learning_rate": 5e-06, "loss": 0.1365, "step": 4110 }, { "epoch": 0.7860420650095602, "grad_norm": 1.4787670373916626, "learning_rate": 5e-06, "loss": 0.0929, "step": 4111 }, { "epoch": 0.7862332695984704, "grad_norm": 2.0035345554351807, "learning_rate": 5e-06, "loss": 0.1208, "step": 4112 }, { "epoch": 0.7864244741873805, "grad_norm": 2.027458906173706, "learning_rate": 5e-06, "loss": 0.3061, "step": 4113 }, { "epoch": 0.7866156787762907, "grad_norm": 3.2767152786254883, "learning_rate": 5e-06, "loss": 0.372, "step": 4114 }, { "epoch": 0.7868068833652008, "grad_norm": 2.2546582221984863, "learning_rate": 5e-06, "loss": 0.2883, "step": 4115 }, { "epoch": 0.7869980879541109, "grad_norm": 1.9276899099349976, "learning_rate": 5e-06, "loss": 0.1248, "step": 4116 }, { "epoch": 0.7871892925430211, "grad_norm": 1.4487714767456055, "learning_rate": 5e-06, "loss": 0.1152, "step": 4117 }, { "epoch": 0.7873804971319311, "grad_norm": 0.9173452854156494, "learning_rate": 5e-06, "loss": 0.0181, "step": 4118 }, { "epoch": 0.7875717017208413, "grad_norm": 2.420795440673828, "learning_rate": 5e-06, "loss": 0.1725, "step": 4119 }, { "epoch": 0.7877629063097514, "grad_norm": 1.8025798797607422, "learning_rate": 5e-06, "loss": 0.2264, "step": 4120 }, { "epoch": 0.7879541108986615, "grad_norm": 2.618238925933838, "learning_rate": 5e-06, "loss": 0.4072, "step": 4121 }, { "epoch": 0.7881453154875717, "grad_norm": 2.343017101287842, "learning_rate": 5e-06, "loss": 0.195, "step": 4122 }, { "epoch": 0.7883365200764818, "grad_norm": 1.8181397914886475, "learning_rate": 5e-06, "loss": 0.0863, "step": 4123 }, { "epoch": 0.788527724665392, "grad_norm": 1.8634696006774902, "learning_rate": 5e-06, "loss": 0.1055, "step": 4124 }, { "epoch": 0.7887189292543021, "grad_norm": 2.9761476516723633, "learning_rate": 5e-06, "loss": 0.1362, "step": 4125 }, { "epoch": 0.7889101338432123, "grad_norm": 2.6176702976226807, "learning_rate": 5e-06, "loss": 0.3955, "step": 4126 }, { "epoch": 0.7891013384321224, "grad_norm": 2.2697925567626953, "learning_rate": 5e-06, "loss": 0.3743, "step": 4127 }, { "epoch": 0.7892925430210325, "grad_norm": 2.294552803039551, "learning_rate": 5e-06, "loss": 0.2497, "step": 4128 }, { "epoch": 0.7894837476099427, "grad_norm": 3.0859687328338623, "learning_rate": 5e-06, "loss": 0.3342, "step": 4129 }, { "epoch": 0.7896749521988528, "grad_norm": 3.8342316150665283, "learning_rate": 5e-06, "loss": 0.2451, "step": 4130 }, { "epoch": 0.789866156787763, "grad_norm": 1.9949150085449219, "learning_rate": 5e-06, "loss": 0.1862, "step": 4131 }, { "epoch": 0.790057361376673, "grad_norm": 1.5915290117263794, "learning_rate": 5e-06, "loss": 0.1456, "step": 4132 }, { "epoch": 0.7902485659655831, "grad_norm": 2.4300689697265625, "learning_rate": 5e-06, "loss": 0.3243, "step": 4133 }, { "epoch": 0.7904397705544933, "grad_norm": 2.693277597427368, "learning_rate": 5e-06, "loss": 0.2611, "step": 4134 }, { "epoch": 0.7906309751434034, "grad_norm": 1.841639518737793, "learning_rate": 5e-06, "loss": 0.1271, "step": 4135 }, { "epoch": 0.7908221797323136, "grad_norm": 3.0559420585632324, "learning_rate": 5e-06, "loss": 0.25, "step": 4136 }, { "epoch": 0.7910133843212237, "grad_norm": 2.3945229053497314, "learning_rate": 5e-06, "loss": 0.1167, "step": 4137 }, { "epoch": 0.7912045889101338, "grad_norm": 1.6890507936477661, "learning_rate": 5e-06, "loss": 0.1109, "step": 4138 }, { "epoch": 0.791395793499044, "grad_norm": 4.068086624145508, "learning_rate": 5e-06, "loss": 0.4148, "step": 4139 }, { "epoch": 0.7915869980879541, "grad_norm": 1.1503350734710693, "learning_rate": 5e-06, "loss": 0.1057, "step": 4140 }, { "epoch": 0.7917782026768643, "grad_norm": 1.6826403141021729, "learning_rate": 5e-06, "loss": 0.2092, "step": 4141 }, { "epoch": 0.7919694072657744, "grad_norm": 2.374329090118408, "learning_rate": 5e-06, "loss": 0.1302, "step": 4142 }, { "epoch": 0.7921606118546846, "grad_norm": 1.413367509841919, "learning_rate": 5e-06, "loss": 0.0968, "step": 4143 }, { "epoch": 0.7923518164435946, "grad_norm": 0.6716412305831909, "learning_rate": 5e-06, "loss": 0.0294, "step": 4144 }, { "epoch": 0.7925430210325047, "grad_norm": 1.7345741987228394, "learning_rate": 5e-06, "loss": 0.1587, "step": 4145 }, { "epoch": 0.7927342256214149, "grad_norm": 2.5227432250976562, "learning_rate": 5e-06, "loss": 0.2367, "step": 4146 }, { "epoch": 0.792925430210325, "grad_norm": 1.6562448740005493, "learning_rate": 5e-06, "loss": 0.1356, "step": 4147 }, { "epoch": 0.7931166347992352, "grad_norm": 1.9978289604187012, "learning_rate": 5e-06, "loss": 0.0856, "step": 4148 }, { "epoch": 0.7933078393881453, "grad_norm": 1.263630747795105, "learning_rate": 5e-06, "loss": 0.0879, "step": 4149 }, { "epoch": 0.7934990439770554, "grad_norm": 1.8136980533599854, "learning_rate": 5e-06, "loss": 0.1034, "step": 4150 }, { "epoch": 0.7936902485659656, "grad_norm": 2.311706781387329, "learning_rate": 5e-06, "loss": 0.1771, "step": 4151 }, { "epoch": 0.7938814531548757, "grad_norm": 2.5121023654937744, "learning_rate": 5e-06, "loss": 0.2529, "step": 4152 }, { "epoch": 0.7940726577437859, "grad_norm": 2.3066329956054688, "learning_rate": 5e-06, "loss": 0.3744, "step": 4153 }, { "epoch": 0.794263862332696, "grad_norm": 1.4853479862213135, "learning_rate": 5e-06, "loss": 0.1752, "step": 4154 }, { "epoch": 0.7944550669216062, "grad_norm": 1.425007939338684, "learning_rate": 5e-06, "loss": 0.0867, "step": 4155 }, { "epoch": 0.7946462715105163, "grad_norm": 2.0261852741241455, "learning_rate": 5e-06, "loss": 0.1272, "step": 4156 }, { "epoch": 0.7948374760994263, "grad_norm": 2.156608819961548, "learning_rate": 5e-06, "loss": 0.3367, "step": 4157 }, { "epoch": 0.7950286806883365, "grad_norm": 1.3627783060073853, "learning_rate": 5e-06, "loss": 0.1305, "step": 4158 }, { "epoch": 0.7952198852772466, "grad_norm": 2.0210812091827393, "learning_rate": 5e-06, "loss": 0.2219, "step": 4159 }, { "epoch": 0.7954110898661568, "grad_norm": 1.251874327659607, "learning_rate": 5e-06, "loss": 0.0726, "step": 4160 }, { "epoch": 0.7956022944550669, "grad_norm": 1.1710023880004883, "learning_rate": 5e-06, "loss": 0.0736, "step": 4161 }, { "epoch": 0.795793499043977, "grad_norm": 1.9900565147399902, "learning_rate": 5e-06, "loss": 0.2058, "step": 4162 }, { "epoch": 0.7959847036328872, "grad_norm": 1.1456722021102905, "learning_rate": 5e-06, "loss": 0.0785, "step": 4163 }, { "epoch": 0.7961759082217973, "grad_norm": 2.3891546726226807, "learning_rate": 5e-06, "loss": 0.3063, "step": 4164 }, { "epoch": 0.7963671128107075, "grad_norm": 1.7114723920822144, "learning_rate": 5e-06, "loss": 0.1208, "step": 4165 }, { "epoch": 0.7965583173996176, "grad_norm": 3.918834686279297, "learning_rate": 5e-06, "loss": 0.4596, "step": 4166 }, { "epoch": 0.7967495219885278, "grad_norm": 1.0303666591644287, "learning_rate": 5e-06, "loss": 0.0694, "step": 4167 }, { "epoch": 0.7969407265774379, "grad_norm": 1.4666823148727417, "learning_rate": 5e-06, "loss": 0.0555, "step": 4168 }, { "epoch": 0.797131931166348, "grad_norm": 1.776019811630249, "learning_rate": 5e-06, "loss": 0.1054, "step": 4169 }, { "epoch": 0.7973231357552581, "grad_norm": 3.0186281204223633, "learning_rate": 5e-06, "loss": 0.4206, "step": 4170 }, { "epoch": 0.7975143403441682, "grad_norm": 1.698807716369629, "learning_rate": 5e-06, "loss": 0.1383, "step": 4171 }, { "epoch": 0.7977055449330784, "grad_norm": 3.590824604034424, "learning_rate": 5e-06, "loss": 0.5016, "step": 4172 }, { "epoch": 0.7978967495219885, "grad_norm": 1.6603933572769165, "learning_rate": 5e-06, "loss": 0.1306, "step": 4173 }, { "epoch": 0.7980879541108986, "grad_norm": 2.0738401412963867, "learning_rate": 5e-06, "loss": 0.1074, "step": 4174 }, { "epoch": 0.7982791586998088, "grad_norm": 1.8535012006759644, "learning_rate": 5e-06, "loss": 0.1028, "step": 4175 }, { "epoch": 0.7984703632887189, "grad_norm": 2.4131641387939453, "learning_rate": 5e-06, "loss": 0.3586, "step": 4176 }, { "epoch": 0.7986615678776291, "grad_norm": 3.5233068466186523, "learning_rate": 5e-06, "loss": 0.6267, "step": 4177 }, { "epoch": 0.7988527724665392, "grad_norm": 1.7418206930160522, "learning_rate": 5e-06, "loss": 0.1779, "step": 4178 }, { "epoch": 0.7990439770554494, "grad_norm": 1.813206672668457, "learning_rate": 5e-06, "loss": 0.1225, "step": 4179 }, { "epoch": 0.7992351816443595, "grad_norm": 1.5349483489990234, "learning_rate": 5e-06, "loss": 0.1117, "step": 4180 }, { "epoch": 0.7994263862332696, "grad_norm": 1.7856202125549316, "learning_rate": 5e-06, "loss": 0.1138, "step": 4181 }, { "epoch": 0.7996175908221798, "grad_norm": 1.9936387538909912, "learning_rate": 5e-06, "loss": 0.2355, "step": 4182 }, { "epoch": 0.7998087954110898, "grad_norm": 1.6209287643432617, "learning_rate": 5e-06, "loss": 0.1824, "step": 4183 }, { "epoch": 0.8, "grad_norm": 2.2457656860351562, "learning_rate": 5e-06, "loss": 0.3357, "step": 4184 }, { "epoch": 0.8001912045889101, "grad_norm": 2.640838861465454, "learning_rate": 5e-06, "loss": 0.3455, "step": 4185 }, { "epoch": 0.8003824091778202, "grad_norm": 2.3119945526123047, "learning_rate": 5e-06, "loss": 0.2474, "step": 4186 }, { "epoch": 0.8005736137667304, "grad_norm": 3.9085891246795654, "learning_rate": 5e-06, "loss": 0.3996, "step": 4187 }, { "epoch": 0.8007648183556405, "grad_norm": 2.8617188930511475, "learning_rate": 5e-06, "loss": 0.4351, "step": 4188 }, { "epoch": 0.8009560229445507, "grad_norm": 0.908932089805603, "learning_rate": 5e-06, "loss": 0.0602, "step": 4189 }, { "epoch": 0.8011472275334608, "grad_norm": 1.0495219230651855, "learning_rate": 5e-06, "loss": 0.0867, "step": 4190 }, { "epoch": 0.801338432122371, "grad_norm": 2.866431951522827, "learning_rate": 5e-06, "loss": 0.4275, "step": 4191 }, { "epoch": 0.8015296367112811, "grad_norm": 1.4292815923690796, "learning_rate": 5e-06, "loss": 0.1008, "step": 4192 }, { "epoch": 0.8017208413001912, "grad_norm": 1.7318116426467896, "learning_rate": 5e-06, "loss": 0.1159, "step": 4193 }, { "epoch": 0.8019120458891014, "grad_norm": 1.6380189657211304, "learning_rate": 5e-06, "loss": 0.1392, "step": 4194 }, { "epoch": 0.8021032504780115, "grad_norm": 2.122971534729004, "learning_rate": 5e-06, "loss": 0.2761, "step": 4195 }, { "epoch": 0.8022944550669217, "grad_norm": 2.2278707027435303, "learning_rate": 5e-06, "loss": 0.1824, "step": 4196 }, { "epoch": 0.8024856596558317, "grad_norm": 1.6206939220428467, "learning_rate": 5e-06, "loss": 0.2697, "step": 4197 }, { "epoch": 0.8026768642447418, "grad_norm": 2.017160654067993, "learning_rate": 5e-06, "loss": 0.1218, "step": 4198 }, { "epoch": 0.802868068833652, "grad_norm": 2.2551474571228027, "learning_rate": 5e-06, "loss": 0.1287, "step": 4199 }, { "epoch": 0.8030592734225621, "grad_norm": 0.9970536231994629, "learning_rate": 5e-06, "loss": 0.0528, "step": 4200 }, { "epoch": 0.8032504780114723, "grad_norm": 1.8154901266098022, "learning_rate": 5e-06, "loss": 0.2272, "step": 4201 }, { "epoch": 0.8034416826003824, "grad_norm": 3.230262041091919, "learning_rate": 5e-06, "loss": 0.5354, "step": 4202 }, { "epoch": 0.8036328871892925, "grad_norm": 2.7765953540802, "learning_rate": 5e-06, "loss": 0.247, "step": 4203 }, { "epoch": 0.8038240917782027, "grad_norm": 1.6847764253616333, "learning_rate": 5e-06, "loss": 0.148, "step": 4204 }, { "epoch": 0.8040152963671128, "grad_norm": 1.6412423849105835, "learning_rate": 5e-06, "loss": 0.2218, "step": 4205 }, { "epoch": 0.804206500956023, "grad_norm": 1.1843851804733276, "learning_rate": 5e-06, "loss": 0.0497, "step": 4206 }, { "epoch": 0.8043977055449331, "grad_norm": 2.4135520458221436, "learning_rate": 5e-06, "loss": 0.3082, "step": 4207 }, { "epoch": 0.8045889101338433, "grad_norm": 2.1240572929382324, "learning_rate": 5e-06, "loss": 0.3888, "step": 4208 }, { "epoch": 0.8047801147227533, "grad_norm": 1.9531352519989014, "learning_rate": 5e-06, "loss": 0.183, "step": 4209 }, { "epoch": 0.8049713193116634, "grad_norm": 1.7559449672698975, "learning_rate": 5e-06, "loss": 0.1177, "step": 4210 }, { "epoch": 0.8051625239005736, "grad_norm": 1.8385118246078491, "learning_rate": 5e-06, "loss": 0.0969, "step": 4211 }, { "epoch": 0.8053537284894837, "grad_norm": 2.0431954860687256, "learning_rate": 5e-06, "loss": 0.1699, "step": 4212 }, { "epoch": 0.8055449330783939, "grad_norm": 1.8413811922073364, "learning_rate": 5e-06, "loss": 0.1222, "step": 4213 }, { "epoch": 0.805736137667304, "grad_norm": 2.166095018386841, "learning_rate": 5e-06, "loss": 0.2953, "step": 4214 }, { "epoch": 0.8059273422562141, "grad_norm": 1.8633686304092407, "learning_rate": 5e-06, "loss": 0.1473, "step": 4215 }, { "epoch": 0.8061185468451243, "grad_norm": 6.3489861488342285, "learning_rate": 5e-06, "loss": 0.417, "step": 4216 }, { "epoch": 0.8063097514340344, "grad_norm": 1.1961588859558105, "learning_rate": 5e-06, "loss": 0.0894, "step": 4217 }, { "epoch": 0.8065009560229446, "grad_norm": 2.456892728805542, "learning_rate": 5e-06, "loss": 0.2194, "step": 4218 }, { "epoch": 0.8066921606118547, "grad_norm": 1.9836076498031616, "learning_rate": 5e-06, "loss": 0.1635, "step": 4219 }, { "epoch": 0.8068833652007649, "grad_norm": 2.320347785949707, "learning_rate": 5e-06, "loss": 0.3102, "step": 4220 }, { "epoch": 0.807074569789675, "grad_norm": 1.4314194917678833, "learning_rate": 5e-06, "loss": 0.1227, "step": 4221 }, { "epoch": 0.807265774378585, "grad_norm": 2.4354302883148193, "learning_rate": 5e-06, "loss": 0.3198, "step": 4222 }, { "epoch": 0.8074569789674952, "grad_norm": 3.135202407836914, "learning_rate": 5e-06, "loss": 0.2219, "step": 4223 }, { "epoch": 0.8076481835564053, "grad_norm": 1.7042864561080933, "learning_rate": 5e-06, "loss": 0.1439, "step": 4224 }, { "epoch": 0.8078393881453155, "grad_norm": 2.076150417327881, "learning_rate": 5e-06, "loss": 0.1322, "step": 4225 }, { "epoch": 0.8080305927342256, "grad_norm": 2.117445707321167, "learning_rate": 5e-06, "loss": 0.223, "step": 4226 }, { "epoch": 0.8082217973231357, "grad_norm": 2.044095993041992, "learning_rate": 5e-06, "loss": 0.2822, "step": 4227 }, { "epoch": 0.8084130019120459, "grad_norm": 2.046536684036255, "learning_rate": 5e-06, "loss": 0.184, "step": 4228 }, { "epoch": 0.808604206500956, "grad_norm": 1.4895386695861816, "learning_rate": 5e-06, "loss": 0.0805, "step": 4229 }, { "epoch": 0.8087954110898662, "grad_norm": 0.9864174723625183, "learning_rate": 5e-06, "loss": 0.0903, "step": 4230 }, { "epoch": 0.8089866156787763, "grad_norm": 1.7758212089538574, "learning_rate": 5e-06, "loss": 0.2118, "step": 4231 }, { "epoch": 0.8091778202676865, "grad_norm": 2.2796030044555664, "learning_rate": 5e-06, "loss": 0.2636, "step": 4232 }, { "epoch": 0.8093690248565966, "grad_norm": 1.5463277101516724, "learning_rate": 5e-06, "loss": 0.1616, "step": 4233 }, { "epoch": 0.8095602294455067, "grad_norm": 1.383233904838562, "learning_rate": 5e-06, "loss": 0.1344, "step": 4234 }, { "epoch": 0.8097514340344169, "grad_norm": 1.3084925413131714, "learning_rate": 5e-06, "loss": 0.139, "step": 4235 }, { "epoch": 0.8099426386233269, "grad_norm": 1.4625215530395508, "learning_rate": 5e-06, "loss": 0.1294, "step": 4236 }, { "epoch": 0.8101338432122371, "grad_norm": 1.1085083484649658, "learning_rate": 5e-06, "loss": 0.0566, "step": 4237 }, { "epoch": 0.8103250478011472, "grad_norm": 2.045025110244751, "learning_rate": 5e-06, "loss": 0.1817, "step": 4238 }, { "epoch": 0.8105162523900573, "grad_norm": 2.7750654220581055, "learning_rate": 5e-06, "loss": 0.3498, "step": 4239 }, { "epoch": 0.8107074569789675, "grad_norm": 1.906060814857483, "learning_rate": 5e-06, "loss": 0.1208, "step": 4240 }, { "epoch": 0.8108986615678776, "grad_norm": 3.07199764251709, "learning_rate": 5e-06, "loss": 0.3699, "step": 4241 }, { "epoch": 0.8110898661567878, "grad_norm": 1.987126350402832, "learning_rate": 5e-06, "loss": 0.2444, "step": 4242 }, { "epoch": 0.8112810707456979, "grad_norm": 1.435240387916565, "learning_rate": 5e-06, "loss": 0.0708, "step": 4243 }, { "epoch": 0.8114722753346081, "grad_norm": 1.3612838983535767, "learning_rate": 5e-06, "loss": 0.0594, "step": 4244 }, { "epoch": 0.8116634799235182, "grad_norm": 2.586632013320923, "learning_rate": 5e-06, "loss": 0.3922, "step": 4245 }, { "epoch": 0.8118546845124283, "grad_norm": 1.353735327720642, "learning_rate": 5e-06, "loss": 0.0882, "step": 4246 }, { "epoch": 0.8120458891013385, "grad_norm": 1.7560728788375854, "learning_rate": 5e-06, "loss": 0.2315, "step": 4247 }, { "epoch": 0.8122370936902485, "grad_norm": 1.8247698545455933, "learning_rate": 5e-06, "loss": 0.1895, "step": 4248 }, { "epoch": 0.8124282982791587, "grad_norm": 1.327373743057251, "learning_rate": 5e-06, "loss": 0.0454, "step": 4249 }, { "epoch": 0.8126195028680688, "grad_norm": 1.8750287294387817, "learning_rate": 5e-06, "loss": 0.1399, "step": 4250 }, { "epoch": 0.8128107074569789, "grad_norm": 2.295496702194214, "learning_rate": 5e-06, "loss": 0.2928, "step": 4251 }, { "epoch": 0.8130019120458891, "grad_norm": 2.422713041305542, "learning_rate": 5e-06, "loss": 0.283, "step": 4252 }, { "epoch": 0.8131931166347992, "grad_norm": 1.8091152906417847, "learning_rate": 5e-06, "loss": 0.1467, "step": 4253 }, { "epoch": 0.8133843212237094, "grad_norm": 1.2801040410995483, "learning_rate": 5e-06, "loss": 0.1223, "step": 4254 }, { "epoch": 0.8135755258126195, "grad_norm": 1.0623581409454346, "learning_rate": 5e-06, "loss": 0.0903, "step": 4255 }, { "epoch": 0.8137667304015297, "grad_norm": 2.4953532218933105, "learning_rate": 5e-06, "loss": 0.1855, "step": 4256 }, { "epoch": 0.8139579349904398, "grad_norm": 2.282299757003784, "learning_rate": 5e-06, "loss": 0.4767, "step": 4257 }, { "epoch": 0.8141491395793499, "grad_norm": 1.9749537706375122, "learning_rate": 5e-06, "loss": 0.1855, "step": 4258 }, { "epoch": 0.8143403441682601, "grad_norm": 1.3995250463485718, "learning_rate": 5e-06, "loss": 0.1177, "step": 4259 }, { "epoch": 0.8145315487571702, "grad_norm": 2.201573133468628, "learning_rate": 5e-06, "loss": 0.1921, "step": 4260 }, { "epoch": 0.8147227533460804, "grad_norm": 1.34676194190979, "learning_rate": 5e-06, "loss": 0.0499, "step": 4261 }, { "epoch": 0.8149139579349904, "grad_norm": 1.8636780977249146, "learning_rate": 5e-06, "loss": 0.2137, "step": 4262 }, { "epoch": 0.8151051625239005, "grad_norm": 1.9577280282974243, "learning_rate": 5e-06, "loss": 0.3015, "step": 4263 }, { "epoch": 0.8152963671128107, "grad_norm": 2.576096296310425, "learning_rate": 5e-06, "loss": 0.2557, "step": 4264 }, { "epoch": 0.8154875717017208, "grad_norm": 1.2772091627120972, "learning_rate": 5e-06, "loss": 0.1053, "step": 4265 }, { "epoch": 0.815678776290631, "grad_norm": 1.6483020782470703, "learning_rate": 5e-06, "loss": 0.1241, "step": 4266 }, { "epoch": 0.8158699808795411, "grad_norm": 2.198559522628784, "learning_rate": 5e-06, "loss": 0.1151, "step": 4267 }, { "epoch": 0.8160611854684512, "grad_norm": 0.5627172589302063, "learning_rate": 5e-06, "loss": 0.021, "step": 4268 }, { "epoch": 0.8162523900573614, "grad_norm": 1.8419286012649536, "learning_rate": 5e-06, "loss": 0.1135, "step": 4269 }, { "epoch": 0.8164435946462715, "grad_norm": 2.3597888946533203, "learning_rate": 5e-06, "loss": 0.5168, "step": 4270 }, { "epoch": 0.8166347992351817, "grad_norm": 1.2093944549560547, "learning_rate": 5e-06, "loss": 0.1298, "step": 4271 }, { "epoch": 0.8168260038240918, "grad_norm": 2.0463786125183105, "learning_rate": 5e-06, "loss": 0.2085, "step": 4272 }, { "epoch": 0.817017208413002, "grad_norm": 2.7457947731018066, "learning_rate": 5e-06, "loss": 0.246, "step": 4273 }, { "epoch": 0.817208413001912, "grad_norm": 1.0748988389968872, "learning_rate": 5e-06, "loss": 0.0544, "step": 4274 }, { "epoch": 0.8173996175908221, "grad_norm": 3.576910972595215, "learning_rate": 5e-06, "loss": 0.3724, "step": 4275 }, { "epoch": 0.8175908221797323, "grad_norm": 2.98368763923645, "learning_rate": 5e-06, "loss": 0.5464, "step": 4276 }, { "epoch": 0.8177820267686424, "grad_norm": 2.850346565246582, "learning_rate": 5e-06, "loss": 0.4558, "step": 4277 }, { "epoch": 0.8179732313575526, "grad_norm": 2.513153553009033, "learning_rate": 5e-06, "loss": 0.2441, "step": 4278 }, { "epoch": 0.8181644359464627, "grad_norm": 0.7250884175300598, "learning_rate": 5e-06, "loss": 0.0649, "step": 4279 }, { "epoch": 0.8183556405353728, "grad_norm": 1.4739652872085571, "learning_rate": 5e-06, "loss": 0.1276, "step": 4280 }, { "epoch": 0.818546845124283, "grad_norm": 2.511843681335449, "learning_rate": 5e-06, "loss": 0.1925, "step": 4281 }, { "epoch": 0.8187380497131931, "grad_norm": 2.0406453609466553, "learning_rate": 5e-06, "loss": 0.3064, "step": 4282 }, { "epoch": 0.8189292543021033, "grad_norm": 2.142224073410034, "learning_rate": 5e-06, "loss": 0.2985, "step": 4283 }, { "epoch": 0.8191204588910134, "grad_norm": 1.9565352201461792, "learning_rate": 5e-06, "loss": 0.2398, "step": 4284 }, { "epoch": 0.8193116634799236, "grad_norm": 1.1692537069320679, "learning_rate": 5e-06, "loss": 0.1201, "step": 4285 }, { "epoch": 0.8195028680688337, "grad_norm": 0.9735251665115356, "learning_rate": 5e-06, "loss": 0.0589, "step": 4286 }, { "epoch": 0.8196940726577437, "grad_norm": 1.0305509567260742, "learning_rate": 5e-06, "loss": 0.0485, "step": 4287 }, { "epoch": 0.8198852772466539, "grad_norm": 2.014246940612793, "learning_rate": 5e-06, "loss": 0.2368, "step": 4288 }, { "epoch": 0.820076481835564, "grad_norm": 3.3924779891967773, "learning_rate": 5e-06, "loss": 0.381, "step": 4289 }, { "epoch": 0.8202676864244742, "grad_norm": 2.973250389099121, "learning_rate": 5e-06, "loss": 0.4036, "step": 4290 }, { "epoch": 0.8204588910133843, "grad_norm": 1.6813417673110962, "learning_rate": 5e-06, "loss": 0.1751, "step": 4291 }, { "epoch": 0.8206500956022944, "grad_norm": 1.9835141897201538, "learning_rate": 5e-06, "loss": 0.1582, "step": 4292 }, { "epoch": 0.8208413001912046, "grad_norm": 2.34795880317688, "learning_rate": 5e-06, "loss": 0.2726, "step": 4293 }, { "epoch": 0.8210325047801147, "grad_norm": 2.353971004486084, "learning_rate": 5e-06, "loss": 0.2274, "step": 4294 }, { "epoch": 0.8212237093690249, "grad_norm": 2.977048635482788, "learning_rate": 5e-06, "loss": 0.4693, "step": 4295 }, { "epoch": 0.821414913957935, "grad_norm": 1.9630651473999023, "learning_rate": 5e-06, "loss": 0.1426, "step": 4296 }, { "epoch": 0.8216061185468452, "grad_norm": 1.8660008907318115, "learning_rate": 5e-06, "loss": 0.1312, "step": 4297 }, { "epoch": 0.8217973231357553, "grad_norm": 2.163969039916992, "learning_rate": 5e-06, "loss": 0.1436, "step": 4298 }, { "epoch": 0.8219885277246654, "grad_norm": 1.4228352308273315, "learning_rate": 5e-06, "loss": 0.1172, "step": 4299 }, { "epoch": 0.8221797323135756, "grad_norm": 2.8689212799072266, "learning_rate": 5e-06, "loss": 0.3061, "step": 4300 }, { "epoch": 0.8223709369024856, "grad_norm": 2.3054051399230957, "learning_rate": 5e-06, "loss": 0.3133, "step": 4301 }, { "epoch": 0.8225621414913958, "grad_norm": 2.6142942905426025, "learning_rate": 5e-06, "loss": 0.5324, "step": 4302 }, { "epoch": 0.8227533460803059, "grad_norm": 1.669835090637207, "learning_rate": 5e-06, "loss": 0.1098, "step": 4303 }, { "epoch": 0.822944550669216, "grad_norm": 1.9401994943618774, "learning_rate": 5e-06, "loss": 0.2461, "step": 4304 }, { "epoch": 0.8231357552581262, "grad_norm": 3.1794354915618896, "learning_rate": 5e-06, "loss": 0.3799, "step": 4305 }, { "epoch": 0.8233269598470363, "grad_norm": 2.4734315872192383, "learning_rate": 5e-06, "loss": 0.0708, "step": 4306 }, { "epoch": 0.8235181644359465, "grad_norm": 2.165508508682251, "learning_rate": 5e-06, "loss": 0.1704, "step": 4307 }, { "epoch": 0.8237093690248566, "grad_norm": 3.454468250274658, "learning_rate": 5e-06, "loss": 0.4224, "step": 4308 }, { "epoch": 0.8239005736137668, "grad_norm": 1.4881060123443604, "learning_rate": 5e-06, "loss": 0.1237, "step": 4309 }, { "epoch": 0.8240917782026769, "grad_norm": 2.7141613960266113, "learning_rate": 5e-06, "loss": 0.4328, "step": 4310 }, { "epoch": 0.824282982791587, "grad_norm": 1.3264292478561401, "learning_rate": 5e-06, "loss": 0.0853, "step": 4311 }, { "epoch": 0.8244741873804972, "grad_norm": 1.4157055616378784, "learning_rate": 5e-06, "loss": 0.0634, "step": 4312 }, { "epoch": 0.8246653919694072, "grad_norm": 2.11893630027771, "learning_rate": 5e-06, "loss": 0.2645, "step": 4313 }, { "epoch": 0.8248565965583174, "grad_norm": 2.659165620803833, "learning_rate": 5e-06, "loss": 0.4391, "step": 4314 }, { "epoch": 0.8250478011472275, "grad_norm": 3.263123035430908, "learning_rate": 5e-06, "loss": 0.1809, "step": 4315 }, { "epoch": 0.8252390057361376, "grad_norm": 1.4275321960449219, "learning_rate": 5e-06, "loss": 0.1575, "step": 4316 }, { "epoch": 0.8254302103250478, "grad_norm": 1.7727454900741577, "learning_rate": 5e-06, "loss": 0.0899, "step": 4317 }, { "epoch": 0.8256214149139579, "grad_norm": 2.1350901126861572, "learning_rate": 5e-06, "loss": 0.1986, "step": 4318 }, { "epoch": 0.8258126195028681, "grad_norm": 3.224039316177368, "learning_rate": 5e-06, "loss": 0.2131, "step": 4319 }, { "epoch": 0.8260038240917782, "grad_norm": 2.624913215637207, "learning_rate": 5e-06, "loss": 0.4338, "step": 4320 }, { "epoch": 0.8261950286806883, "grad_norm": 1.3268989324569702, "learning_rate": 5e-06, "loss": 0.0897, "step": 4321 }, { "epoch": 0.8263862332695985, "grad_norm": 1.9256988763809204, "learning_rate": 5e-06, "loss": 0.177, "step": 4322 }, { "epoch": 0.8265774378585086, "grad_norm": 3.2375950813293457, "learning_rate": 5e-06, "loss": 0.2375, "step": 4323 }, { "epoch": 0.8267686424474188, "grad_norm": 3.528641700744629, "learning_rate": 5e-06, "loss": 0.3476, "step": 4324 }, { "epoch": 0.8269598470363289, "grad_norm": 5.507506847381592, "learning_rate": 5e-06, "loss": 0.3687, "step": 4325 }, { "epoch": 0.827151051625239, "grad_norm": 2.442595958709717, "learning_rate": 5e-06, "loss": 0.3488, "step": 4326 }, { "epoch": 0.8273422562141491, "grad_norm": 4.454098224639893, "learning_rate": 5e-06, "loss": 0.1144, "step": 4327 }, { "epoch": 0.8275334608030592, "grad_norm": 2.602275848388672, "learning_rate": 5e-06, "loss": 0.3673, "step": 4328 }, { "epoch": 0.8277246653919694, "grad_norm": 1.1066206693649292, "learning_rate": 5e-06, "loss": 0.0566, "step": 4329 }, { "epoch": 0.8279158699808795, "grad_norm": 2.778461217880249, "learning_rate": 5e-06, "loss": 0.1148, "step": 4330 }, { "epoch": 0.8281070745697897, "grad_norm": 1.5432476997375488, "learning_rate": 5e-06, "loss": 0.1084, "step": 4331 }, { "epoch": 0.8282982791586998, "grad_norm": 1.3616504669189453, "learning_rate": 5e-06, "loss": 0.1148, "step": 4332 }, { "epoch": 0.8284894837476099, "grad_norm": 2.906709671020508, "learning_rate": 5e-06, "loss": 0.2886, "step": 4333 }, { "epoch": 0.8286806883365201, "grad_norm": 3.554781913757324, "learning_rate": 5e-06, "loss": 0.1041, "step": 4334 }, { "epoch": 0.8288718929254302, "grad_norm": 2.459982395172119, "learning_rate": 5e-06, "loss": 0.1131, "step": 4335 }, { "epoch": 0.8290630975143404, "grad_norm": 2.0762665271759033, "learning_rate": 5e-06, "loss": 0.0706, "step": 4336 }, { "epoch": 0.8292543021032505, "grad_norm": 1.5950919389724731, "learning_rate": 5e-06, "loss": 0.0903, "step": 4337 }, { "epoch": 0.8294455066921607, "grad_norm": 2.4426164627075195, "learning_rate": 5e-06, "loss": 0.4334, "step": 4338 }, { "epoch": 0.8296367112810707, "grad_norm": 2.9879558086395264, "learning_rate": 5e-06, "loss": 0.5294, "step": 4339 }, { "epoch": 0.8298279158699808, "grad_norm": 1.552777886390686, "learning_rate": 5e-06, "loss": 0.1626, "step": 4340 }, { "epoch": 0.830019120458891, "grad_norm": 2.607077121734619, "learning_rate": 5e-06, "loss": 0.1706, "step": 4341 }, { "epoch": 0.8302103250478011, "grad_norm": 1.6359832286834717, "learning_rate": 5e-06, "loss": 0.079, "step": 4342 }, { "epoch": 0.8304015296367113, "grad_norm": 1.792439341545105, "learning_rate": 5e-06, "loss": 0.1382, "step": 4343 }, { "epoch": 0.8305927342256214, "grad_norm": 1.6875145435333252, "learning_rate": 5e-06, "loss": 0.0992, "step": 4344 }, { "epoch": 0.8307839388145315, "grad_norm": 1.9896464347839355, "learning_rate": 5e-06, "loss": 0.2902, "step": 4345 }, { "epoch": 0.8309751434034417, "grad_norm": 1.053695797920227, "learning_rate": 5e-06, "loss": 0.1258, "step": 4346 }, { "epoch": 0.8311663479923518, "grad_norm": 2.7914059162139893, "learning_rate": 5e-06, "loss": 0.3776, "step": 4347 }, { "epoch": 0.831357552581262, "grad_norm": 2.2700257301330566, "learning_rate": 5e-06, "loss": 0.2601, "step": 4348 }, { "epoch": 0.8315487571701721, "grad_norm": 2.1097259521484375, "learning_rate": 5e-06, "loss": 0.1924, "step": 4349 }, { "epoch": 0.8317399617590823, "grad_norm": 2.5981993675231934, "learning_rate": 5e-06, "loss": 0.1349, "step": 4350 }, { "epoch": 0.8319311663479924, "grad_norm": 2.368746280670166, "learning_rate": 5e-06, "loss": 0.3906, "step": 4351 }, { "epoch": 0.8321223709369024, "grad_norm": 3.390716075897217, "learning_rate": 5e-06, "loss": 0.3522, "step": 4352 }, { "epoch": 0.8323135755258126, "grad_norm": 2.0879626274108887, "learning_rate": 5e-06, "loss": 0.3756, "step": 4353 }, { "epoch": 0.8325047801147227, "grad_norm": 1.396887183189392, "learning_rate": 5e-06, "loss": 0.1102, "step": 4354 }, { "epoch": 0.8326959847036329, "grad_norm": 1.5137981176376343, "learning_rate": 5e-06, "loss": 0.0845, "step": 4355 }, { "epoch": 0.832887189292543, "grad_norm": 2.094395637512207, "learning_rate": 5e-06, "loss": 0.127, "step": 4356 }, { "epoch": 0.8330783938814531, "grad_norm": 2.304523229598999, "learning_rate": 5e-06, "loss": 0.309, "step": 4357 }, { "epoch": 0.8332695984703633, "grad_norm": 1.592523455619812, "learning_rate": 5e-06, "loss": 0.2083, "step": 4358 }, { "epoch": 0.8334608030592734, "grad_norm": 1.6386781930923462, "learning_rate": 5e-06, "loss": 0.1019, "step": 4359 }, { "epoch": 0.8336520076481836, "grad_norm": 1.4175693988800049, "learning_rate": 5e-06, "loss": 0.0789, "step": 4360 }, { "epoch": 0.8338432122370937, "grad_norm": 1.1679911613464355, "learning_rate": 5e-06, "loss": 0.068, "step": 4361 }, { "epoch": 0.8340344168260039, "grad_norm": 2.1180410385131836, "learning_rate": 5e-06, "loss": 0.1132, "step": 4362 }, { "epoch": 0.834225621414914, "grad_norm": 1.151820182800293, "learning_rate": 5e-06, "loss": 0.094, "step": 4363 }, { "epoch": 0.834416826003824, "grad_norm": 2.254429817199707, "learning_rate": 5e-06, "loss": 0.3339, "step": 4364 }, { "epoch": 0.8346080305927343, "grad_norm": 1.0687077045440674, "learning_rate": 5e-06, "loss": 0.1122, "step": 4365 }, { "epoch": 0.8347992351816443, "grad_norm": 3.133803129196167, "learning_rate": 5e-06, "loss": 0.32, "step": 4366 }, { "epoch": 0.8349904397705545, "grad_norm": 2.1867737770080566, "learning_rate": 5e-06, "loss": 0.2895, "step": 4367 }, { "epoch": 0.8351816443594646, "grad_norm": 1.6033610105514526, "learning_rate": 5e-06, "loss": 0.2681, "step": 4368 }, { "epoch": 0.8353728489483747, "grad_norm": 1.4587613344192505, "learning_rate": 5e-06, "loss": 0.1197, "step": 4369 }, { "epoch": 0.8355640535372849, "grad_norm": 2.3070292472839355, "learning_rate": 5e-06, "loss": 0.5036, "step": 4370 }, { "epoch": 0.835755258126195, "grad_norm": 1.722882628440857, "learning_rate": 5e-06, "loss": 0.2494, "step": 4371 }, { "epoch": 0.8359464627151052, "grad_norm": 1.6158150434494019, "learning_rate": 5e-06, "loss": 0.1648, "step": 4372 }, { "epoch": 0.8361376673040153, "grad_norm": 1.1540052890777588, "learning_rate": 5e-06, "loss": 0.0811, "step": 4373 }, { "epoch": 0.8363288718929255, "grad_norm": 0.9952983260154724, "learning_rate": 5e-06, "loss": 0.0439, "step": 4374 }, { "epoch": 0.8365200764818356, "grad_norm": 1.722088098526001, "learning_rate": 5e-06, "loss": 0.0845, "step": 4375 }, { "epoch": 0.8367112810707457, "grad_norm": 1.8059070110321045, "learning_rate": 5e-06, "loss": 0.3103, "step": 4376 }, { "epoch": 0.8369024856596559, "grad_norm": 1.7783750295639038, "learning_rate": 5e-06, "loss": 0.2965, "step": 4377 }, { "epoch": 0.837093690248566, "grad_norm": 1.6145813465118408, "learning_rate": 5e-06, "loss": 0.1776, "step": 4378 }, { "epoch": 0.8372848948374761, "grad_norm": 3.1882593631744385, "learning_rate": 5e-06, "loss": 0.3732, "step": 4379 }, { "epoch": 0.8374760994263862, "grad_norm": 1.8107242584228516, "learning_rate": 5e-06, "loss": 0.2011, "step": 4380 }, { "epoch": 0.8376673040152963, "grad_norm": 2.4885756969451904, "learning_rate": 5e-06, "loss": 0.1237, "step": 4381 }, { "epoch": 0.8378585086042065, "grad_norm": 3.0721991062164307, "learning_rate": 5e-06, "loss": 0.2544, "step": 4382 }, { "epoch": 0.8380497131931166, "grad_norm": 2.654618740081787, "learning_rate": 5e-06, "loss": 0.3133, "step": 4383 }, { "epoch": 0.8382409177820268, "grad_norm": 1.3207848072052002, "learning_rate": 5e-06, "loss": 0.2198, "step": 4384 }, { "epoch": 0.8384321223709369, "grad_norm": 1.5825202465057373, "learning_rate": 5e-06, "loss": 0.1373, "step": 4385 }, { "epoch": 0.838623326959847, "grad_norm": 3.0859808921813965, "learning_rate": 5e-06, "loss": 0.2396, "step": 4386 }, { "epoch": 0.8388145315487572, "grad_norm": 1.6806787252426147, "learning_rate": 5e-06, "loss": 0.0929, "step": 4387 }, { "epoch": 0.8390057361376673, "grad_norm": 2.5619802474975586, "learning_rate": 5e-06, "loss": 0.164, "step": 4388 }, { "epoch": 0.8391969407265775, "grad_norm": 2.1878180503845215, "learning_rate": 5e-06, "loss": 0.2702, "step": 4389 }, { "epoch": 0.8393881453154876, "grad_norm": 1.7535722255706787, "learning_rate": 5e-06, "loss": 0.1757, "step": 4390 }, { "epoch": 0.8395793499043978, "grad_norm": 2.350954055786133, "learning_rate": 5e-06, "loss": 0.1784, "step": 4391 }, { "epoch": 0.8397705544933078, "grad_norm": 1.941375494003296, "learning_rate": 5e-06, "loss": 0.0602, "step": 4392 }, { "epoch": 0.8399617590822179, "grad_norm": 0.9962285161018372, "learning_rate": 5e-06, "loss": 0.0257, "step": 4393 }, { "epoch": 0.8401529636711281, "grad_norm": 1.4986315965652466, "learning_rate": 5e-06, "loss": 0.0806, "step": 4394 }, { "epoch": 0.8403441682600382, "grad_norm": 2.466259717941284, "learning_rate": 5e-06, "loss": 0.1361, "step": 4395 }, { "epoch": 0.8405353728489484, "grad_norm": 1.511971116065979, "learning_rate": 5e-06, "loss": 0.146, "step": 4396 }, { "epoch": 0.8407265774378585, "grad_norm": 3.6628222465515137, "learning_rate": 5e-06, "loss": 0.4908, "step": 4397 }, { "epoch": 0.8409177820267686, "grad_norm": 1.4136232137680054, "learning_rate": 5e-06, "loss": 0.1436, "step": 4398 }, { "epoch": 0.8411089866156788, "grad_norm": 1.857986569404602, "learning_rate": 5e-06, "loss": 0.1274, "step": 4399 }, { "epoch": 0.8413001912045889, "grad_norm": 1.0941559076309204, "learning_rate": 5e-06, "loss": 0.0448, "step": 4400 }, { "epoch": 0.8414913957934991, "grad_norm": 2.9076249599456787, "learning_rate": 5e-06, "loss": 0.4855, "step": 4401 }, { "epoch": 0.8416826003824092, "grad_norm": 2.237715721130371, "learning_rate": 5e-06, "loss": 0.2907, "step": 4402 }, { "epoch": 0.8418738049713194, "grad_norm": 1.244497537612915, "learning_rate": 5e-06, "loss": 0.0821, "step": 4403 }, { "epoch": 0.8420650095602294, "grad_norm": 1.071437120437622, "learning_rate": 5e-06, "loss": 0.1103, "step": 4404 }, { "epoch": 0.8422562141491395, "grad_norm": 1.5743379592895508, "learning_rate": 5e-06, "loss": 0.0687, "step": 4405 }, { "epoch": 0.8424474187380497, "grad_norm": 2.162822723388672, "learning_rate": 5e-06, "loss": 0.1724, "step": 4406 }, { "epoch": 0.8426386233269598, "grad_norm": 1.7346101999282837, "learning_rate": 5e-06, "loss": 0.2139, "step": 4407 }, { "epoch": 0.84282982791587, "grad_norm": 1.8639531135559082, "learning_rate": 5e-06, "loss": 0.2507, "step": 4408 }, { "epoch": 0.8430210325047801, "grad_norm": 3.444169044494629, "learning_rate": 5e-06, "loss": 0.4141, "step": 4409 }, { "epoch": 0.8432122370936902, "grad_norm": 2.168269395828247, "learning_rate": 5e-06, "loss": 0.2911, "step": 4410 }, { "epoch": 0.8434034416826004, "grad_norm": 1.8232005834579468, "learning_rate": 5e-06, "loss": 0.1244, "step": 4411 }, { "epoch": 0.8435946462715105, "grad_norm": 2.408576488494873, "learning_rate": 5e-06, "loss": 0.121, "step": 4412 }, { "epoch": 0.8437858508604207, "grad_norm": 3.882044792175293, "learning_rate": 5e-06, "loss": 0.3033, "step": 4413 }, { "epoch": 0.8439770554493308, "grad_norm": 1.6000193357467651, "learning_rate": 5e-06, "loss": 0.1496, "step": 4414 }, { "epoch": 0.844168260038241, "grad_norm": 1.694844126701355, "learning_rate": 5e-06, "loss": 0.1392, "step": 4415 }, { "epoch": 0.8443594646271511, "grad_norm": 2.597773551940918, "learning_rate": 5e-06, "loss": 0.4699, "step": 4416 }, { "epoch": 0.8445506692160611, "grad_norm": 1.8982315063476562, "learning_rate": 5e-06, "loss": 0.0863, "step": 4417 }, { "epoch": 0.8447418738049713, "grad_norm": 2.2443645000457764, "learning_rate": 5e-06, "loss": 0.0798, "step": 4418 }, { "epoch": 0.8449330783938814, "grad_norm": 1.6703152656555176, "learning_rate": 5e-06, "loss": 0.117, "step": 4419 }, { "epoch": 0.8451242829827916, "grad_norm": 2.743288993835449, "learning_rate": 5e-06, "loss": 0.3322, "step": 4420 }, { "epoch": 0.8453154875717017, "grad_norm": 1.3997440338134766, "learning_rate": 5e-06, "loss": 0.1167, "step": 4421 }, { "epoch": 0.8455066921606118, "grad_norm": 1.7278668880462646, "learning_rate": 5e-06, "loss": 0.2425, "step": 4422 }, { "epoch": 0.845697896749522, "grad_norm": 2.021827459335327, "learning_rate": 5e-06, "loss": 0.1857, "step": 4423 }, { "epoch": 0.8458891013384321, "grad_norm": 1.941655158996582, "learning_rate": 5e-06, "loss": 0.2009, "step": 4424 }, { "epoch": 0.8460803059273423, "grad_norm": 2.000415563583374, "learning_rate": 5e-06, "loss": 0.1906, "step": 4425 }, { "epoch": 0.8462715105162524, "grad_norm": 2.7244679927825928, "learning_rate": 5e-06, "loss": 0.3982, "step": 4426 }, { "epoch": 0.8464627151051626, "grad_norm": 3.001358985900879, "learning_rate": 5e-06, "loss": 0.5675, "step": 4427 }, { "epoch": 0.8466539196940727, "grad_norm": 1.9397376775741577, "learning_rate": 5e-06, "loss": 0.3761, "step": 4428 }, { "epoch": 0.8468451242829828, "grad_norm": 2.951451539993286, "learning_rate": 5e-06, "loss": 0.3985, "step": 4429 }, { "epoch": 0.847036328871893, "grad_norm": 0.8639160990715027, "learning_rate": 5e-06, "loss": 0.0371, "step": 4430 }, { "epoch": 0.847227533460803, "grad_norm": 1.7857511043548584, "learning_rate": 5e-06, "loss": 0.0694, "step": 4431 }, { "epoch": 0.8474187380497132, "grad_norm": 1.9648922681808472, "learning_rate": 5e-06, "loss": 0.2294, "step": 4432 }, { "epoch": 0.8476099426386233, "grad_norm": 1.5596647262573242, "learning_rate": 5e-06, "loss": 0.2072, "step": 4433 }, { "epoch": 0.8478011472275334, "grad_norm": 1.354530692100525, "learning_rate": 5e-06, "loss": 0.0821, "step": 4434 }, { "epoch": 0.8479923518164436, "grad_norm": 2.014345407485962, "learning_rate": 5e-06, "loss": 0.1577, "step": 4435 }, { "epoch": 0.8481835564053537, "grad_norm": 0.9619423747062683, "learning_rate": 5e-06, "loss": 0.0551, "step": 4436 }, { "epoch": 0.8483747609942639, "grad_norm": 2.9514081478118896, "learning_rate": 5e-06, "loss": 0.2785, "step": 4437 }, { "epoch": 0.848565965583174, "grad_norm": 3.460991859436035, "learning_rate": 5e-06, "loss": 0.5416, "step": 4438 }, { "epoch": 0.8487571701720842, "grad_norm": 1.313177227973938, "learning_rate": 5e-06, "loss": 0.1352, "step": 4439 }, { "epoch": 0.8489483747609943, "grad_norm": 2.4580676555633545, "learning_rate": 5e-06, "loss": 0.3445, "step": 4440 }, { "epoch": 0.8491395793499044, "grad_norm": 1.3704016208648682, "learning_rate": 5e-06, "loss": 0.0929, "step": 4441 }, { "epoch": 0.8493307839388146, "grad_norm": 1.3280479907989502, "learning_rate": 5e-06, "loss": 0.0958, "step": 4442 }, { "epoch": 0.8495219885277246, "grad_norm": 2.1530704498291016, "learning_rate": 5e-06, "loss": 0.1695, "step": 4443 }, { "epoch": 0.8497131931166348, "grad_norm": 1.4062306880950928, "learning_rate": 5e-06, "loss": 0.0833, "step": 4444 }, { "epoch": 0.8499043977055449, "grad_norm": 1.9014475345611572, "learning_rate": 5e-06, "loss": 0.2039, "step": 4445 }, { "epoch": 0.850095602294455, "grad_norm": 1.7016692161560059, "learning_rate": 5e-06, "loss": 0.0979, "step": 4446 }, { "epoch": 0.8502868068833652, "grad_norm": 1.8992486000061035, "learning_rate": 5e-06, "loss": 0.2124, "step": 4447 }, { "epoch": 0.8504780114722753, "grad_norm": 2.011763095855713, "learning_rate": 5e-06, "loss": 0.1776, "step": 4448 }, { "epoch": 0.8506692160611855, "grad_norm": 0.7455622553825378, "learning_rate": 5e-06, "loss": 0.0826, "step": 4449 }, { "epoch": 0.8508604206500956, "grad_norm": 1.7753809690475464, "learning_rate": 5e-06, "loss": 0.1413, "step": 4450 }, { "epoch": 0.8510516252390057, "grad_norm": 1.924593448638916, "learning_rate": 5e-06, "loss": 0.3003, "step": 4451 }, { "epoch": 0.8512428298279159, "grad_norm": 1.3443734645843506, "learning_rate": 5e-06, "loss": 0.1168, "step": 4452 }, { "epoch": 0.851434034416826, "grad_norm": 1.712401032447815, "learning_rate": 5e-06, "loss": 0.1064, "step": 4453 }, { "epoch": 0.8516252390057362, "grad_norm": 2.6731860637664795, "learning_rate": 5e-06, "loss": 0.1451, "step": 4454 }, { "epoch": 0.8518164435946463, "grad_norm": 1.2535629272460938, "learning_rate": 5e-06, "loss": 0.1456, "step": 4455 }, { "epoch": 0.8520076481835565, "grad_norm": 1.3341909646987915, "learning_rate": 5e-06, "loss": 0.0821, "step": 4456 }, { "epoch": 0.8521988527724665, "grad_norm": 2.0949151515960693, "learning_rate": 5e-06, "loss": 0.3415, "step": 4457 }, { "epoch": 0.8523900573613766, "grad_norm": 2.8425564765930176, "learning_rate": 5e-06, "loss": 0.6229, "step": 4458 }, { "epoch": 0.8525812619502868, "grad_norm": 2.830179214477539, "learning_rate": 5e-06, "loss": 0.4966, "step": 4459 }, { "epoch": 0.8527724665391969, "grad_norm": 3.527388334274292, "learning_rate": 5e-06, "loss": 0.5026, "step": 4460 }, { "epoch": 0.8529636711281071, "grad_norm": 1.1000351905822754, "learning_rate": 5e-06, "loss": 0.0993, "step": 4461 }, { "epoch": 0.8531548757170172, "grad_norm": 1.7830493450164795, "learning_rate": 5e-06, "loss": 0.0735, "step": 4462 }, { "epoch": 0.8533460803059273, "grad_norm": 2.3873448371887207, "learning_rate": 5e-06, "loss": 0.271, "step": 4463 }, { "epoch": 0.8535372848948375, "grad_norm": 2.0163981914520264, "learning_rate": 5e-06, "loss": 0.1629, "step": 4464 }, { "epoch": 0.8537284894837476, "grad_norm": 1.701276183128357, "learning_rate": 5e-06, "loss": 0.2418, "step": 4465 }, { "epoch": 0.8539196940726578, "grad_norm": 1.9887462854385376, "learning_rate": 5e-06, "loss": 0.2069, "step": 4466 }, { "epoch": 0.8541108986615679, "grad_norm": 1.4311776161193848, "learning_rate": 5e-06, "loss": 0.0844, "step": 4467 }, { "epoch": 0.8543021032504781, "grad_norm": 1.4655697345733643, "learning_rate": 5e-06, "loss": 0.1087, "step": 4468 }, { "epoch": 0.8544933078393881, "grad_norm": 2.374417781829834, "learning_rate": 5e-06, "loss": 0.1252, "step": 4469 }, { "epoch": 0.8546845124282982, "grad_norm": 1.744074821472168, "learning_rate": 5e-06, "loss": 0.1841, "step": 4470 }, { "epoch": 0.8548757170172084, "grad_norm": 1.7160265445709229, "learning_rate": 5e-06, "loss": 0.1687, "step": 4471 }, { "epoch": 0.8550669216061185, "grad_norm": 2.1769425868988037, "learning_rate": 5e-06, "loss": 0.2596, "step": 4472 }, { "epoch": 0.8552581261950287, "grad_norm": 2.5814943313598633, "learning_rate": 5e-06, "loss": 0.2571, "step": 4473 }, { "epoch": 0.8554493307839388, "grad_norm": 1.3457120656967163, "learning_rate": 5e-06, "loss": 0.1145, "step": 4474 }, { "epoch": 0.8556405353728489, "grad_norm": 1.8106434345245361, "learning_rate": 5e-06, "loss": 0.1167, "step": 4475 }, { "epoch": 0.8558317399617591, "grad_norm": 1.8618403673171997, "learning_rate": 5e-06, "loss": 0.1876, "step": 4476 }, { "epoch": 0.8560229445506692, "grad_norm": 1.3731857538223267, "learning_rate": 5e-06, "loss": 0.1171, "step": 4477 }, { "epoch": 0.8562141491395794, "grad_norm": 3.5358798503875732, "learning_rate": 5e-06, "loss": 0.1766, "step": 4478 }, { "epoch": 0.8564053537284895, "grad_norm": 2.0287272930145264, "learning_rate": 5e-06, "loss": 0.1085, "step": 4479 }, { "epoch": 0.8565965583173997, "grad_norm": 1.2044274806976318, "learning_rate": 5e-06, "loss": 0.0622, "step": 4480 }, { "epoch": 0.8567877629063098, "grad_norm": 1.2688884735107422, "learning_rate": 5e-06, "loss": 0.0831, "step": 4481 }, { "epoch": 0.8569789674952198, "grad_norm": 3.058166980743408, "learning_rate": 5e-06, "loss": 0.5819, "step": 4482 }, { "epoch": 0.85717017208413, "grad_norm": 2.472184896469116, "learning_rate": 5e-06, "loss": 0.3191, "step": 4483 }, { "epoch": 0.8573613766730401, "grad_norm": 2.22625732421875, "learning_rate": 5e-06, "loss": 0.2619, "step": 4484 }, { "epoch": 0.8575525812619503, "grad_norm": 1.0253074169158936, "learning_rate": 5e-06, "loss": 0.0643, "step": 4485 }, { "epoch": 0.8577437858508604, "grad_norm": 1.16357421875, "learning_rate": 5e-06, "loss": 0.0652, "step": 4486 }, { "epoch": 0.8579349904397705, "grad_norm": 1.9855798482894897, "learning_rate": 5e-06, "loss": 0.136, "step": 4487 }, { "epoch": 0.8581261950286807, "grad_norm": 2.942619800567627, "learning_rate": 5e-06, "loss": 0.1555, "step": 4488 }, { "epoch": 0.8583173996175908, "grad_norm": 1.8146679401397705, "learning_rate": 5e-06, "loss": 0.136, "step": 4489 }, { "epoch": 0.858508604206501, "grad_norm": 1.2012311220169067, "learning_rate": 5e-06, "loss": 0.0923, "step": 4490 }, { "epoch": 0.8586998087954111, "grad_norm": 2.180446147918701, "learning_rate": 5e-06, "loss": 0.4023, "step": 4491 }, { "epoch": 0.8588910133843213, "grad_norm": 2.272432327270508, "learning_rate": 5e-06, "loss": 0.2363, "step": 4492 }, { "epoch": 0.8590822179732314, "grad_norm": 1.4137390851974487, "learning_rate": 5e-06, "loss": 0.1623, "step": 4493 }, { "epoch": 0.8592734225621415, "grad_norm": 3.289259433746338, "learning_rate": 5e-06, "loss": 0.3883, "step": 4494 }, { "epoch": 0.8594646271510517, "grad_norm": 1.947654128074646, "learning_rate": 5e-06, "loss": 0.2005, "step": 4495 }, { "epoch": 0.8596558317399617, "grad_norm": 1.1897157430648804, "learning_rate": 5e-06, "loss": 0.1192, "step": 4496 }, { "epoch": 0.8598470363288719, "grad_norm": 1.1473478078842163, "learning_rate": 5e-06, "loss": 0.1185, "step": 4497 }, { "epoch": 0.860038240917782, "grad_norm": 1.4698823690414429, "learning_rate": 5e-06, "loss": 0.0852, "step": 4498 }, { "epoch": 0.8602294455066921, "grad_norm": 3.318509578704834, "learning_rate": 5e-06, "loss": 0.3434, "step": 4499 }, { "epoch": 0.8604206500956023, "grad_norm": 1.8506419658660889, "learning_rate": 5e-06, "loss": 0.1088, "step": 4500 }, { "epoch": 0.8604206500956023, "eval_runtime": 769.0971, "eval_samples_per_second": 1.995, "eval_steps_per_second": 0.25, "step": 4500 }, { "epoch": 0.8606118546845124, "grad_norm": 1.9987014532089233, "learning_rate": 5e-06, "loss": 0.276, "step": 4501 }, { "epoch": 0.8608030592734226, "grad_norm": 14.492589950561523, "learning_rate": 5e-06, "loss": 0.4812, "step": 4502 }, { "epoch": 0.8609942638623327, "grad_norm": 2.069856882095337, "learning_rate": 5e-06, "loss": 0.2254, "step": 4503 }, { "epoch": 0.8611854684512428, "grad_norm": 2.994016647338867, "learning_rate": 5e-06, "loss": 0.291, "step": 4504 }, { "epoch": 0.861376673040153, "grad_norm": 1.7594068050384521, "learning_rate": 5e-06, "loss": 0.0717, "step": 4505 }, { "epoch": 0.8615678776290631, "grad_norm": 2.7324862480163574, "learning_rate": 5e-06, "loss": 0.0629, "step": 4506 }, { "epoch": 0.8617590822179733, "grad_norm": 3.193103075027466, "learning_rate": 5e-06, "loss": 0.6214, "step": 4507 }, { "epoch": 0.8619502868068833, "grad_norm": 2.8593437671661377, "learning_rate": 5e-06, "loss": 0.3593, "step": 4508 }, { "epoch": 0.8621414913957935, "grad_norm": 0.9549627304077148, "learning_rate": 5e-06, "loss": 0.0506, "step": 4509 }, { "epoch": 0.8623326959847036, "grad_norm": 0.934559166431427, "learning_rate": 5e-06, "loss": 0.0455, "step": 4510 }, { "epoch": 0.8625239005736137, "grad_norm": 1.489575743675232, "learning_rate": 5e-06, "loss": 0.1217, "step": 4511 }, { "epoch": 0.8627151051625239, "grad_norm": 2.525219202041626, "learning_rate": 5e-06, "loss": 0.1192, "step": 4512 }, { "epoch": 0.862906309751434, "grad_norm": 2.3394064903259277, "learning_rate": 5e-06, "loss": 0.2619, "step": 4513 }, { "epoch": 0.8630975143403442, "grad_norm": 3.068735361099243, "learning_rate": 5e-06, "loss": 0.497, "step": 4514 }, { "epoch": 0.8632887189292543, "grad_norm": 1.4633656740188599, "learning_rate": 5e-06, "loss": 0.1086, "step": 4515 }, { "epoch": 0.8634799235181644, "grad_norm": 1.276939868927002, "learning_rate": 5e-06, "loss": 0.071, "step": 4516 }, { "epoch": 0.8636711281070746, "grad_norm": 1.701491355895996, "learning_rate": 5e-06, "loss": 0.0944, "step": 4517 }, { "epoch": 0.8638623326959847, "grad_norm": 1.7778617143630981, "learning_rate": 5e-06, "loss": 0.158, "step": 4518 }, { "epoch": 0.8640535372848949, "grad_norm": 2.2561211585998535, "learning_rate": 5e-06, "loss": 0.1649, "step": 4519 }, { "epoch": 0.864244741873805, "grad_norm": 1.6372724771499634, "learning_rate": 5e-06, "loss": 0.165, "step": 4520 }, { "epoch": 0.8644359464627152, "grad_norm": 1.8756248950958252, "learning_rate": 5e-06, "loss": 0.1654, "step": 4521 }, { "epoch": 0.8646271510516252, "grad_norm": 1.4479553699493408, "learning_rate": 5e-06, "loss": 0.2137, "step": 4522 }, { "epoch": 0.8648183556405353, "grad_norm": 1.0815504789352417, "learning_rate": 5e-06, "loss": 0.0721, "step": 4523 }, { "epoch": 0.8650095602294455, "grad_norm": 1.7075153589248657, "learning_rate": 5e-06, "loss": 0.0744, "step": 4524 }, { "epoch": 0.8652007648183556, "grad_norm": 2.4414803981781006, "learning_rate": 5e-06, "loss": 0.1563, "step": 4525 }, { "epoch": 0.8653919694072658, "grad_norm": 2.04904842376709, "learning_rate": 5e-06, "loss": 0.2584, "step": 4526 }, { "epoch": 0.8655831739961759, "grad_norm": 2.7054378986358643, "learning_rate": 5e-06, "loss": 0.4034, "step": 4527 }, { "epoch": 0.865774378585086, "grad_norm": 1.9329419136047363, "learning_rate": 5e-06, "loss": 0.1169, "step": 4528 }, { "epoch": 0.8659655831739962, "grad_norm": 2.828913450241089, "learning_rate": 5e-06, "loss": 0.4183, "step": 4529 }, { "epoch": 0.8661567877629063, "grad_norm": 2.64652419090271, "learning_rate": 5e-06, "loss": 0.0719, "step": 4530 }, { "epoch": 0.8663479923518165, "grad_norm": 1.249855399131775, "learning_rate": 5e-06, "loss": 0.0515, "step": 4531 }, { "epoch": 0.8665391969407266, "grad_norm": 2.1562163829803467, "learning_rate": 5e-06, "loss": 0.1446, "step": 4532 }, { "epoch": 0.8667304015296368, "grad_norm": 2.4176135063171387, "learning_rate": 5e-06, "loss": 0.4205, "step": 4533 }, { "epoch": 0.8669216061185469, "grad_norm": 1.6887050867080688, "learning_rate": 5e-06, "loss": 0.1691, "step": 4534 }, { "epoch": 0.8671128107074569, "grad_norm": 3.1317782402038574, "learning_rate": 5e-06, "loss": 0.2806, "step": 4535 }, { "epoch": 0.8673040152963671, "grad_norm": 2.647075891494751, "learning_rate": 5e-06, "loss": 0.3814, "step": 4536 }, { "epoch": 0.8674952198852772, "grad_norm": 1.211713194847107, "learning_rate": 5e-06, "loss": 0.0743, "step": 4537 }, { "epoch": 0.8676864244741874, "grad_norm": 1.462576150894165, "learning_rate": 5e-06, "loss": 0.1225, "step": 4538 }, { "epoch": 0.8678776290630975, "grad_norm": 2.324312925338745, "learning_rate": 5e-06, "loss": 0.2452, "step": 4539 }, { "epoch": 0.8680688336520076, "grad_norm": 1.4178894758224487, "learning_rate": 5e-06, "loss": 0.0916, "step": 4540 }, { "epoch": 0.8682600382409178, "grad_norm": 4.281589508056641, "learning_rate": 5e-06, "loss": 0.1797, "step": 4541 }, { "epoch": 0.8684512428298279, "grad_norm": 2.9054739475250244, "learning_rate": 5e-06, "loss": 0.1756, "step": 4542 }, { "epoch": 0.8686424474187381, "grad_norm": 1.1191303730010986, "learning_rate": 5e-06, "loss": 0.0303, "step": 4543 }, { "epoch": 0.8688336520076482, "grad_norm": 4.550809383392334, "learning_rate": 5e-06, "loss": 0.2272, "step": 4544 }, { "epoch": 0.8690248565965584, "grad_norm": 2.311413526535034, "learning_rate": 5e-06, "loss": 0.2753, "step": 4545 }, { "epoch": 0.8692160611854685, "grad_norm": 2.0396246910095215, "learning_rate": 5e-06, "loss": 0.2035, "step": 4546 }, { "epoch": 0.8694072657743785, "grad_norm": 3.7515199184417725, "learning_rate": 5e-06, "loss": 0.3386, "step": 4547 }, { "epoch": 0.8695984703632887, "grad_norm": 2.69758939743042, "learning_rate": 5e-06, "loss": 0.3004, "step": 4548 }, { "epoch": 0.8697896749521988, "grad_norm": 1.6221609115600586, "learning_rate": 5e-06, "loss": 0.1011, "step": 4549 }, { "epoch": 0.869980879541109, "grad_norm": 1.3611334562301636, "learning_rate": 5e-06, "loss": 0.0866, "step": 4550 }, { "epoch": 0.8701720841300191, "grad_norm": 2.460482120513916, "learning_rate": 5e-06, "loss": 0.3597, "step": 4551 }, { "epoch": 0.8703632887189292, "grad_norm": 1.2450191974639893, "learning_rate": 5e-06, "loss": 0.1289, "step": 4552 }, { "epoch": 0.8705544933078394, "grad_norm": 1.1274508237838745, "learning_rate": 5e-06, "loss": 0.0882, "step": 4553 }, { "epoch": 0.8707456978967495, "grad_norm": 1.571494221687317, "learning_rate": 5e-06, "loss": 0.2236, "step": 4554 }, { "epoch": 0.8709369024856597, "grad_norm": 1.62301504611969, "learning_rate": 5e-06, "loss": 0.0966, "step": 4555 }, { "epoch": 0.8711281070745698, "grad_norm": 0.9670308828353882, "learning_rate": 5e-06, "loss": 0.0481, "step": 4556 }, { "epoch": 0.87131931166348, "grad_norm": 1.0194993019104004, "learning_rate": 5e-06, "loss": 0.0995, "step": 4557 }, { "epoch": 0.8715105162523901, "grad_norm": 2.630286455154419, "learning_rate": 5e-06, "loss": 0.526, "step": 4558 }, { "epoch": 0.8717017208413002, "grad_norm": 2.8174970149993896, "learning_rate": 5e-06, "loss": 0.1966, "step": 4559 }, { "epoch": 0.8718929254302104, "grad_norm": 1.6453908681869507, "learning_rate": 5e-06, "loss": 0.1281, "step": 4560 }, { "epoch": 0.8720841300191204, "grad_norm": 1.6050821542739868, "learning_rate": 5e-06, "loss": 0.1121, "step": 4561 }, { "epoch": 0.8722753346080306, "grad_norm": 2.457448959350586, "learning_rate": 5e-06, "loss": 0.2686, "step": 4562 }, { "epoch": 0.8724665391969407, "grad_norm": 1.3800158500671387, "learning_rate": 5e-06, "loss": 0.0852, "step": 4563 }, { "epoch": 0.8726577437858508, "grad_norm": 3.040879726409912, "learning_rate": 5e-06, "loss": 0.4089, "step": 4564 }, { "epoch": 0.872848948374761, "grad_norm": 1.8485065698623657, "learning_rate": 5e-06, "loss": 0.1646, "step": 4565 }, { "epoch": 0.8730401529636711, "grad_norm": 1.5646363496780396, "learning_rate": 5e-06, "loss": 0.1381, "step": 4566 }, { "epoch": 0.8732313575525813, "grad_norm": 2.561878204345703, "learning_rate": 5e-06, "loss": 0.1572, "step": 4567 }, { "epoch": 0.8734225621414914, "grad_norm": 1.187648892402649, "learning_rate": 5e-06, "loss": 0.0962, "step": 4568 }, { "epoch": 0.8736137667304015, "grad_norm": 2.9988815784454346, "learning_rate": 5e-06, "loss": 0.2014, "step": 4569 }, { "epoch": 0.8738049713193117, "grad_norm": 1.185423731803894, "learning_rate": 5e-06, "loss": 0.1753, "step": 4570 }, { "epoch": 0.8739961759082218, "grad_norm": 1.1318641901016235, "learning_rate": 5e-06, "loss": 0.0889, "step": 4571 }, { "epoch": 0.874187380497132, "grad_norm": 1.3041385412216187, "learning_rate": 5e-06, "loss": 0.0902, "step": 4572 }, { "epoch": 0.874378585086042, "grad_norm": 2.061596632003784, "learning_rate": 5e-06, "loss": 0.3094, "step": 4573 }, { "epoch": 0.8745697896749522, "grad_norm": 1.7980538606643677, "learning_rate": 5e-06, "loss": 0.2838, "step": 4574 }, { "epoch": 0.8747609942638623, "grad_norm": 4.6933135986328125, "learning_rate": 5e-06, "loss": 0.1503, "step": 4575 }, { "epoch": 0.8749521988527724, "grad_norm": 2.4895479679107666, "learning_rate": 5e-06, "loss": 0.385, "step": 4576 }, { "epoch": 0.8751434034416826, "grad_norm": 2.2268948554992676, "learning_rate": 5e-06, "loss": 0.2536, "step": 4577 }, { "epoch": 0.8753346080305927, "grad_norm": 1.4584846496582031, "learning_rate": 5e-06, "loss": 0.0736, "step": 4578 }, { "epoch": 0.8755258126195029, "grad_norm": 1.1387372016906738, "learning_rate": 5e-06, "loss": 0.0894, "step": 4579 }, { "epoch": 0.875717017208413, "grad_norm": 1.2333475351333618, "learning_rate": 5e-06, "loss": 0.0964, "step": 4580 }, { "epoch": 0.8759082217973231, "grad_norm": 1.2063689231872559, "learning_rate": 5e-06, "loss": 0.0427, "step": 4581 }, { "epoch": 0.8760994263862333, "grad_norm": 2.055612087249756, "learning_rate": 5e-06, "loss": 0.3624, "step": 4582 }, { "epoch": 0.8762906309751434, "grad_norm": 1.719244360923767, "learning_rate": 5e-06, "loss": 0.1258, "step": 4583 }, { "epoch": 0.8764818355640536, "grad_norm": 1.0476865768432617, "learning_rate": 5e-06, "loss": 0.0592, "step": 4584 }, { "epoch": 0.8766730401529637, "grad_norm": 1.689634919166565, "learning_rate": 5e-06, "loss": 0.2333, "step": 4585 }, { "epoch": 0.8768642447418739, "grad_norm": 1.7837719917297363, "learning_rate": 5e-06, "loss": 0.135, "step": 4586 }, { "epoch": 0.8770554493307839, "grad_norm": 1.6960902214050293, "learning_rate": 5e-06, "loss": 0.0815, "step": 4587 }, { "epoch": 0.877246653919694, "grad_norm": 3.9408388137817383, "learning_rate": 5e-06, "loss": 0.3912, "step": 4588 }, { "epoch": 0.8774378585086042, "grad_norm": 2.848705530166626, "learning_rate": 5e-06, "loss": 0.2726, "step": 4589 }, { "epoch": 0.8776290630975143, "grad_norm": 1.9206777811050415, "learning_rate": 5e-06, "loss": 0.1705, "step": 4590 }, { "epoch": 0.8778202676864245, "grad_norm": 1.366650104522705, "learning_rate": 5e-06, "loss": 0.1402, "step": 4591 }, { "epoch": 0.8780114722753346, "grad_norm": 2.2299587726593018, "learning_rate": 5e-06, "loss": 0.1416, "step": 4592 }, { "epoch": 0.8782026768642447, "grad_norm": 1.1620739698410034, "learning_rate": 5e-06, "loss": 0.0649, "step": 4593 }, { "epoch": 0.8783938814531549, "grad_norm": 1.5828089714050293, "learning_rate": 5e-06, "loss": 0.129, "step": 4594 }, { "epoch": 0.878585086042065, "grad_norm": 1.6714706420898438, "learning_rate": 5e-06, "loss": 0.213, "step": 4595 }, { "epoch": 0.8787762906309752, "grad_norm": 2.159733533859253, "learning_rate": 5e-06, "loss": 0.1566, "step": 4596 }, { "epoch": 0.8789674952198853, "grad_norm": 4.393616676330566, "learning_rate": 5e-06, "loss": 0.2552, "step": 4597 }, { "epoch": 0.8791586998087955, "grad_norm": 1.0370787382125854, "learning_rate": 5e-06, "loss": 0.0785, "step": 4598 }, { "epoch": 0.8793499043977056, "grad_norm": 1.0146803855895996, "learning_rate": 5e-06, "loss": 0.0634, "step": 4599 }, { "epoch": 0.8795411089866156, "grad_norm": 2.365654945373535, "learning_rate": 5e-06, "loss": 0.1258, "step": 4600 }, { "epoch": 0.8797323135755258, "grad_norm": 2.3569753170013428, "learning_rate": 5e-06, "loss": 0.4793, "step": 4601 }, { "epoch": 0.8799235181644359, "grad_norm": 3.6621952056884766, "learning_rate": 5e-06, "loss": 0.6331, "step": 4602 }, { "epoch": 0.8801147227533461, "grad_norm": 2.0783820152282715, "learning_rate": 5e-06, "loss": 0.3076, "step": 4603 }, { "epoch": 0.8803059273422562, "grad_norm": 1.9632409811019897, "learning_rate": 5e-06, "loss": 0.0954, "step": 4604 }, { "epoch": 0.8804971319311663, "grad_norm": 1.1970844268798828, "learning_rate": 5e-06, "loss": 0.0962, "step": 4605 }, { "epoch": 0.8806883365200765, "grad_norm": 1.9226810932159424, "learning_rate": 5e-06, "loss": 0.0956, "step": 4606 }, { "epoch": 0.8808795411089866, "grad_norm": 2.701702833175659, "learning_rate": 5e-06, "loss": 0.4784, "step": 4607 }, { "epoch": 0.8810707456978968, "grad_norm": 1.3243826627731323, "learning_rate": 5e-06, "loss": 0.0847, "step": 4608 }, { "epoch": 0.8812619502868069, "grad_norm": 1.84163236618042, "learning_rate": 5e-06, "loss": 0.1919, "step": 4609 }, { "epoch": 0.8814531548757171, "grad_norm": 2.9473471641540527, "learning_rate": 5e-06, "loss": 0.2047, "step": 4610 }, { "epoch": 0.8816443594646272, "grad_norm": 1.4090523719787598, "learning_rate": 5e-06, "loss": 0.1088, "step": 4611 }, { "epoch": 0.8818355640535372, "grad_norm": 1.92579185962677, "learning_rate": 5e-06, "loss": 0.0899, "step": 4612 }, { "epoch": 0.8820267686424474, "grad_norm": 2.1861867904663086, "learning_rate": 5e-06, "loss": 0.283, "step": 4613 }, { "epoch": 0.8822179732313575, "grad_norm": 1.9572925567626953, "learning_rate": 5e-06, "loss": 0.2118, "step": 4614 }, { "epoch": 0.8824091778202677, "grad_norm": 1.4317727088928223, "learning_rate": 5e-06, "loss": 0.1227, "step": 4615 }, { "epoch": 0.8826003824091778, "grad_norm": 0.7707310914993286, "learning_rate": 5e-06, "loss": 0.0597, "step": 4616 }, { "epoch": 0.8827915869980879, "grad_norm": 3.6430459022521973, "learning_rate": 5e-06, "loss": 0.5254, "step": 4617 }, { "epoch": 0.8829827915869981, "grad_norm": 0.7240305542945862, "learning_rate": 5e-06, "loss": 0.035, "step": 4618 }, { "epoch": 0.8831739961759082, "grad_norm": 1.372766137123108, "learning_rate": 5e-06, "loss": 0.0557, "step": 4619 }, { "epoch": 0.8833652007648184, "grad_norm": 2.592245578765869, "learning_rate": 5e-06, "loss": 0.3585, "step": 4620 }, { "epoch": 0.8835564053537285, "grad_norm": 1.7263092994689941, "learning_rate": 5e-06, "loss": 0.1779, "step": 4621 }, { "epoch": 0.8837476099426386, "grad_norm": 1.6172292232513428, "learning_rate": 5e-06, "loss": 0.1043, "step": 4622 }, { "epoch": 0.8839388145315488, "grad_norm": 1.8567088842391968, "learning_rate": 5e-06, "loss": 0.0783, "step": 4623 }, { "epoch": 0.8841300191204589, "grad_norm": 1.0310090780258179, "learning_rate": 5e-06, "loss": 0.0906, "step": 4624 }, { "epoch": 0.884321223709369, "grad_norm": 2.3422353267669678, "learning_rate": 5e-06, "loss": 0.1059, "step": 4625 }, { "epoch": 0.8845124282982791, "grad_norm": 2.5427935123443604, "learning_rate": 5e-06, "loss": 0.4579, "step": 4626 }, { "epoch": 0.8847036328871893, "grad_norm": 1.8456017971038818, "learning_rate": 5e-06, "loss": 0.1279, "step": 4627 }, { "epoch": 0.8848948374760994, "grad_norm": 1.4140512943267822, "learning_rate": 5e-06, "loss": 0.0981, "step": 4628 }, { "epoch": 0.8850860420650095, "grad_norm": 1.278416395187378, "learning_rate": 5e-06, "loss": 0.1425, "step": 4629 }, { "epoch": 0.8852772466539197, "grad_norm": 0.722949206829071, "learning_rate": 5e-06, "loss": 0.0267, "step": 4630 }, { "epoch": 0.8854684512428298, "grad_norm": 1.8881372213363647, "learning_rate": 5e-06, "loss": 0.094, "step": 4631 }, { "epoch": 0.88565965583174, "grad_norm": 2.0543901920318604, "learning_rate": 5e-06, "loss": 0.1314, "step": 4632 }, { "epoch": 0.8858508604206501, "grad_norm": 2.746734619140625, "learning_rate": 5e-06, "loss": 0.4295, "step": 4633 }, { "epoch": 0.8860420650095602, "grad_norm": 1.3126775026321411, "learning_rate": 5e-06, "loss": 0.072, "step": 4634 }, { "epoch": 0.8862332695984704, "grad_norm": 2.1847662925720215, "learning_rate": 5e-06, "loss": 0.193, "step": 4635 }, { "epoch": 0.8864244741873805, "grad_norm": 2.4903340339660645, "learning_rate": 5e-06, "loss": 0.0869, "step": 4636 }, { "epoch": 0.8866156787762907, "grad_norm": 2.261547803878784, "learning_rate": 5e-06, "loss": 0.1426, "step": 4637 }, { "epoch": 0.8868068833652007, "grad_norm": 3.0043082237243652, "learning_rate": 5e-06, "loss": 0.1545, "step": 4638 }, { "epoch": 0.886998087954111, "grad_norm": 1.8434854745864868, "learning_rate": 5e-06, "loss": 0.2356, "step": 4639 }, { "epoch": 0.887189292543021, "grad_norm": 1.221226453781128, "learning_rate": 5e-06, "loss": 0.0681, "step": 4640 }, { "epoch": 0.8873804971319311, "grad_norm": 1.7801470756530762, "learning_rate": 5e-06, "loss": 0.0785, "step": 4641 }, { "epoch": 0.8875717017208413, "grad_norm": 1.2522605657577515, "learning_rate": 5e-06, "loss": 0.0611, "step": 4642 }, { "epoch": 0.8877629063097514, "grad_norm": 1.8315064907073975, "learning_rate": 5e-06, "loss": 0.0613, "step": 4643 }, { "epoch": 0.8879541108986616, "grad_norm": 1.2048170566558838, "learning_rate": 5e-06, "loss": 0.0696, "step": 4644 }, { "epoch": 0.8881453154875717, "grad_norm": 2.3258895874023438, "learning_rate": 5e-06, "loss": 0.3081, "step": 4645 }, { "epoch": 0.8883365200764818, "grad_norm": 1.9494102001190186, "learning_rate": 5e-06, "loss": 0.1162, "step": 4646 }, { "epoch": 0.888527724665392, "grad_norm": 3.9364311695098877, "learning_rate": 5e-06, "loss": 0.5175, "step": 4647 }, { "epoch": 0.8887189292543021, "grad_norm": 2.00209641456604, "learning_rate": 5e-06, "loss": 0.2127, "step": 4648 }, { "epoch": 0.8889101338432123, "grad_norm": 1.54474675655365, "learning_rate": 5e-06, "loss": 0.0865, "step": 4649 }, { "epoch": 0.8891013384321224, "grad_norm": 2.311948537826538, "learning_rate": 5e-06, "loss": 0.1372, "step": 4650 }, { "epoch": 0.8892925430210326, "grad_norm": 2.3398048877716064, "learning_rate": 5e-06, "loss": 0.2935, "step": 4651 }, { "epoch": 0.8894837476099426, "grad_norm": 3.875178575515747, "learning_rate": 5e-06, "loss": 0.4512, "step": 4652 }, { "epoch": 0.8896749521988527, "grad_norm": 1.4064953327178955, "learning_rate": 5e-06, "loss": 0.1605, "step": 4653 }, { "epoch": 0.8898661567877629, "grad_norm": 2.825348138809204, "learning_rate": 5e-06, "loss": 0.2991, "step": 4654 }, { "epoch": 0.890057361376673, "grad_norm": 1.4716694355010986, "learning_rate": 5e-06, "loss": 0.092, "step": 4655 }, { "epoch": 0.8902485659655832, "grad_norm": 1.371217966079712, "learning_rate": 5e-06, "loss": 0.1255, "step": 4656 }, { "epoch": 0.8904397705544933, "grad_norm": 2.445219039916992, "learning_rate": 5e-06, "loss": 0.1148, "step": 4657 }, { "epoch": 0.8906309751434034, "grad_norm": 2.0857105255126953, "learning_rate": 5e-06, "loss": 0.33, "step": 4658 }, { "epoch": 0.8908221797323136, "grad_norm": 1.8164504766464233, "learning_rate": 5e-06, "loss": 0.256, "step": 4659 }, { "epoch": 0.8910133843212237, "grad_norm": 1.9205447435379028, "learning_rate": 5e-06, "loss": 0.1559, "step": 4660 }, { "epoch": 0.8912045889101339, "grad_norm": 1.0629031658172607, "learning_rate": 5e-06, "loss": 0.054, "step": 4661 }, { "epoch": 0.891395793499044, "grad_norm": 2.1165101528167725, "learning_rate": 5e-06, "loss": 0.2257, "step": 4662 }, { "epoch": 0.8915869980879542, "grad_norm": 2.4531354904174805, "learning_rate": 5e-06, "loss": 0.2545, "step": 4663 }, { "epoch": 0.8917782026768643, "grad_norm": 1.8205480575561523, "learning_rate": 5e-06, "loss": 0.2478, "step": 4664 }, { "epoch": 0.8919694072657743, "grad_norm": 1.4492504596710205, "learning_rate": 5e-06, "loss": 0.1171, "step": 4665 }, { "epoch": 0.8921606118546845, "grad_norm": 2.0282204151153564, "learning_rate": 5e-06, "loss": 0.109, "step": 4666 }, { "epoch": 0.8923518164435946, "grad_norm": 1.1359578371047974, "learning_rate": 5e-06, "loss": 0.0532, "step": 4667 }, { "epoch": 0.8925430210325048, "grad_norm": 1.3664461374282837, "learning_rate": 5e-06, "loss": 0.1644, "step": 4668 }, { "epoch": 0.8927342256214149, "grad_norm": 2.3731942176818848, "learning_rate": 5e-06, "loss": 0.1219, "step": 4669 }, { "epoch": 0.892925430210325, "grad_norm": 1.5064061880111694, "learning_rate": 5e-06, "loss": 0.157, "step": 4670 }, { "epoch": 0.8931166347992352, "grad_norm": 3.036389112472534, "learning_rate": 5e-06, "loss": 0.3978, "step": 4671 }, { "epoch": 0.8933078393881453, "grad_norm": 1.1229474544525146, "learning_rate": 5e-06, "loss": 0.1008, "step": 4672 }, { "epoch": 0.8934990439770555, "grad_norm": 1.6844687461853027, "learning_rate": 5e-06, "loss": 0.2174, "step": 4673 }, { "epoch": 0.8936902485659656, "grad_norm": 1.0797251462936401, "learning_rate": 5e-06, "loss": 0.0696, "step": 4674 }, { "epoch": 0.8938814531548758, "grad_norm": 1.7389681339263916, "learning_rate": 5e-06, "loss": 0.1154, "step": 4675 }, { "epoch": 0.8940726577437859, "grad_norm": 1.4130902290344238, "learning_rate": 5e-06, "loss": 0.144, "step": 4676 }, { "epoch": 0.894263862332696, "grad_norm": 1.8311725854873657, "learning_rate": 5e-06, "loss": 0.2501, "step": 4677 }, { "epoch": 0.8944550669216061, "grad_norm": 3.1457982063293457, "learning_rate": 5e-06, "loss": 0.2693, "step": 4678 }, { "epoch": 0.8946462715105162, "grad_norm": 1.2318836450576782, "learning_rate": 5e-06, "loss": 0.1091, "step": 4679 }, { "epoch": 0.8948374760994264, "grad_norm": 1.2631968259811401, "learning_rate": 5e-06, "loss": 0.0692, "step": 4680 }, { "epoch": 0.8950286806883365, "grad_norm": 1.3267889022827148, "learning_rate": 5e-06, "loss": 0.1024, "step": 4681 }, { "epoch": 0.8952198852772466, "grad_norm": 3.25459361076355, "learning_rate": 5e-06, "loss": 0.4319, "step": 4682 }, { "epoch": 0.8954110898661568, "grad_norm": 3.177501678466797, "learning_rate": 5e-06, "loss": 0.7296, "step": 4683 }, { "epoch": 0.8956022944550669, "grad_norm": 2.4800193309783936, "learning_rate": 5e-06, "loss": 0.309, "step": 4684 }, { "epoch": 0.8957934990439771, "grad_norm": 1.386919379234314, "learning_rate": 5e-06, "loss": 0.1178, "step": 4685 }, { "epoch": 0.8959847036328872, "grad_norm": 0.9860485196113586, "learning_rate": 5e-06, "loss": 0.0742, "step": 4686 }, { "epoch": 0.8961759082217973, "grad_norm": 1.5006732940673828, "learning_rate": 5e-06, "loss": 0.0625, "step": 4687 }, { "epoch": 0.8963671128107075, "grad_norm": 1.8717820644378662, "learning_rate": 5e-06, "loss": 0.1307, "step": 4688 }, { "epoch": 0.8965583173996176, "grad_norm": 2.3897619247436523, "learning_rate": 5e-06, "loss": 0.6113, "step": 4689 }, { "epoch": 0.8967495219885278, "grad_norm": 1.6034753322601318, "learning_rate": 5e-06, "loss": 0.1415, "step": 4690 }, { "epoch": 0.8969407265774378, "grad_norm": 2.3642208576202393, "learning_rate": 5e-06, "loss": 0.3357, "step": 4691 }, { "epoch": 0.897131931166348, "grad_norm": 2.514803409576416, "learning_rate": 5e-06, "loss": 0.1529, "step": 4692 }, { "epoch": 0.8973231357552581, "grad_norm": 1.097195029258728, "learning_rate": 5e-06, "loss": 0.0548, "step": 4693 }, { "epoch": 0.8975143403441682, "grad_norm": 1.1098480224609375, "learning_rate": 5e-06, "loss": 0.0912, "step": 4694 }, { "epoch": 0.8977055449330784, "grad_norm": 2.1470210552215576, "learning_rate": 5e-06, "loss": 0.3154, "step": 4695 }, { "epoch": 0.8978967495219885, "grad_norm": 1.509371042251587, "learning_rate": 5e-06, "loss": 0.1229, "step": 4696 }, { "epoch": 0.8980879541108987, "grad_norm": 1.2485581636428833, "learning_rate": 5e-06, "loss": 0.1053, "step": 4697 }, { "epoch": 0.8982791586998088, "grad_norm": 1.2133026123046875, "learning_rate": 5e-06, "loss": 0.1103, "step": 4698 }, { "epoch": 0.8984703632887189, "grad_norm": 1.5444940328598022, "learning_rate": 5e-06, "loss": 0.1142, "step": 4699 }, { "epoch": 0.8986615678776291, "grad_norm": 2.6265218257904053, "learning_rate": 5e-06, "loss": 0.2122, "step": 4700 }, { "epoch": 0.8988527724665392, "grad_norm": 2.756988286972046, "learning_rate": 5e-06, "loss": 0.5817, "step": 4701 }, { "epoch": 0.8990439770554494, "grad_norm": 2.6874217987060547, "learning_rate": 5e-06, "loss": 0.3232, "step": 4702 }, { "epoch": 0.8992351816443594, "grad_norm": 2.7615904808044434, "learning_rate": 5e-06, "loss": 0.3782, "step": 4703 }, { "epoch": 0.8994263862332696, "grad_norm": 2.0286366939544678, "learning_rate": 5e-06, "loss": 0.2288, "step": 4704 }, { "epoch": 0.8996175908221797, "grad_norm": 1.6421332359313965, "learning_rate": 5e-06, "loss": 0.1077, "step": 4705 }, { "epoch": 0.8998087954110898, "grad_norm": 1.2805842161178589, "learning_rate": 5e-06, "loss": 0.0755, "step": 4706 }, { "epoch": 0.9, "grad_norm": 1.985251545906067, "learning_rate": 5e-06, "loss": 0.3243, "step": 4707 }, { "epoch": 0.9001912045889101, "grad_norm": 1.3914923667907715, "learning_rate": 5e-06, "loss": 0.2003, "step": 4708 }, { "epoch": 0.9003824091778203, "grad_norm": 2.9044227600097656, "learning_rate": 5e-06, "loss": 0.3824, "step": 4709 }, { "epoch": 0.9005736137667304, "grad_norm": 1.989436149597168, "learning_rate": 5e-06, "loss": 0.0969, "step": 4710 }, { "epoch": 0.9007648183556405, "grad_norm": 3.248147487640381, "learning_rate": 5e-06, "loss": 0.1397, "step": 4711 }, { "epoch": 0.9009560229445507, "grad_norm": 3.772764205932617, "learning_rate": 5e-06, "loss": 0.1007, "step": 4712 }, { "epoch": 0.9011472275334608, "grad_norm": 1.53439199924469, "learning_rate": 5e-06, "loss": 0.1392, "step": 4713 }, { "epoch": 0.901338432122371, "grad_norm": 2.0962002277374268, "learning_rate": 5e-06, "loss": 0.2381, "step": 4714 }, { "epoch": 0.9015296367112811, "grad_norm": 1.7860227823257446, "learning_rate": 5e-06, "loss": 0.0828, "step": 4715 }, { "epoch": 0.9017208413001913, "grad_norm": 2.519381284713745, "learning_rate": 5e-06, "loss": 0.2643, "step": 4716 }, { "epoch": 0.9019120458891013, "grad_norm": 2.8200933933258057, "learning_rate": 5e-06, "loss": 0.3501, "step": 4717 }, { "epoch": 0.9021032504780114, "grad_norm": 1.7289233207702637, "learning_rate": 5e-06, "loss": 0.1181, "step": 4718 }, { "epoch": 0.9022944550669216, "grad_norm": 2.348267078399658, "learning_rate": 5e-06, "loss": 0.2862, "step": 4719 }, { "epoch": 0.9024856596558317, "grad_norm": 1.9853914976119995, "learning_rate": 5e-06, "loss": 0.3011, "step": 4720 }, { "epoch": 0.9026768642447419, "grad_norm": 1.2755216360092163, "learning_rate": 5e-06, "loss": 0.1263, "step": 4721 }, { "epoch": 0.902868068833652, "grad_norm": 2.737816095352173, "learning_rate": 5e-06, "loss": 0.3493, "step": 4722 }, { "epoch": 0.9030592734225621, "grad_norm": 2.026895523071289, "learning_rate": 5e-06, "loss": 0.2523, "step": 4723 }, { "epoch": 0.9032504780114723, "grad_norm": 1.8737328052520752, "learning_rate": 5e-06, "loss": 0.1153, "step": 4724 }, { "epoch": 0.9034416826003824, "grad_norm": 2.0649120807647705, "learning_rate": 5e-06, "loss": 0.1239, "step": 4725 }, { "epoch": 0.9036328871892926, "grad_norm": 2.1456220149993896, "learning_rate": 5e-06, "loss": 0.1975, "step": 4726 }, { "epoch": 0.9038240917782027, "grad_norm": 1.953724980354309, "learning_rate": 5e-06, "loss": 0.1403, "step": 4727 }, { "epoch": 0.9040152963671129, "grad_norm": 1.8601813316345215, "learning_rate": 5e-06, "loss": 0.1244, "step": 4728 }, { "epoch": 0.904206500956023, "grad_norm": 2.1330983638763428, "learning_rate": 5e-06, "loss": 0.1379, "step": 4729 }, { "epoch": 0.904397705544933, "grad_norm": 2.6561925411224365, "learning_rate": 5e-06, "loss": 0.277, "step": 4730 }, { "epoch": 0.9045889101338432, "grad_norm": 2.5746819972991943, "learning_rate": 5e-06, "loss": 0.0965, "step": 4731 }, { "epoch": 0.9047801147227533, "grad_norm": 2.185016632080078, "learning_rate": 5e-06, "loss": 0.2095, "step": 4732 }, { "epoch": 0.9049713193116635, "grad_norm": 2.8116836547851562, "learning_rate": 5e-06, "loss": 0.5521, "step": 4733 }, { "epoch": 0.9051625239005736, "grad_norm": 2.427065134048462, "learning_rate": 5e-06, "loss": 0.2564, "step": 4734 }, { "epoch": 0.9053537284894837, "grad_norm": 2.053335666656494, "learning_rate": 5e-06, "loss": 0.272, "step": 4735 }, { "epoch": 0.9055449330783939, "grad_norm": 2.481733798980713, "learning_rate": 5e-06, "loss": 0.3331, "step": 4736 }, { "epoch": 0.905736137667304, "grad_norm": 2.1682722568511963, "learning_rate": 5e-06, "loss": 0.0942, "step": 4737 }, { "epoch": 0.9059273422562142, "grad_norm": 2.0562095642089844, "learning_rate": 5e-06, "loss": 0.1516, "step": 4738 }, { "epoch": 0.9061185468451243, "grad_norm": 1.898068904876709, "learning_rate": 5e-06, "loss": 0.2335, "step": 4739 }, { "epoch": 0.9063097514340345, "grad_norm": 1.218976378440857, "learning_rate": 5e-06, "loss": 0.085, "step": 4740 }, { "epoch": 0.9065009560229446, "grad_norm": 1.4753073453903198, "learning_rate": 5e-06, "loss": 0.181, "step": 4741 }, { "epoch": 0.9066921606118546, "grad_norm": 1.8766474723815918, "learning_rate": 5e-06, "loss": 0.1318, "step": 4742 }, { "epoch": 0.9068833652007648, "grad_norm": 1.521669626235962, "learning_rate": 5e-06, "loss": 0.0619, "step": 4743 }, { "epoch": 0.9070745697896749, "grad_norm": 1.3307064771652222, "learning_rate": 5e-06, "loss": 0.1082, "step": 4744 }, { "epoch": 0.9072657743785851, "grad_norm": 2.4466915130615234, "learning_rate": 5e-06, "loss": 0.6762, "step": 4745 }, { "epoch": 0.9074569789674952, "grad_norm": 1.5775731801986694, "learning_rate": 5e-06, "loss": 0.261, "step": 4746 }, { "epoch": 0.9076481835564053, "grad_norm": 1.7367063760757446, "learning_rate": 5e-06, "loss": 0.217, "step": 4747 }, { "epoch": 0.9078393881453155, "grad_norm": 2.7421231269836426, "learning_rate": 5e-06, "loss": 0.2712, "step": 4748 }, { "epoch": 0.9080305927342256, "grad_norm": 1.4664653539657593, "learning_rate": 5e-06, "loss": 0.1045, "step": 4749 }, { "epoch": 0.9082217973231358, "grad_norm": 6.154154300689697, "learning_rate": 5e-06, "loss": 0.1511, "step": 4750 }, { "epoch": 0.9084130019120459, "grad_norm": 1.722429871559143, "learning_rate": 5e-06, "loss": 0.1382, "step": 4751 }, { "epoch": 0.908604206500956, "grad_norm": 3.028075695037842, "learning_rate": 5e-06, "loss": 0.5589, "step": 4752 }, { "epoch": 0.9087954110898662, "grad_norm": 1.7719171047210693, "learning_rate": 5e-06, "loss": 0.2481, "step": 4753 }, { "epoch": 0.9089866156787763, "grad_norm": 2.284649610519409, "learning_rate": 5e-06, "loss": 0.3038, "step": 4754 }, { "epoch": 0.9091778202676865, "grad_norm": 1.697432041168213, "learning_rate": 5e-06, "loss": 0.093, "step": 4755 }, { "epoch": 0.9093690248565965, "grad_norm": 1.257247805595398, "learning_rate": 5e-06, "loss": 0.1112, "step": 4756 }, { "epoch": 0.9095602294455067, "grad_norm": 4.2492523193359375, "learning_rate": 5e-06, "loss": 0.2155, "step": 4757 }, { "epoch": 0.9097514340344168, "grad_norm": 1.3657695055007935, "learning_rate": 5e-06, "loss": 0.2232, "step": 4758 }, { "epoch": 0.9099426386233269, "grad_norm": 1.7062994241714478, "learning_rate": 5e-06, "loss": 0.2433, "step": 4759 }, { "epoch": 0.9101338432122371, "grad_norm": 1.1820368766784668, "learning_rate": 5e-06, "loss": 0.0536, "step": 4760 }, { "epoch": 0.9103250478011472, "grad_norm": 1.561043620109558, "learning_rate": 5e-06, "loss": 0.0761, "step": 4761 }, { "epoch": 0.9105162523900574, "grad_norm": 2.44998836517334, "learning_rate": 5e-06, "loss": 0.2631, "step": 4762 }, { "epoch": 0.9107074569789675, "grad_norm": 2.2695345878601074, "learning_rate": 5e-06, "loss": 0.2851, "step": 4763 }, { "epoch": 0.9108986615678776, "grad_norm": 2.3107171058654785, "learning_rate": 5e-06, "loss": 0.3215, "step": 4764 }, { "epoch": 0.9110898661567878, "grad_norm": 2.1133556365966797, "learning_rate": 5e-06, "loss": 0.2423, "step": 4765 }, { "epoch": 0.9112810707456979, "grad_norm": 1.5830267667770386, "learning_rate": 5e-06, "loss": 0.1177, "step": 4766 }, { "epoch": 0.9114722753346081, "grad_norm": 2.6378262042999268, "learning_rate": 5e-06, "loss": 0.1486, "step": 4767 }, { "epoch": 0.9116634799235181, "grad_norm": 1.1338980197906494, "learning_rate": 5e-06, "loss": 0.0795, "step": 4768 }, { "epoch": 0.9118546845124283, "grad_norm": 2.1762497425079346, "learning_rate": 5e-06, "loss": 0.1608, "step": 4769 }, { "epoch": 0.9120458891013384, "grad_norm": 2.5121426582336426, "learning_rate": 5e-06, "loss": 0.3784, "step": 4770 }, { "epoch": 0.9122370936902485, "grad_norm": 2.436002731323242, "learning_rate": 5e-06, "loss": 0.3143, "step": 4771 }, { "epoch": 0.9124282982791587, "grad_norm": 2.774428129196167, "learning_rate": 5e-06, "loss": 0.3945, "step": 4772 }, { "epoch": 0.9126195028680688, "grad_norm": 1.4498441219329834, "learning_rate": 5e-06, "loss": 0.1016, "step": 4773 }, { "epoch": 0.912810707456979, "grad_norm": 2.4954347610473633, "learning_rate": 5e-06, "loss": 0.0956, "step": 4774 }, { "epoch": 0.9130019120458891, "grad_norm": 1.8431687355041504, "learning_rate": 5e-06, "loss": 0.0724, "step": 4775 }, { "epoch": 0.9131931166347992, "grad_norm": 1.2787989377975464, "learning_rate": 5e-06, "loss": 0.1331, "step": 4776 }, { "epoch": 0.9133843212237094, "grad_norm": 1.2174687385559082, "learning_rate": 5e-06, "loss": 0.1211, "step": 4777 }, { "epoch": 0.9135755258126195, "grad_norm": 2.3031578063964844, "learning_rate": 5e-06, "loss": 0.3992, "step": 4778 }, { "epoch": 0.9137667304015297, "grad_norm": 2.224651336669922, "learning_rate": 5e-06, "loss": 0.3414, "step": 4779 }, { "epoch": 0.9139579349904398, "grad_norm": 1.978184700012207, "learning_rate": 5e-06, "loss": 0.2514, "step": 4780 }, { "epoch": 0.91414913957935, "grad_norm": 1.6605628728866577, "learning_rate": 5e-06, "loss": 0.0844, "step": 4781 }, { "epoch": 0.91434034416826, "grad_norm": 1.5570906400680542, "learning_rate": 5e-06, "loss": 0.1116, "step": 4782 }, { "epoch": 0.9145315487571701, "grad_norm": 1.553684949874878, "learning_rate": 5e-06, "loss": 0.1289, "step": 4783 }, { "epoch": 0.9147227533460803, "grad_norm": 1.4999451637268066, "learning_rate": 5e-06, "loss": 0.0977, "step": 4784 }, { "epoch": 0.9149139579349904, "grad_norm": 1.102944254875183, "learning_rate": 5e-06, "loss": 0.0922, "step": 4785 }, { "epoch": 0.9151051625239006, "grad_norm": 1.5913718938827515, "learning_rate": 5e-06, "loss": 0.0704, "step": 4786 }, { "epoch": 0.9152963671128107, "grad_norm": 1.003926396369934, "learning_rate": 5e-06, "loss": 0.0447, "step": 4787 }, { "epoch": 0.9154875717017208, "grad_norm": 3.418393135070801, "learning_rate": 5e-06, "loss": 0.5752, "step": 4788 }, { "epoch": 0.915678776290631, "grad_norm": 1.964396595954895, "learning_rate": 5e-06, "loss": 0.374, "step": 4789 }, { "epoch": 0.9158699808795411, "grad_norm": 2.352846145629883, "learning_rate": 5e-06, "loss": 0.2557, "step": 4790 }, { "epoch": 0.9160611854684513, "grad_norm": 1.851499319076538, "learning_rate": 5e-06, "loss": 0.1095, "step": 4791 }, { "epoch": 0.9162523900573614, "grad_norm": 2.526007652282715, "learning_rate": 5e-06, "loss": 0.1511, "step": 4792 }, { "epoch": 0.9164435946462716, "grad_norm": 2.6899187564849854, "learning_rate": 5e-06, "loss": 0.2373, "step": 4793 }, { "epoch": 0.9166347992351817, "grad_norm": 1.4613804817199707, "learning_rate": 5e-06, "loss": 0.0741, "step": 4794 }, { "epoch": 0.9168260038240917, "grad_norm": 1.7826400995254517, "learning_rate": 5e-06, "loss": 0.1562, "step": 4795 }, { "epoch": 0.9170172084130019, "grad_norm": 2.50441575050354, "learning_rate": 5e-06, "loss": 0.1662, "step": 4796 }, { "epoch": 0.917208413001912, "grad_norm": 1.1559553146362305, "learning_rate": 5e-06, "loss": 0.0645, "step": 4797 }, { "epoch": 0.9173996175908222, "grad_norm": 1.493688702583313, "learning_rate": 5e-06, "loss": 0.1094, "step": 4798 }, { "epoch": 0.9175908221797323, "grad_norm": 2.530729055404663, "learning_rate": 5e-06, "loss": 0.127, "step": 4799 }, { "epoch": 0.9177820267686424, "grad_norm": 2.9667983055114746, "learning_rate": 5e-06, "loss": 0.1215, "step": 4800 }, { "epoch": 0.9179732313575526, "grad_norm": 1.1197313070297241, "learning_rate": 5e-06, "loss": 0.0769, "step": 4801 }, { "epoch": 0.9181644359464627, "grad_norm": 1.4685611724853516, "learning_rate": 5e-06, "loss": 0.1881, "step": 4802 }, { "epoch": 0.9183556405353729, "grad_norm": 1.4080411195755005, "learning_rate": 5e-06, "loss": 0.1195, "step": 4803 }, { "epoch": 0.918546845124283, "grad_norm": 3.9118635654449463, "learning_rate": 5e-06, "loss": 0.3449, "step": 4804 }, { "epoch": 0.9187380497131931, "grad_norm": 1.7937930822372437, "learning_rate": 5e-06, "loss": 0.2372, "step": 4805 }, { "epoch": 0.9189292543021033, "grad_norm": 1.3490447998046875, "learning_rate": 5e-06, "loss": 0.0773, "step": 4806 }, { "epoch": 0.9191204588910133, "grad_norm": 2.1110494136810303, "learning_rate": 5e-06, "loss": 0.2215, "step": 4807 }, { "epoch": 0.9193116634799235, "grad_norm": 2.0199520587921143, "learning_rate": 5e-06, "loss": 0.2441, "step": 4808 }, { "epoch": 0.9195028680688336, "grad_norm": 1.7356966733932495, "learning_rate": 5e-06, "loss": 0.1303, "step": 4809 }, { "epoch": 0.9196940726577438, "grad_norm": 1.952366828918457, "learning_rate": 5e-06, "loss": 0.1433, "step": 4810 }, { "epoch": 0.9198852772466539, "grad_norm": 2.1498913764953613, "learning_rate": 5e-06, "loss": 0.1087, "step": 4811 }, { "epoch": 0.920076481835564, "grad_norm": 1.1379492282867432, "learning_rate": 5e-06, "loss": 0.0524, "step": 4812 }, { "epoch": 0.9202676864244742, "grad_norm": 2.057628870010376, "learning_rate": 5e-06, "loss": 0.1946, "step": 4813 }, { "epoch": 0.9204588910133843, "grad_norm": 2.6569430828094482, "learning_rate": 5e-06, "loss": 0.4691, "step": 4814 }, { "epoch": 0.9206500956022945, "grad_norm": 2.3530871868133545, "learning_rate": 5e-06, "loss": 0.2668, "step": 4815 }, { "epoch": 0.9208413001912046, "grad_norm": 2.5906331539154053, "learning_rate": 5e-06, "loss": 0.2775, "step": 4816 }, { "epoch": 0.9210325047801147, "grad_norm": 1.04031503200531, "learning_rate": 5e-06, "loss": 0.0453, "step": 4817 }, { "epoch": 0.9212237093690249, "grad_norm": 1.8329519033432007, "learning_rate": 5e-06, "loss": 0.0703, "step": 4818 }, { "epoch": 0.921414913957935, "grad_norm": 1.4083631038665771, "learning_rate": 5e-06, "loss": 0.0891, "step": 4819 }, { "epoch": 0.9216061185468452, "grad_norm": 2.5658862590789795, "learning_rate": 5e-06, "loss": 0.4482, "step": 4820 }, { "epoch": 0.9217973231357552, "grad_norm": 2.2376391887664795, "learning_rate": 5e-06, "loss": 0.2053, "step": 4821 }, { "epoch": 0.9219885277246654, "grad_norm": 1.5132383108139038, "learning_rate": 5e-06, "loss": 0.1199, "step": 4822 }, { "epoch": 0.9221797323135755, "grad_norm": 1.9439269304275513, "learning_rate": 5e-06, "loss": 0.2446, "step": 4823 }, { "epoch": 0.9223709369024856, "grad_norm": 1.8772526979446411, "learning_rate": 5e-06, "loss": 0.0855, "step": 4824 }, { "epoch": 0.9225621414913958, "grad_norm": 0.8960617184638977, "learning_rate": 5e-06, "loss": 0.0342, "step": 4825 }, { "epoch": 0.9227533460803059, "grad_norm": 2.050640106201172, "learning_rate": 5e-06, "loss": 0.3478, "step": 4826 }, { "epoch": 0.9229445506692161, "grad_norm": 1.714992880821228, "learning_rate": 5e-06, "loss": 0.1056, "step": 4827 }, { "epoch": 0.9231357552581262, "grad_norm": 1.1834768056869507, "learning_rate": 5e-06, "loss": 0.0615, "step": 4828 }, { "epoch": 0.9233269598470363, "grad_norm": 1.3316147327423096, "learning_rate": 5e-06, "loss": 0.0867, "step": 4829 }, { "epoch": 0.9235181644359465, "grad_norm": 2.6052935123443604, "learning_rate": 5e-06, "loss": 0.1191, "step": 4830 }, { "epoch": 0.9237093690248566, "grad_norm": 4.509915828704834, "learning_rate": 5e-06, "loss": 0.3938, "step": 4831 }, { "epoch": 0.9239005736137668, "grad_norm": 2.1398138999938965, "learning_rate": 5e-06, "loss": 0.3432, "step": 4832 }, { "epoch": 0.9240917782026769, "grad_norm": 2.1571106910705566, "learning_rate": 5e-06, "loss": 0.3267, "step": 4833 }, { "epoch": 0.924282982791587, "grad_norm": 1.6129310131072998, "learning_rate": 5e-06, "loss": 0.1886, "step": 4834 }, { "epoch": 0.9244741873804971, "grad_norm": 1.8231936693191528, "learning_rate": 5e-06, "loss": 0.1287, "step": 4835 }, { "epoch": 0.9246653919694072, "grad_norm": 1.4259830713272095, "learning_rate": 5e-06, "loss": 0.065, "step": 4836 }, { "epoch": 0.9248565965583174, "grad_norm": 1.6335722208023071, "learning_rate": 5e-06, "loss": 0.0964, "step": 4837 }, { "epoch": 0.9250478011472275, "grad_norm": 2.2979252338409424, "learning_rate": 5e-06, "loss": 0.0973, "step": 4838 }, { "epoch": 0.9252390057361377, "grad_norm": 2.0486056804656982, "learning_rate": 5e-06, "loss": 0.4276, "step": 4839 }, { "epoch": 0.9254302103250478, "grad_norm": 0.9143030047416687, "learning_rate": 5e-06, "loss": 0.0769, "step": 4840 }, { "epoch": 0.9256214149139579, "grad_norm": 2.2786619663238525, "learning_rate": 5e-06, "loss": 0.2726, "step": 4841 }, { "epoch": 0.9258126195028681, "grad_norm": 0.9805214405059814, "learning_rate": 5e-06, "loss": 0.0744, "step": 4842 }, { "epoch": 0.9260038240917782, "grad_norm": 1.4829126596450806, "learning_rate": 5e-06, "loss": 0.0954, "step": 4843 }, { "epoch": 0.9261950286806884, "grad_norm": 1.7729511260986328, "learning_rate": 5e-06, "loss": 0.1219, "step": 4844 }, { "epoch": 0.9263862332695985, "grad_norm": 2.1169512271881104, "learning_rate": 5e-06, "loss": 0.2112, "step": 4845 }, { "epoch": 0.9265774378585087, "grad_norm": 3.1143903732299805, "learning_rate": 5e-06, "loss": 0.1733, "step": 4846 }, { "epoch": 0.9267686424474187, "grad_norm": 1.8943371772766113, "learning_rate": 5e-06, "loss": 0.3117, "step": 4847 }, { "epoch": 0.9269598470363288, "grad_norm": 1.6305577754974365, "learning_rate": 5e-06, "loss": 0.169, "step": 4848 }, { "epoch": 0.927151051625239, "grad_norm": 0.9189224243164062, "learning_rate": 5e-06, "loss": 0.0763, "step": 4849 }, { "epoch": 0.9273422562141491, "grad_norm": 2.8332042694091797, "learning_rate": 5e-06, "loss": 0.2152, "step": 4850 }, { "epoch": 0.9275334608030593, "grad_norm": 5.684455394744873, "learning_rate": 5e-06, "loss": 0.2779, "step": 4851 }, { "epoch": 0.9277246653919694, "grad_norm": 1.2276694774627686, "learning_rate": 5e-06, "loss": 0.1409, "step": 4852 }, { "epoch": 0.9279158699808795, "grad_norm": 2.8665270805358887, "learning_rate": 5e-06, "loss": 0.3275, "step": 4853 }, { "epoch": 0.9281070745697897, "grad_norm": 1.772354245185852, "learning_rate": 5e-06, "loss": 0.1607, "step": 4854 }, { "epoch": 0.9282982791586998, "grad_norm": 2.3981382846832275, "learning_rate": 5e-06, "loss": 0.1545, "step": 4855 }, { "epoch": 0.92848948374761, "grad_norm": 1.4854987859725952, "learning_rate": 5e-06, "loss": 0.0896, "step": 4856 }, { "epoch": 0.9286806883365201, "grad_norm": 1.3724586963653564, "learning_rate": 5e-06, "loss": 0.1103, "step": 4857 }, { "epoch": 0.9288718929254303, "grad_norm": 1.819861650466919, "learning_rate": 5e-06, "loss": 0.2387, "step": 4858 }, { "epoch": 0.9290630975143404, "grad_norm": 1.742345929145813, "learning_rate": 5e-06, "loss": 0.1194, "step": 4859 }, { "epoch": 0.9292543021032504, "grad_norm": 1.5727949142456055, "learning_rate": 5e-06, "loss": 0.1547, "step": 4860 }, { "epoch": 0.9294455066921606, "grad_norm": 1.9011776447296143, "learning_rate": 5e-06, "loss": 0.2739, "step": 4861 }, { "epoch": 0.9296367112810707, "grad_norm": 1.4754587411880493, "learning_rate": 5e-06, "loss": 0.111, "step": 4862 }, { "epoch": 0.9298279158699809, "grad_norm": 1.5760308504104614, "learning_rate": 5e-06, "loss": 0.1089, "step": 4863 }, { "epoch": 0.930019120458891, "grad_norm": 2.719432830810547, "learning_rate": 5e-06, "loss": 0.4185, "step": 4864 }, { "epoch": 0.9302103250478011, "grad_norm": 2.2570371627807617, "learning_rate": 5e-06, "loss": 0.1839, "step": 4865 }, { "epoch": 0.9304015296367113, "grad_norm": 1.3954803943634033, "learning_rate": 5e-06, "loss": 0.099, "step": 4866 }, { "epoch": 0.9305927342256214, "grad_norm": 1.2225748300552368, "learning_rate": 5e-06, "loss": 0.1137, "step": 4867 }, { "epoch": 0.9307839388145316, "grad_norm": 0.926919162273407, "learning_rate": 5e-06, "loss": 0.0477, "step": 4868 }, { "epoch": 0.9309751434034417, "grad_norm": 1.8387597799301147, "learning_rate": 5e-06, "loss": 0.0733, "step": 4869 }, { "epoch": 0.9311663479923518, "grad_norm": 1.1521694660186768, "learning_rate": 5e-06, "loss": 0.1333, "step": 4870 }, { "epoch": 0.931357552581262, "grad_norm": 1.0635244846343994, "learning_rate": 5e-06, "loss": 0.1047, "step": 4871 }, { "epoch": 0.931548757170172, "grad_norm": 1.6397044658660889, "learning_rate": 5e-06, "loss": 0.1682, "step": 4872 }, { "epoch": 0.9317399617590822, "grad_norm": 1.338733434677124, "learning_rate": 5e-06, "loss": 0.1432, "step": 4873 }, { "epoch": 0.9319311663479923, "grad_norm": 1.307789921760559, "learning_rate": 5e-06, "loss": 0.0916, "step": 4874 }, { "epoch": 0.9321223709369025, "grad_norm": 2.0790135860443115, "learning_rate": 5e-06, "loss": 0.119, "step": 4875 }, { "epoch": 0.9323135755258126, "grad_norm": 1.5820248126983643, "learning_rate": 5e-06, "loss": 0.206, "step": 4876 }, { "epoch": 0.9325047801147227, "grad_norm": 1.5666821002960205, "learning_rate": 5e-06, "loss": 0.1286, "step": 4877 }, { "epoch": 0.9326959847036329, "grad_norm": 1.4068495035171509, "learning_rate": 5e-06, "loss": 0.2069, "step": 4878 }, { "epoch": 0.932887189292543, "grad_norm": 1.985937237739563, "learning_rate": 5e-06, "loss": 0.1667, "step": 4879 }, { "epoch": 0.9330783938814532, "grad_norm": 1.2681323289871216, "learning_rate": 5e-06, "loss": 0.0669, "step": 4880 }, { "epoch": 0.9332695984703633, "grad_norm": 1.4360939264297485, "learning_rate": 5e-06, "loss": 0.0561, "step": 4881 }, { "epoch": 0.9334608030592734, "grad_norm": 1.2678735256195068, "learning_rate": 5e-06, "loss": 0.1019, "step": 4882 }, { "epoch": 0.9336520076481836, "grad_norm": 2.8025400638580322, "learning_rate": 5e-06, "loss": 0.4316, "step": 4883 }, { "epoch": 0.9338432122370937, "grad_norm": 2.1860554218292236, "learning_rate": 5e-06, "loss": 0.1563, "step": 4884 }, { "epoch": 0.9340344168260039, "grad_norm": 1.1986353397369385, "learning_rate": 5e-06, "loss": 0.099, "step": 4885 }, { "epoch": 0.9342256214149139, "grad_norm": 0.7431411743164062, "learning_rate": 5e-06, "loss": 0.0325, "step": 4886 }, { "epoch": 0.9344168260038241, "grad_norm": 1.0779439210891724, "learning_rate": 5e-06, "loss": 0.0742, "step": 4887 }, { "epoch": 0.9346080305927342, "grad_norm": 1.574372410774231, "learning_rate": 5e-06, "loss": 0.1254, "step": 4888 }, { "epoch": 0.9347992351816443, "grad_norm": 1.4904820919036865, "learning_rate": 5e-06, "loss": 0.1166, "step": 4889 }, { "epoch": 0.9349904397705545, "grad_norm": 3.761897325515747, "learning_rate": 5e-06, "loss": 0.5072, "step": 4890 }, { "epoch": 0.9351816443594646, "grad_norm": 2.5792999267578125, "learning_rate": 5e-06, "loss": 0.3623, "step": 4891 }, { "epoch": 0.9353728489483748, "grad_norm": 2.6418819427490234, "learning_rate": 5e-06, "loss": 0.1922, "step": 4892 }, { "epoch": 0.9355640535372849, "grad_norm": 2.635206937789917, "learning_rate": 5e-06, "loss": 0.1172, "step": 4893 }, { "epoch": 0.935755258126195, "grad_norm": 2.728708028793335, "learning_rate": 5e-06, "loss": 0.3097, "step": 4894 }, { "epoch": 0.9359464627151052, "grad_norm": 2.1992111206054688, "learning_rate": 5e-06, "loss": 0.2703, "step": 4895 }, { "epoch": 0.9361376673040153, "grad_norm": 1.2282865047454834, "learning_rate": 5e-06, "loss": 0.1054, "step": 4896 }, { "epoch": 0.9363288718929255, "grad_norm": 1.8844170570373535, "learning_rate": 5e-06, "loss": 0.1161, "step": 4897 }, { "epoch": 0.9365200764818356, "grad_norm": 1.5336863994598389, "learning_rate": 5e-06, "loss": 0.109, "step": 4898 }, { "epoch": 0.9367112810707457, "grad_norm": 1.703996181488037, "learning_rate": 5e-06, "loss": 0.081, "step": 4899 }, { "epoch": 0.9369024856596558, "grad_norm": 1.7110828161239624, "learning_rate": 5e-06, "loss": 0.1016, "step": 4900 }, { "epoch": 0.9370936902485659, "grad_norm": 1.9495599269866943, "learning_rate": 5e-06, "loss": 0.3473, "step": 4901 }, { "epoch": 0.9372848948374761, "grad_norm": 2.5797476768493652, "learning_rate": 5e-06, "loss": 0.4117, "step": 4902 }, { "epoch": 0.9374760994263862, "grad_norm": 2.9593513011932373, "learning_rate": 5e-06, "loss": 0.3348, "step": 4903 }, { "epoch": 0.9376673040152964, "grad_norm": 1.2573286294937134, "learning_rate": 5e-06, "loss": 0.1025, "step": 4904 }, { "epoch": 0.9378585086042065, "grad_norm": 0.9126336574554443, "learning_rate": 5e-06, "loss": 0.0479, "step": 4905 }, { "epoch": 0.9380497131931166, "grad_norm": 2.2401199340820312, "learning_rate": 5e-06, "loss": 0.1684, "step": 4906 }, { "epoch": 0.9382409177820268, "grad_norm": 1.2739428281784058, "learning_rate": 5e-06, "loss": 0.1452, "step": 4907 }, { "epoch": 0.9384321223709369, "grad_norm": 1.4765453338623047, "learning_rate": 5e-06, "loss": 0.1207, "step": 4908 }, { "epoch": 0.9386233269598471, "grad_norm": 2.1147773265838623, "learning_rate": 5e-06, "loss": 0.1677, "step": 4909 }, { "epoch": 0.9388145315487572, "grad_norm": 1.5339463949203491, "learning_rate": 5e-06, "loss": 0.1036, "step": 4910 }, { "epoch": 0.9390057361376674, "grad_norm": 2.7874417304992676, "learning_rate": 5e-06, "loss": 0.1291, "step": 4911 }, { "epoch": 0.9391969407265774, "grad_norm": 1.52443265914917, "learning_rate": 5e-06, "loss": 0.0438, "step": 4912 }, { "epoch": 0.9393881453154875, "grad_norm": 2.6554689407348633, "learning_rate": 5e-06, "loss": 0.4022, "step": 4913 }, { "epoch": 0.9395793499043977, "grad_norm": 2.2543184757232666, "learning_rate": 5e-06, "loss": 0.3544, "step": 4914 }, { "epoch": 0.9397705544933078, "grad_norm": 3.414179563522339, "learning_rate": 5e-06, "loss": 0.5646, "step": 4915 }, { "epoch": 0.939961759082218, "grad_norm": 2.3218886852264404, "learning_rate": 5e-06, "loss": 0.2771, "step": 4916 }, { "epoch": 0.9401529636711281, "grad_norm": 1.0181642770767212, "learning_rate": 5e-06, "loss": 0.0543, "step": 4917 }, { "epoch": 0.9403441682600382, "grad_norm": 2.524704933166504, "learning_rate": 5e-06, "loss": 0.1449, "step": 4918 }, { "epoch": 0.9405353728489484, "grad_norm": 2.3251535892486572, "learning_rate": 5e-06, "loss": 0.1364, "step": 4919 }, { "epoch": 0.9407265774378585, "grad_norm": 2.4270453453063965, "learning_rate": 5e-06, "loss": 0.2367, "step": 4920 }, { "epoch": 0.9409177820267687, "grad_norm": 1.9268070459365845, "learning_rate": 5e-06, "loss": 0.2149, "step": 4921 }, { "epoch": 0.9411089866156788, "grad_norm": 1.8743796348571777, "learning_rate": 5e-06, "loss": 0.3339, "step": 4922 }, { "epoch": 0.941300191204589, "grad_norm": 1.756379246711731, "learning_rate": 5e-06, "loss": 0.119, "step": 4923 }, { "epoch": 0.941491395793499, "grad_norm": 0.8560100793838501, "learning_rate": 5e-06, "loss": 0.0507, "step": 4924 }, { "epoch": 0.9416826003824091, "grad_norm": 2.884312391281128, "learning_rate": 5e-06, "loss": 0.2991, "step": 4925 }, { "epoch": 0.9418738049713193, "grad_norm": 1.6877045631408691, "learning_rate": 5e-06, "loss": 0.2015, "step": 4926 }, { "epoch": 0.9420650095602294, "grad_norm": 1.8372879028320312, "learning_rate": 5e-06, "loss": 0.1594, "step": 4927 }, { "epoch": 0.9422562141491396, "grad_norm": 1.6219450235366821, "learning_rate": 5e-06, "loss": 0.1066, "step": 4928 }, { "epoch": 0.9424474187380497, "grad_norm": 3.6380860805511475, "learning_rate": 5e-06, "loss": 0.541, "step": 4929 }, { "epoch": 0.9426386233269598, "grad_norm": 2.0586066246032715, "learning_rate": 5e-06, "loss": 0.1329, "step": 4930 }, { "epoch": 0.94282982791587, "grad_norm": 1.7319679260253906, "learning_rate": 5e-06, "loss": 0.0845, "step": 4931 }, { "epoch": 0.9430210325047801, "grad_norm": 2.5363359451293945, "learning_rate": 5e-06, "loss": 0.4112, "step": 4932 }, { "epoch": 0.9432122370936903, "grad_norm": 1.9458513259887695, "learning_rate": 5e-06, "loss": 0.2529, "step": 4933 }, { "epoch": 0.9434034416826004, "grad_norm": 3.23052716255188, "learning_rate": 5e-06, "loss": 0.5106, "step": 4934 }, { "epoch": 0.9435946462715105, "grad_norm": 1.5255218744277954, "learning_rate": 5e-06, "loss": 0.1049, "step": 4935 }, { "epoch": 0.9437858508604207, "grad_norm": 2.298287868499756, "learning_rate": 5e-06, "loss": 0.3477, "step": 4936 }, { "epoch": 0.9439770554493307, "grad_norm": 1.1250810623168945, "learning_rate": 5e-06, "loss": 0.054, "step": 4937 }, { "epoch": 0.944168260038241, "grad_norm": 3.244180679321289, "learning_rate": 5e-06, "loss": 0.2005, "step": 4938 }, { "epoch": 0.944359464627151, "grad_norm": 3.1849043369293213, "learning_rate": 5e-06, "loss": 0.355, "step": 4939 }, { "epoch": 0.9445506692160612, "grad_norm": 2.582594156265259, "learning_rate": 5e-06, "loss": 0.1716, "step": 4940 }, { "epoch": 0.9447418738049713, "grad_norm": 1.0963287353515625, "learning_rate": 5e-06, "loss": 0.0854, "step": 4941 }, { "epoch": 0.9449330783938814, "grad_norm": 2.1446595191955566, "learning_rate": 5e-06, "loss": 0.2562, "step": 4942 }, { "epoch": 0.9451242829827916, "grad_norm": 1.8604848384857178, "learning_rate": 5e-06, "loss": 0.0994, "step": 4943 }, { "epoch": 0.9453154875717017, "grad_norm": 1.9539889097213745, "learning_rate": 5e-06, "loss": 0.0991, "step": 4944 }, { "epoch": 0.9455066921606119, "grad_norm": 2.2747185230255127, "learning_rate": 5e-06, "loss": 0.2482, "step": 4945 }, { "epoch": 0.945697896749522, "grad_norm": 1.741715669631958, "learning_rate": 5e-06, "loss": 0.2089, "step": 4946 }, { "epoch": 0.9458891013384321, "grad_norm": 1.2032427787780762, "learning_rate": 5e-06, "loss": 0.095, "step": 4947 }, { "epoch": 0.9460803059273423, "grad_norm": 1.8849596977233887, "learning_rate": 5e-06, "loss": 0.0805, "step": 4948 }, { "epoch": 0.9462715105162524, "grad_norm": 1.4403064250946045, "learning_rate": 5e-06, "loss": 0.1019, "step": 4949 }, { "epoch": 0.9464627151051626, "grad_norm": 1.0823032855987549, "learning_rate": 5e-06, "loss": 0.059, "step": 4950 }, { "epoch": 0.9466539196940726, "grad_norm": 3.9184749126434326, "learning_rate": 5e-06, "loss": 0.7392, "step": 4951 }, { "epoch": 0.9468451242829828, "grad_norm": 10.959403991699219, "learning_rate": 5e-06, "loss": 0.1341, "step": 4952 }, { "epoch": 0.9470363288718929, "grad_norm": 2.1564931869506836, "learning_rate": 5e-06, "loss": 0.2236, "step": 4953 }, { "epoch": 0.947227533460803, "grad_norm": 1.5129090547561646, "learning_rate": 5e-06, "loss": 0.1154, "step": 4954 }, { "epoch": 0.9474187380497132, "grad_norm": 0.9726706147193909, "learning_rate": 5e-06, "loss": 0.0421, "step": 4955 }, { "epoch": 0.9476099426386233, "grad_norm": 1.2113652229309082, "learning_rate": 5e-06, "loss": 0.0896, "step": 4956 }, { "epoch": 0.9478011472275335, "grad_norm": 1.5728042125701904, "learning_rate": 5e-06, "loss": 0.1368, "step": 4957 }, { "epoch": 0.9479923518164436, "grad_norm": 1.3641796112060547, "learning_rate": 5e-06, "loss": 0.1675, "step": 4958 }, { "epoch": 0.9481835564053537, "grad_norm": 1.7657511234283447, "learning_rate": 5e-06, "loss": 0.2036, "step": 4959 }, { "epoch": 0.9483747609942639, "grad_norm": 0.9892101883888245, "learning_rate": 5e-06, "loss": 0.0903, "step": 4960 }, { "epoch": 0.948565965583174, "grad_norm": 2.8380050659179688, "learning_rate": 5e-06, "loss": 0.3084, "step": 4961 }, { "epoch": 0.9487571701720842, "grad_norm": 0.8977944850921631, "learning_rate": 5e-06, "loss": 0.0524, "step": 4962 }, { "epoch": 0.9489483747609943, "grad_norm": 2.5370876789093018, "learning_rate": 5e-06, "loss": 0.407, "step": 4963 }, { "epoch": 0.9491395793499044, "grad_norm": 2.922842502593994, "learning_rate": 5e-06, "loss": 0.4577, "step": 4964 }, { "epoch": 0.9493307839388145, "grad_norm": 1.981605887413025, "learning_rate": 5e-06, "loss": 0.2596, "step": 4965 }, { "epoch": 0.9495219885277246, "grad_norm": 2.6923184394836426, "learning_rate": 5e-06, "loss": 0.4727, "step": 4966 }, { "epoch": 0.9497131931166348, "grad_norm": 1.4957002401351929, "learning_rate": 5e-06, "loss": 0.1353, "step": 4967 }, { "epoch": 0.9499043977055449, "grad_norm": 1.2848210334777832, "learning_rate": 5e-06, "loss": 0.078, "step": 4968 }, { "epoch": 0.9500956022944551, "grad_norm": 1.8343178033828735, "learning_rate": 5e-06, "loss": 0.1058, "step": 4969 }, { "epoch": 0.9502868068833652, "grad_norm": 1.764131784439087, "learning_rate": 5e-06, "loss": 0.2088, "step": 4970 }, { "epoch": 0.9504780114722753, "grad_norm": 1.4648293256759644, "learning_rate": 5e-06, "loss": 0.1017, "step": 4971 }, { "epoch": 0.9506692160611855, "grad_norm": 1.3409184217453003, "learning_rate": 5e-06, "loss": 0.0881, "step": 4972 }, { "epoch": 0.9508604206500956, "grad_norm": 1.3016523122787476, "learning_rate": 5e-06, "loss": 0.0451, "step": 4973 }, { "epoch": 0.9510516252390058, "grad_norm": 0.7061133980751038, "learning_rate": 5e-06, "loss": 0.0428, "step": 4974 }, { "epoch": 0.9512428298279159, "grad_norm": 3.5251235961914062, "learning_rate": 5e-06, "loss": 0.2068, "step": 4975 }, { "epoch": 0.9514340344168261, "grad_norm": 2.1218931674957275, "learning_rate": 5e-06, "loss": 0.3303, "step": 4976 }, { "epoch": 0.9516252390057361, "grad_norm": 2.1984875202178955, "learning_rate": 5e-06, "loss": 0.3954, "step": 4977 }, { "epoch": 0.9518164435946462, "grad_norm": 1.5826531648635864, "learning_rate": 5e-06, "loss": 0.1482, "step": 4978 }, { "epoch": 0.9520076481835564, "grad_norm": 1.824207067489624, "learning_rate": 5e-06, "loss": 0.1272, "step": 4979 }, { "epoch": 0.9521988527724665, "grad_norm": 1.0942171812057495, "learning_rate": 5e-06, "loss": 0.0596, "step": 4980 }, { "epoch": 0.9523900573613767, "grad_norm": 2.183330535888672, "learning_rate": 5e-06, "loss": 0.1098, "step": 4981 }, { "epoch": 0.9525812619502868, "grad_norm": 1.866705060005188, "learning_rate": 5e-06, "loss": 0.1239, "step": 4982 }, { "epoch": 0.9527724665391969, "grad_norm": 2.1280243396759033, "learning_rate": 5e-06, "loss": 0.2009, "step": 4983 }, { "epoch": 0.9529636711281071, "grad_norm": 1.8956713676452637, "learning_rate": 5e-06, "loss": 0.1752, "step": 4984 }, { "epoch": 0.9531548757170172, "grad_norm": 2.302192449569702, "learning_rate": 5e-06, "loss": 0.2851, "step": 4985 }, { "epoch": 0.9533460803059274, "grad_norm": 4.471398830413818, "learning_rate": 5e-06, "loss": 0.1626, "step": 4986 }, { "epoch": 0.9535372848948375, "grad_norm": 2.243290424346924, "learning_rate": 5e-06, "loss": 0.1037, "step": 4987 }, { "epoch": 0.9537284894837476, "grad_norm": 2.0457358360290527, "learning_rate": 5e-06, "loss": 0.2751, "step": 4988 }, { "epoch": 0.9539196940726578, "grad_norm": 1.7639656066894531, "learning_rate": 5e-06, "loss": 0.2408, "step": 4989 }, { "epoch": 0.9541108986615678, "grad_norm": 2.313894748687744, "learning_rate": 5e-06, "loss": 0.2289, "step": 4990 }, { "epoch": 0.954302103250478, "grad_norm": 1.8488069772720337, "learning_rate": 5e-06, "loss": 0.1356, "step": 4991 }, { "epoch": 0.9544933078393881, "grad_norm": 1.261997938156128, "learning_rate": 5e-06, "loss": 0.1232, "step": 4992 }, { "epoch": 0.9546845124282983, "grad_norm": 0.8408591747283936, "learning_rate": 5e-06, "loss": 0.0461, "step": 4993 }, { "epoch": 0.9548757170172084, "grad_norm": 2.0298595428466797, "learning_rate": 5e-06, "loss": 0.0984, "step": 4994 }, { "epoch": 0.9550669216061185, "grad_norm": 2.701509714126587, "learning_rate": 5e-06, "loss": 0.3663, "step": 4995 }, { "epoch": 0.9552581261950287, "grad_norm": 1.558736801147461, "learning_rate": 5e-06, "loss": 0.1501, "step": 4996 }, { "epoch": 0.9554493307839388, "grad_norm": 2.992225170135498, "learning_rate": 5e-06, "loss": 0.4301, "step": 4997 }, { "epoch": 0.955640535372849, "grad_norm": 1.2962523698806763, "learning_rate": 5e-06, "loss": 0.0935, "step": 4998 }, { "epoch": 0.9558317399617591, "grad_norm": 1.3460264205932617, "learning_rate": 5e-06, "loss": 0.0787, "step": 4999 }, { "epoch": 0.9560229445506692, "grad_norm": 2.1929922103881836, "learning_rate": 5e-06, "loss": 0.1547, "step": 5000 }, { "epoch": 0.9560229445506692, "eval_runtime": 742.0542, "eval_samples_per_second": 2.067, "eval_steps_per_second": 0.259, "step": 5000 }, { "epoch": 0.9562141491395794, "grad_norm": 1.8363579511642456, "learning_rate": 5e-06, "loss": 0.1593, "step": 5001 }, { "epoch": 0.9564053537284894, "grad_norm": 2.0634498596191406, "learning_rate": 5e-06, "loss": 0.1911, "step": 5002 }, { "epoch": 0.9565965583173996, "grad_norm": 1.5266684293746948, "learning_rate": 5e-06, "loss": 0.1115, "step": 5003 }, { "epoch": 0.9567877629063097, "grad_norm": 1.1533805131912231, "learning_rate": 5e-06, "loss": 0.1106, "step": 5004 }, { "epoch": 0.9569789674952199, "grad_norm": 2.2362139225006104, "learning_rate": 5e-06, "loss": 0.1925, "step": 5005 }, { "epoch": 0.95717017208413, "grad_norm": 2.0843067169189453, "learning_rate": 5e-06, "loss": 0.1154, "step": 5006 }, { "epoch": 0.9573613766730401, "grad_norm": 2.134740114212036, "learning_rate": 5e-06, "loss": 0.3089, "step": 5007 }, { "epoch": 0.9575525812619503, "grad_norm": 2.9222230911254883, "learning_rate": 5e-06, "loss": 0.4906, "step": 5008 }, { "epoch": 0.9577437858508604, "grad_norm": 2.486271619796753, "learning_rate": 5e-06, "loss": 0.3661, "step": 5009 }, { "epoch": 0.9579349904397706, "grad_norm": 0.9269048571586609, "learning_rate": 5e-06, "loss": 0.1277, "step": 5010 }, { "epoch": 0.9581261950286807, "grad_norm": 2.4355266094207764, "learning_rate": 5e-06, "loss": 0.1951, "step": 5011 }, { "epoch": 0.9583173996175908, "grad_norm": 2.3811750411987305, "learning_rate": 5e-06, "loss": 0.144, "step": 5012 }, { "epoch": 0.958508604206501, "grad_norm": 2.0431880950927734, "learning_rate": 5e-06, "loss": 0.1872, "step": 5013 }, { "epoch": 0.9586998087954111, "grad_norm": 3.104672908782959, "learning_rate": 5e-06, "loss": 0.4546, "step": 5014 }, { "epoch": 0.9588910133843213, "grad_norm": 2.163524866104126, "learning_rate": 5e-06, "loss": 0.1894, "step": 5015 }, { "epoch": 0.9590822179732313, "grad_norm": 2.0325043201446533, "learning_rate": 5e-06, "loss": 0.1353, "step": 5016 }, { "epoch": 0.9592734225621415, "grad_norm": 2.2608044147491455, "learning_rate": 5e-06, "loss": 0.2509, "step": 5017 }, { "epoch": 0.9594646271510516, "grad_norm": 1.2994120121002197, "learning_rate": 5e-06, "loss": 0.0878, "step": 5018 }, { "epoch": 0.9596558317399617, "grad_norm": 1.0766092538833618, "learning_rate": 5e-06, "loss": 0.0688, "step": 5019 }, { "epoch": 0.9598470363288719, "grad_norm": 2.5063343048095703, "learning_rate": 5e-06, "loss": 0.3423, "step": 5020 }, { "epoch": 0.960038240917782, "grad_norm": 2.1994948387145996, "learning_rate": 5e-06, "loss": 0.2923, "step": 5021 }, { "epoch": 0.9602294455066922, "grad_norm": 3.1960678100585938, "learning_rate": 5e-06, "loss": 0.4548, "step": 5022 }, { "epoch": 0.9604206500956023, "grad_norm": 1.9123889207839966, "learning_rate": 5e-06, "loss": 0.1785, "step": 5023 }, { "epoch": 0.9606118546845124, "grad_norm": 1.6679797172546387, "learning_rate": 5e-06, "loss": 0.1141, "step": 5024 }, { "epoch": 0.9608030592734226, "grad_norm": 1.2722249031066895, "learning_rate": 5e-06, "loss": 0.0643, "step": 5025 }, { "epoch": 0.9609942638623327, "grad_norm": 1.5316091775894165, "learning_rate": 5e-06, "loss": 0.2274, "step": 5026 }, { "epoch": 0.9611854684512429, "grad_norm": 1.5301028490066528, "learning_rate": 5e-06, "loss": 0.1516, "step": 5027 }, { "epoch": 0.961376673040153, "grad_norm": 1.963606357574463, "learning_rate": 5e-06, "loss": 0.2119, "step": 5028 }, { "epoch": 0.9615678776290632, "grad_norm": 1.5402876138687134, "learning_rate": 5e-06, "loss": 0.1275, "step": 5029 }, { "epoch": 0.9617590822179732, "grad_norm": 1.1363723278045654, "learning_rate": 5e-06, "loss": 0.0777, "step": 5030 }, { "epoch": 0.9619502868068833, "grad_norm": 1.909873604774475, "learning_rate": 5e-06, "loss": 0.1227, "step": 5031 }, { "epoch": 0.9621414913957935, "grad_norm": 1.3693710565567017, "learning_rate": 5e-06, "loss": 0.0944, "step": 5032 }, { "epoch": 0.9623326959847036, "grad_norm": 2.0859200954437256, "learning_rate": 5e-06, "loss": 0.2549, "step": 5033 }, { "epoch": 0.9625239005736138, "grad_norm": 1.5054364204406738, "learning_rate": 5e-06, "loss": 0.1717, "step": 5034 }, { "epoch": 0.9627151051625239, "grad_norm": 1.8584833145141602, "learning_rate": 5e-06, "loss": 0.1093, "step": 5035 }, { "epoch": 0.962906309751434, "grad_norm": 1.1875016689300537, "learning_rate": 5e-06, "loss": 0.0729, "step": 5036 }, { "epoch": 0.9630975143403442, "grad_norm": 1.1567950248718262, "learning_rate": 5e-06, "loss": 0.0653, "step": 5037 }, { "epoch": 0.9632887189292543, "grad_norm": 1.8108049631118774, "learning_rate": 5e-06, "loss": 0.1345, "step": 5038 }, { "epoch": 0.9634799235181645, "grad_norm": 3.5861775875091553, "learning_rate": 5e-06, "loss": 0.5882, "step": 5039 }, { "epoch": 0.9636711281070746, "grad_norm": 2.6124978065490723, "learning_rate": 5e-06, "loss": 0.1039, "step": 5040 }, { "epoch": 0.9638623326959848, "grad_norm": 1.3670365810394287, "learning_rate": 5e-06, "loss": 0.1041, "step": 5041 }, { "epoch": 0.9640535372848948, "grad_norm": 1.332395315170288, "learning_rate": 5e-06, "loss": 0.1791, "step": 5042 }, { "epoch": 0.9642447418738049, "grad_norm": 1.1727179288864136, "learning_rate": 5e-06, "loss": 0.1109, "step": 5043 }, { "epoch": 0.9644359464627151, "grad_norm": 1.3848689794540405, "learning_rate": 5e-06, "loss": 0.1115, "step": 5044 }, { "epoch": 0.9646271510516252, "grad_norm": 1.4246430397033691, "learning_rate": 5e-06, "loss": 0.099, "step": 5045 }, { "epoch": 0.9648183556405354, "grad_norm": 2.0380055904388428, "learning_rate": 5e-06, "loss": 0.2963, "step": 5046 }, { "epoch": 0.9650095602294455, "grad_norm": 2.6432945728302, "learning_rate": 5e-06, "loss": 0.236, "step": 5047 }, { "epoch": 0.9652007648183556, "grad_norm": 1.9849876165390015, "learning_rate": 5e-06, "loss": 0.2692, "step": 5048 }, { "epoch": 0.9653919694072658, "grad_norm": 1.9646875858306885, "learning_rate": 5e-06, "loss": 0.3539, "step": 5049 }, { "epoch": 0.9655831739961759, "grad_norm": 2.007697820663452, "learning_rate": 5e-06, "loss": 0.1226, "step": 5050 }, { "epoch": 0.9657743785850861, "grad_norm": 1.224379062652588, "learning_rate": 5e-06, "loss": 0.1355, "step": 5051 }, { "epoch": 0.9659655831739962, "grad_norm": 0.585542619228363, "learning_rate": 5e-06, "loss": 0.0688, "step": 5052 }, { "epoch": 0.9661567877629063, "grad_norm": 1.7798622846603394, "learning_rate": 5e-06, "loss": 0.2512, "step": 5053 }, { "epoch": 0.9663479923518165, "grad_norm": 1.9933887720108032, "learning_rate": 5e-06, "loss": 0.1716, "step": 5054 }, { "epoch": 0.9665391969407265, "grad_norm": 1.6793018579483032, "learning_rate": 5e-06, "loss": 0.1758, "step": 5055 }, { "epoch": 0.9667304015296367, "grad_norm": 1.5060217380523682, "learning_rate": 5e-06, "loss": 0.1141, "step": 5056 }, { "epoch": 0.9669216061185468, "grad_norm": 2.19502329826355, "learning_rate": 5e-06, "loss": 0.2314, "step": 5057 }, { "epoch": 0.967112810707457, "grad_norm": 1.7284197807312012, "learning_rate": 5e-06, "loss": 0.1213, "step": 5058 }, { "epoch": 0.9673040152963671, "grad_norm": 1.6519883871078491, "learning_rate": 5e-06, "loss": 0.1296, "step": 5059 }, { "epoch": 0.9674952198852772, "grad_norm": 2.585042715072632, "learning_rate": 5e-06, "loss": 0.2157, "step": 5060 }, { "epoch": 0.9676864244741874, "grad_norm": 2.122485637664795, "learning_rate": 5e-06, "loss": 0.3705, "step": 5061 }, { "epoch": 0.9678776290630975, "grad_norm": 1.6645944118499756, "learning_rate": 5e-06, "loss": 0.1524, "step": 5062 }, { "epoch": 0.9680688336520077, "grad_norm": 2.446328639984131, "learning_rate": 5e-06, "loss": 0.1761, "step": 5063 }, { "epoch": 0.9682600382409178, "grad_norm": 3.5966954231262207, "learning_rate": 5e-06, "loss": 0.5335, "step": 5064 }, { "epoch": 0.9684512428298279, "grad_norm": 1.5934321880340576, "learning_rate": 5e-06, "loss": 0.1176, "step": 5065 }, { "epoch": 0.9686424474187381, "grad_norm": 2.7553305625915527, "learning_rate": 5e-06, "loss": 0.2518, "step": 5066 }, { "epoch": 0.9688336520076481, "grad_norm": 1.3165737390518188, "learning_rate": 5e-06, "loss": 0.0789, "step": 5067 }, { "epoch": 0.9690248565965583, "grad_norm": 1.7967562675476074, "learning_rate": 5e-06, "loss": 0.1001, "step": 5068 }, { "epoch": 0.9692160611854684, "grad_norm": 1.7139383554458618, "learning_rate": 5e-06, "loss": 0.1541, "step": 5069 }, { "epoch": 0.9694072657743786, "grad_norm": 2.3538708686828613, "learning_rate": 5e-06, "loss": 0.3276, "step": 5070 }, { "epoch": 0.9695984703632887, "grad_norm": 2.8249590396881104, "learning_rate": 5e-06, "loss": 0.446, "step": 5071 }, { "epoch": 0.9697896749521988, "grad_norm": 1.2258639335632324, "learning_rate": 5e-06, "loss": 0.1059, "step": 5072 }, { "epoch": 0.969980879541109, "grad_norm": 1.419464111328125, "learning_rate": 5e-06, "loss": 0.0757, "step": 5073 }, { "epoch": 0.9701720841300191, "grad_norm": 1.7254623174667358, "learning_rate": 5e-06, "loss": 0.0864, "step": 5074 }, { "epoch": 0.9703632887189293, "grad_norm": 2.9186739921569824, "learning_rate": 5e-06, "loss": 0.2495, "step": 5075 }, { "epoch": 0.9705544933078394, "grad_norm": 1.3285423517227173, "learning_rate": 5e-06, "loss": 0.1859, "step": 5076 }, { "epoch": 0.9707456978967495, "grad_norm": 1.7535793781280518, "learning_rate": 5e-06, "loss": 0.2977, "step": 5077 }, { "epoch": 0.9709369024856597, "grad_norm": 1.533988356590271, "learning_rate": 5e-06, "loss": 0.1335, "step": 5078 }, { "epoch": 0.9711281070745698, "grad_norm": 1.5041394233703613, "learning_rate": 5e-06, "loss": 0.142, "step": 5079 }, { "epoch": 0.97131931166348, "grad_norm": 1.4372128248214722, "learning_rate": 5e-06, "loss": 0.0904, "step": 5080 }, { "epoch": 0.97151051625239, "grad_norm": 2.476388454437256, "learning_rate": 5e-06, "loss": 0.1434, "step": 5081 }, { "epoch": 0.9717017208413002, "grad_norm": 2.5645976066589355, "learning_rate": 5e-06, "loss": 0.2597, "step": 5082 }, { "epoch": 0.9718929254302103, "grad_norm": 1.5782102346420288, "learning_rate": 5e-06, "loss": 0.1355, "step": 5083 }, { "epoch": 0.9720841300191204, "grad_norm": 1.2604018449783325, "learning_rate": 5e-06, "loss": 0.2281, "step": 5084 }, { "epoch": 0.9722753346080306, "grad_norm": 1.2508944272994995, "learning_rate": 5e-06, "loss": 0.0719, "step": 5085 }, { "epoch": 0.9724665391969407, "grad_norm": 2.0485973358154297, "learning_rate": 5e-06, "loss": 0.0766, "step": 5086 }, { "epoch": 0.9726577437858509, "grad_norm": 1.1528031826019287, "learning_rate": 5e-06, "loss": 0.0832, "step": 5087 }, { "epoch": 0.972848948374761, "grad_norm": 1.789041519165039, "learning_rate": 5e-06, "loss": 0.1181, "step": 5088 }, { "epoch": 0.9730401529636711, "grad_norm": 1.92593514919281, "learning_rate": 5e-06, "loss": 0.2047, "step": 5089 }, { "epoch": 0.9732313575525813, "grad_norm": 1.1563007831573486, "learning_rate": 5e-06, "loss": 0.1133, "step": 5090 }, { "epoch": 0.9734225621414914, "grad_norm": 1.1158252954483032, "learning_rate": 5e-06, "loss": 0.0862, "step": 5091 }, { "epoch": 0.9736137667304016, "grad_norm": 1.5522339344024658, "learning_rate": 5e-06, "loss": 0.103, "step": 5092 }, { "epoch": 0.9738049713193117, "grad_norm": 2.535926342010498, "learning_rate": 5e-06, "loss": 0.2977, "step": 5093 }, { "epoch": 0.9739961759082219, "grad_norm": 1.2164846658706665, "learning_rate": 5e-06, "loss": 0.0662, "step": 5094 }, { "epoch": 0.9741873804971319, "grad_norm": 2.7936418056488037, "learning_rate": 5e-06, "loss": 0.4989, "step": 5095 }, { "epoch": 0.974378585086042, "grad_norm": 2.6002554893493652, "learning_rate": 5e-06, "loss": 0.3435, "step": 5096 }, { "epoch": 0.9745697896749522, "grad_norm": 2.3612749576568604, "learning_rate": 5e-06, "loss": 0.2797, "step": 5097 }, { "epoch": 0.9747609942638623, "grad_norm": 2.0107052326202393, "learning_rate": 5e-06, "loss": 0.1331, "step": 5098 }, { "epoch": 0.9749521988527725, "grad_norm": 1.1361372470855713, "learning_rate": 5e-06, "loss": 0.0784, "step": 5099 }, { "epoch": 0.9751434034416826, "grad_norm": 1.9277843236923218, "learning_rate": 5e-06, "loss": 0.1257, "step": 5100 }, { "epoch": 0.9753346080305927, "grad_norm": 1.3512511253356934, "learning_rate": 5e-06, "loss": 0.119, "step": 5101 }, { "epoch": 0.9755258126195029, "grad_norm": 3.0426383018493652, "learning_rate": 5e-06, "loss": 0.4038, "step": 5102 }, { "epoch": 0.975717017208413, "grad_norm": 2.047166347503662, "learning_rate": 5e-06, "loss": 0.1507, "step": 5103 }, { "epoch": 0.9759082217973232, "grad_norm": 1.3602712154388428, "learning_rate": 5e-06, "loss": 0.0953, "step": 5104 }, { "epoch": 0.9760994263862333, "grad_norm": 2.696289539337158, "learning_rate": 5e-06, "loss": 0.1088, "step": 5105 }, { "epoch": 0.9762906309751435, "grad_norm": 1.646152138710022, "learning_rate": 5e-06, "loss": 0.1026, "step": 5106 }, { "epoch": 0.9764818355640535, "grad_norm": 1.7276148796081543, "learning_rate": 5e-06, "loss": 0.1875, "step": 5107 }, { "epoch": 0.9766730401529636, "grad_norm": 2.928664445877075, "learning_rate": 5e-06, "loss": 0.1254, "step": 5108 }, { "epoch": 0.9768642447418738, "grad_norm": 2.494978904724121, "learning_rate": 5e-06, "loss": 0.3259, "step": 5109 }, { "epoch": 0.9770554493307839, "grad_norm": 1.8363969326019287, "learning_rate": 5e-06, "loss": 0.106, "step": 5110 }, { "epoch": 0.9772466539196941, "grad_norm": 3.1696293354034424, "learning_rate": 5e-06, "loss": 0.2118, "step": 5111 }, { "epoch": 0.9774378585086042, "grad_norm": 1.7869844436645508, "learning_rate": 5e-06, "loss": 0.1343, "step": 5112 }, { "epoch": 0.9776290630975143, "grad_norm": 1.5095562934875488, "learning_rate": 5e-06, "loss": 0.1248, "step": 5113 }, { "epoch": 0.9778202676864245, "grad_norm": 1.0771520137786865, "learning_rate": 5e-06, "loss": 0.1536, "step": 5114 }, { "epoch": 0.9780114722753346, "grad_norm": 1.9261468648910522, "learning_rate": 5e-06, "loss": 0.2512, "step": 5115 }, { "epoch": 0.9782026768642448, "grad_norm": 2.3080861568450928, "learning_rate": 5e-06, "loss": 0.1425, "step": 5116 }, { "epoch": 0.9783938814531549, "grad_norm": 2.382253885269165, "learning_rate": 5e-06, "loss": 0.1637, "step": 5117 }, { "epoch": 0.978585086042065, "grad_norm": 2.896066665649414, "learning_rate": 5e-06, "loss": 0.3333, "step": 5118 }, { "epoch": 0.9787762906309752, "grad_norm": 1.2175503969192505, "learning_rate": 5e-06, "loss": 0.0933, "step": 5119 }, { "epoch": 0.9789674952198852, "grad_norm": 1.7644296884536743, "learning_rate": 5e-06, "loss": 0.1718, "step": 5120 }, { "epoch": 0.9791586998087954, "grad_norm": 1.8412610292434692, "learning_rate": 5e-06, "loss": 0.1739, "step": 5121 }, { "epoch": 0.9793499043977055, "grad_norm": 1.3663194179534912, "learning_rate": 5e-06, "loss": 0.1117, "step": 5122 }, { "epoch": 0.9795411089866157, "grad_norm": 1.2034631967544556, "learning_rate": 5e-06, "loss": 0.1127, "step": 5123 }, { "epoch": 0.9797323135755258, "grad_norm": 2.052450180053711, "learning_rate": 5e-06, "loss": 0.1142, "step": 5124 }, { "epoch": 0.9799235181644359, "grad_norm": 2.3145527839660645, "learning_rate": 5e-06, "loss": 0.1296, "step": 5125 }, { "epoch": 0.9801147227533461, "grad_norm": 1.6712604761123657, "learning_rate": 5e-06, "loss": 0.2101, "step": 5126 }, { "epoch": 0.9803059273422562, "grad_norm": 2.388617992401123, "learning_rate": 5e-06, "loss": 0.2405, "step": 5127 }, { "epoch": 0.9804971319311664, "grad_norm": 3.332637071609497, "learning_rate": 5e-06, "loss": 0.286, "step": 5128 }, { "epoch": 0.9806883365200765, "grad_norm": 2.245945930480957, "learning_rate": 5e-06, "loss": 0.1992, "step": 5129 }, { "epoch": 0.9808795411089866, "grad_norm": 2.0014476776123047, "learning_rate": 5e-06, "loss": 0.2369, "step": 5130 }, { "epoch": 0.9810707456978968, "grad_norm": 2.185051441192627, "learning_rate": 5e-06, "loss": 0.124, "step": 5131 }, { "epoch": 0.9812619502868068, "grad_norm": 3.205618381500244, "learning_rate": 5e-06, "loss": 0.1862, "step": 5132 }, { "epoch": 0.981453154875717, "grad_norm": 2.2920567989349365, "learning_rate": 5e-06, "loss": 0.3075, "step": 5133 }, { "epoch": 0.9816443594646271, "grad_norm": 1.5987259149551392, "learning_rate": 5e-06, "loss": 0.0987, "step": 5134 }, { "epoch": 0.9818355640535373, "grad_norm": 1.4586559534072876, "learning_rate": 5e-06, "loss": 0.1095, "step": 5135 }, { "epoch": 0.9820267686424474, "grad_norm": 1.3265559673309326, "learning_rate": 5e-06, "loss": 0.0885, "step": 5136 }, { "epoch": 0.9822179732313575, "grad_norm": 1.8111308813095093, "learning_rate": 5e-06, "loss": 0.098, "step": 5137 }, { "epoch": 0.9824091778202677, "grad_norm": 1.9559201002120972, "learning_rate": 5e-06, "loss": 0.1827, "step": 5138 }, { "epoch": 0.9826003824091778, "grad_norm": 2.519045829772949, "learning_rate": 5e-06, "loss": 0.4279, "step": 5139 }, { "epoch": 0.982791586998088, "grad_norm": 1.0115805864334106, "learning_rate": 5e-06, "loss": 0.0554, "step": 5140 }, { "epoch": 0.9829827915869981, "grad_norm": 1.5444289445877075, "learning_rate": 5e-06, "loss": 0.1956, "step": 5141 }, { "epoch": 0.9831739961759082, "grad_norm": 1.1781493425369263, "learning_rate": 5e-06, "loss": 0.1124, "step": 5142 }, { "epoch": 0.9833652007648184, "grad_norm": 1.6144378185272217, "learning_rate": 5e-06, "loss": 0.1011, "step": 5143 }, { "epoch": 0.9835564053537285, "grad_norm": 1.777444839477539, "learning_rate": 5e-06, "loss": 0.3058, "step": 5144 }, { "epoch": 0.9837476099426387, "grad_norm": 2.543937921524048, "learning_rate": 5e-06, "loss": 0.3751, "step": 5145 }, { "epoch": 0.9839388145315487, "grad_norm": 2.032430648803711, "learning_rate": 5e-06, "loss": 0.2294, "step": 5146 }, { "epoch": 0.9841300191204589, "grad_norm": 1.2935893535614014, "learning_rate": 5e-06, "loss": 0.1531, "step": 5147 }, { "epoch": 0.984321223709369, "grad_norm": 1.9025208950042725, "learning_rate": 5e-06, "loss": 0.1284, "step": 5148 }, { "epoch": 0.9845124282982791, "grad_norm": 3.2536699771881104, "learning_rate": 5e-06, "loss": 0.3226, "step": 5149 }, { "epoch": 0.9847036328871893, "grad_norm": 2.4361836910247803, "learning_rate": 5e-06, "loss": 0.1494, "step": 5150 }, { "epoch": 0.9848948374760994, "grad_norm": 1.2773830890655518, "learning_rate": 5e-06, "loss": 0.102, "step": 5151 }, { "epoch": 0.9850860420650096, "grad_norm": 2.558335542678833, "learning_rate": 5e-06, "loss": 0.2727, "step": 5152 }, { "epoch": 0.9852772466539197, "grad_norm": 2.8196585178375244, "learning_rate": 5e-06, "loss": 0.2036, "step": 5153 }, { "epoch": 0.9854684512428298, "grad_norm": 1.9609150886535645, "learning_rate": 5e-06, "loss": 0.1723, "step": 5154 }, { "epoch": 0.98565965583174, "grad_norm": 3.2695369720458984, "learning_rate": 5e-06, "loss": 0.0569, "step": 5155 }, { "epoch": 0.9858508604206501, "grad_norm": 1.6373162269592285, "learning_rate": 5e-06, "loss": 0.0689, "step": 5156 }, { "epoch": 0.9860420650095603, "grad_norm": 1.8946233987808228, "learning_rate": 5e-06, "loss": 0.2049, "step": 5157 }, { "epoch": 0.9862332695984704, "grad_norm": 2.2586464881896973, "learning_rate": 5e-06, "loss": 0.1795, "step": 5158 }, { "epoch": 0.9864244741873806, "grad_norm": 2.046656370162964, "learning_rate": 5e-06, "loss": 0.2424, "step": 5159 }, { "epoch": 0.9866156787762906, "grad_norm": 0.6712673902511597, "learning_rate": 5e-06, "loss": 0.0386, "step": 5160 }, { "epoch": 0.9868068833652007, "grad_norm": 1.4954816102981567, "learning_rate": 5e-06, "loss": 0.1202, "step": 5161 }, { "epoch": 0.9869980879541109, "grad_norm": 2.4143035411834717, "learning_rate": 5e-06, "loss": 0.0619, "step": 5162 }, { "epoch": 0.987189292543021, "grad_norm": 1.1633849143981934, "learning_rate": 5e-06, "loss": 0.0919, "step": 5163 }, { "epoch": 0.9873804971319312, "grad_norm": 3.078310966491699, "learning_rate": 5e-06, "loss": 0.5286, "step": 5164 }, { "epoch": 0.9875717017208413, "grad_norm": 3.7422358989715576, "learning_rate": 5e-06, "loss": 0.7243, "step": 5165 }, { "epoch": 0.9877629063097514, "grad_norm": 2.2467732429504395, "learning_rate": 5e-06, "loss": 0.2758, "step": 5166 }, { "epoch": 0.9879541108986616, "grad_norm": 1.4942512512207031, "learning_rate": 5e-06, "loss": 0.1251, "step": 5167 }, { "epoch": 0.9881453154875717, "grad_norm": 3.4213297367095947, "learning_rate": 5e-06, "loss": 0.1562, "step": 5168 }, { "epoch": 0.9883365200764819, "grad_norm": 0.8668385744094849, "learning_rate": 5e-06, "loss": 0.0532, "step": 5169 }, { "epoch": 0.988527724665392, "grad_norm": 4.07417106628418, "learning_rate": 5e-06, "loss": 0.2916, "step": 5170 }, { "epoch": 0.988718929254302, "grad_norm": 1.8834352493286133, "learning_rate": 5e-06, "loss": 0.2522, "step": 5171 }, { "epoch": 0.9889101338432122, "grad_norm": 1.5943543910980225, "learning_rate": 5e-06, "loss": 0.1172, "step": 5172 }, { "epoch": 0.9891013384321223, "grad_norm": 2.434356927871704, "learning_rate": 5e-06, "loss": 0.1855, "step": 5173 }, { "epoch": 0.9892925430210325, "grad_norm": 2.505683422088623, "learning_rate": 5e-06, "loss": 0.1302, "step": 5174 }, { "epoch": 0.9894837476099426, "grad_norm": 2.152951240539551, "learning_rate": 5e-06, "loss": 0.1351, "step": 5175 }, { "epoch": 0.9896749521988528, "grad_norm": 1.6887750625610352, "learning_rate": 5e-06, "loss": 0.1331, "step": 5176 }, { "epoch": 0.9898661567877629, "grad_norm": 1.2444119453430176, "learning_rate": 5e-06, "loss": 0.1079, "step": 5177 }, { "epoch": 0.990057361376673, "grad_norm": 2.2396018505096436, "learning_rate": 5e-06, "loss": 0.3107, "step": 5178 }, { "epoch": 0.9902485659655832, "grad_norm": 2.357428550720215, "learning_rate": 5e-06, "loss": 0.3262, "step": 5179 }, { "epoch": 0.9904397705544933, "grad_norm": 1.2109572887420654, "learning_rate": 5e-06, "loss": 0.0421, "step": 5180 }, { "epoch": 0.9906309751434035, "grad_norm": 3.045706272125244, "learning_rate": 5e-06, "loss": 0.0984, "step": 5181 }, { "epoch": 0.9908221797323136, "grad_norm": 3.762176990509033, "learning_rate": 5e-06, "loss": 0.2968, "step": 5182 }, { "epoch": 0.9910133843212237, "grad_norm": 1.7476848363876343, "learning_rate": 5e-06, "loss": 0.1861, "step": 5183 }, { "epoch": 0.9912045889101339, "grad_norm": 1.974594235420227, "learning_rate": 5e-06, "loss": 0.1671, "step": 5184 }, { "epoch": 0.9913957934990439, "grad_norm": 1.9838353395462036, "learning_rate": 5e-06, "loss": 0.1467, "step": 5185 }, { "epoch": 0.9915869980879541, "grad_norm": 1.257691502571106, "learning_rate": 5e-06, "loss": 0.0803, "step": 5186 }, { "epoch": 0.9917782026768642, "grad_norm": 1.7940335273742676, "learning_rate": 5e-06, "loss": 0.0902, "step": 5187 }, { "epoch": 0.9919694072657744, "grad_norm": 2.434577226638794, "learning_rate": 5e-06, "loss": 0.1678, "step": 5188 }, { "epoch": 0.9921606118546845, "grad_norm": 2.495274543762207, "learning_rate": 5e-06, "loss": 0.3524, "step": 5189 }, { "epoch": 0.9923518164435946, "grad_norm": 2.122135639190674, "learning_rate": 5e-06, "loss": 0.334, "step": 5190 }, { "epoch": 0.9925430210325048, "grad_norm": 3.421670436859131, "learning_rate": 5e-06, "loss": 0.3898, "step": 5191 }, { "epoch": 0.9927342256214149, "grad_norm": 0.9619971513748169, "learning_rate": 5e-06, "loss": 0.0719, "step": 5192 }, { "epoch": 0.9929254302103251, "grad_norm": 0.8660413026809692, "learning_rate": 5e-06, "loss": 0.0436, "step": 5193 }, { "epoch": 0.9931166347992352, "grad_norm": 1.1905746459960938, "learning_rate": 5e-06, "loss": 0.0579, "step": 5194 }, { "epoch": 0.9933078393881453, "grad_norm": 3.714054822921753, "learning_rate": 5e-06, "loss": 0.4012, "step": 5195 }, { "epoch": 0.9934990439770555, "grad_norm": 1.5647445917129517, "learning_rate": 5e-06, "loss": 0.1707, "step": 5196 }, { "epoch": 0.9936902485659656, "grad_norm": 1.8715081214904785, "learning_rate": 5e-06, "loss": 0.236, "step": 5197 }, { "epoch": 0.9938814531548757, "grad_norm": 1.3924305438995361, "learning_rate": 5e-06, "loss": 0.1171, "step": 5198 }, { "epoch": 0.9940726577437858, "grad_norm": 1.4491206407546997, "learning_rate": 5e-06, "loss": 0.1173, "step": 5199 }, { "epoch": 0.994263862332696, "grad_norm": 3.2908480167388916, "learning_rate": 5e-06, "loss": 0.241, "step": 5200 }, { "epoch": 0.9944550669216061, "grad_norm": 1.7284696102142334, "learning_rate": 5e-06, "loss": 0.1932, "step": 5201 }, { "epoch": 0.9946462715105162, "grad_norm": 3.488659381866455, "learning_rate": 5e-06, "loss": 0.3217, "step": 5202 }, { "epoch": 0.9948374760994264, "grad_norm": 1.9990845918655396, "learning_rate": 5e-06, "loss": 0.4941, "step": 5203 }, { "epoch": 0.9950286806883365, "grad_norm": 1.8910921812057495, "learning_rate": 5e-06, "loss": 0.1171, "step": 5204 }, { "epoch": 0.9952198852772467, "grad_norm": 1.4411567449569702, "learning_rate": 5e-06, "loss": 0.1537, "step": 5205 }, { "epoch": 0.9954110898661568, "grad_norm": 1.101347804069519, "learning_rate": 5e-06, "loss": 0.0547, "step": 5206 }, { "epoch": 0.9956022944550669, "grad_norm": 1.8866088390350342, "learning_rate": 5e-06, "loss": 0.1649, "step": 5207 }, { "epoch": 0.9957934990439771, "grad_norm": 2.1902003288269043, "learning_rate": 5e-06, "loss": 0.281, "step": 5208 }, { "epoch": 0.9959847036328872, "grad_norm": 2.494462490081787, "learning_rate": 5e-06, "loss": 0.1644, "step": 5209 }, { "epoch": 0.9961759082217974, "grad_norm": 1.2047346830368042, "learning_rate": 5e-06, "loss": 0.1172, "step": 5210 }, { "epoch": 0.9963671128107074, "grad_norm": 0.9425076246261597, "learning_rate": 5e-06, "loss": 0.059, "step": 5211 }, { "epoch": 0.9965583173996176, "grad_norm": 1.435917615890503, "learning_rate": 5e-06, "loss": 0.0519, "step": 5212 }, { "epoch": 0.9967495219885277, "grad_norm": 1.1277811527252197, "learning_rate": 5e-06, "loss": 0.0978, "step": 5213 }, { "epoch": 0.9969407265774378, "grad_norm": 2.782651901245117, "learning_rate": 5e-06, "loss": 0.3652, "step": 5214 }, { "epoch": 0.997131931166348, "grad_norm": 1.5946991443634033, "learning_rate": 5e-06, "loss": 0.1018, "step": 5215 }, { "epoch": 0.9973231357552581, "grad_norm": 0.9089525938034058, "learning_rate": 5e-06, "loss": 0.0902, "step": 5216 }, { "epoch": 0.9975143403441683, "grad_norm": 2.2287213802337646, "learning_rate": 5e-06, "loss": 0.1174, "step": 5217 }, { "epoch": 0.9977055449330784, "grad_norm": 2.977839231491089, "learning_rate": 5e-06, "loss": 0.2146, "step": 5218 }, { "epoch": 0.9978967495219885, "grad_norm": 1.4728442430496216, "learning_rate": 5e-06, "loss": 0.2433, "step": 5219 }, { "epoch": 0.9980879541108987, "grad_norm": 2.0566630363464355, "learning_rate": 5e-06, "loss": 0.2769, "step": 5220 }, { "epoch": 0.9982791586998088, "grad_norm": 1.2457760572433472, "learning_rate": 5e-06, "loss": 0.0825, "step": 5221 }, { "epoch": 0.998470363288719, "grad_norm": 1.6305168867111206, "learning_rate": 5e-06, "loss": 0.1614, "step": 5222 }, { "epoch": 0.998661567877629, "grad_norm": 1.9912116527557373, "learning_rate": 5e-06, "loss": 0.0982, "step": 5223 }, { "epoch": 0.9988527724665393, "grad_norm": 4.200954437255859, "learning_rate": 5e-06, "loss": 0.2215, "step": 5224 }, { "epoch": 0.9990439770554493, "grad_norm": 1.4355394840240479, "learning_rate": 5e-06, "loss": 0.0419, "step": 5225 }, { "epoch": 0.9992351816443594, "grad_norm": 1.8551381826400757, "learning_rate": 5e-06, "loss": 0.2277, "step": 5226 }, { "epoch": 0.9994263862332696, "grad_norm": 2.7024409770965576, "learning_rate": 5e-06, "loss": 0.5278, "step": 5227 }, { "epoch": 0.9996175908221797, "grad_norm": 1.0480506420135498, "learning_rate": 5e-06, "loss": 0.0521, "step": 5228 }, { "epoch": 0.9998087954110899, "grad_norm": 2.2718355655670166, "learning_rate": 5e-06, "loss": 0.1477, "step": 5229 }, { "epoch": 1.0, "grad_norm": 3.5123586654663086, "learning_rate": 5e-06, "loss": 0.3213, "step": 5230 }, { "epoch": 1.00019120458891, "grad_norm": 1.7297927141189575, "learning_rate": 5e-06, "loss": 0.2801, "step": 5231 }, { "epoch": 1.0003824091778202, "grad_norm": 1.3347065448760986, "learning_rate": 5e-06, "loss": 0.0934, "step": 5232 }, { "epoch": 1.0005736137667305, "grad_norm": 1.1105912923812866, "learning_rate": 5e-06, "loss": 0.094, "step": 5233 }, { "epoch": 1.0007648183556406, "grad_norm": 0.9613377451896667, "learning_rate": 5e-06, "loss": 0.0577, "step": 5234 }, { "epoch": 1.0009560229445507, "grad_norm": 1.0919339656829834, "learning_rate": 5e-06, "loss": 0.099, "step": 5235 }, { "epoch": 1.0011472275334607, "grad_norm": 0.9715074300765991, "learning_rate": 5e-06, "loss": 0.063, "step": 5236 }, { "epoch": 1.0013384321223708, "grad_norm": 1.8640201091766357, "learning_rate": 5e-06, "loss": 0.2279, "step": 5237 }, { "epoch": 1.0015296367112811, "grad_norm": 2.2503163814544678, "learning_rate": 5e-06, "loss": 0.2751, "step": 5238 }, { "epoch": 1.0017208413001912, "grad_norm": 1.2504420280456543, "learning_rate": 5e-06, "loss": 0.0953, "step": 5239 }, { "epoch": 1.0019120458891013, "grad_norm": 1.6367700099945068, "learning_rate": 5e-06, "loss": 0.103, "step": 5240 }, { "epoch": 1.0021032504780114, "grad_norm": 1.514311671257019, "learning_rate": 5e-06, "loss": 0.144, "step": 5241 }, { "epoch": 1.0022944550669215, "grad_norm": 1.054772973060608, "learning_rate": 5e-06, "loss": 0.0299, "step": 5242 }, { "epoch": 1.0024856596558318, "grad_norm": 2.3778746128082275, "learning_rate": 5e-06, "loss": 0.3053, "step": 5243 }, { "epoch": 1.002676864244742, "grad_norm": 2.213348150253296, "learning_rate": 5e-06, "loss": 0.2055, "step": 5244 }, { "epoch": 1.002868068833652, "grad_norm": 1.6743370294570923, "learning_rate": 5e-06, "loss": 0.0935, "step": 5245 }, { "epoch": 1.003059273422562, "grad_norm": 1.9218884706497192, "learning_rate": 5e-06, "loss": 0.1872, "step": 5246 }, { "epoch": 1.0032504780114724, "grad_norm": 0.5441107153892517, "learning_rate": 5e-06, "loss": 0.0418, "step": 5247 }, { "epoch": 1.0034416826003825, "grad_norm": 1.4847966432571411, "learning_rate": 5e-06, "loss": 0.0598, "step": 5248 }, { "epoch": 1.0036328871892926, "grad_norm": 1.8190624713897705, "learning_rate": 5e-06, "loss": 0.0375, "step": 5249 }, { "epoch": 1.0038240917782026, "grad_norm": 2.3524463176727295, "learning_rate": 5e-06, "loss": 0.1668, "step": 5250 }, { "epoch": 1.0040152963671127, "grad_norm": 2.548030376434326, "learning_rate": 5e-06, "loss": 0.3332, "step": 5251 }, { "epoch": 1.004206500956023, "grad_norm": 1.4580950736999512, "learning_rate": 5e-06, "loss": 0.1052, "step": 5252 }, { "epoch": 1.0043977055449331, "grad_norm": 1.2642977237701416, "learning_rate": 5e-06, "loss": 0.0873, "step": 5253 }, { "epoch": 1.0045889101338432, "grad_norm": 1.0301834344863892, "learning_rate": 5e-06, "loss": 0.0373, "step": 5254 }, { "epoch": 1.0047801147227533, "grad_norm": 1.7443534135818481, "learning_rate": 5e-06, "loss": 0.1106, "step": 5255 }, { "epoch": 1.0049713193116634, "grad_norm": 1.4613491296768188, "learning_rate": 5e-06, "loss": 0.1496, "step": 5256 }, { "epoch": 1.0051625239005737, "grad_norm": 1.6461591720581055, "learning_rate": 5e-06, "loss": 0.1837, "step": 5257 }, { "epoch": 1.0053537284894838, "grad_norm": 1.1870779991149902, "learning_rate": 5e-06, "loss": 0.0809, "step": 5258 }, { "epoch": 1.0055449330783939, "grad_norm": 0.8499864935874939, "learning_rate": 5e-06, "loss": 0.0478, "step": 5259 }, { "epoch": 1.005736137667304, "grad_norm": 1.4943770170211792, "learning_rate": 5e-06, "loss": 0.0532, "step": 5260 }, { "epoch": 1.005927342256214, "grad_norm": 1.3564469814300537, "learning_rate": 5e-06, "loss": 0.0571, "step": 5261 }, { "epoch": 1.0061185468451244, "grad_norm": 1.2749664783477783, "learning_rate": 5e-06, "loss": 0.066, "step": 5262 }, { "epoch": 1.0063097514340344, "grad_norm": 2.179372549057007, "learning_rate": 5e-06, "loss": 0.1723, "step": 5263 }, { "epoch": 1.0065009560229445, "grad_norm": 1.2617064714431763, "learning_rate": 5e-06, "loss": 0.0835, "step": 5264 }, { "epoch": 1.0066921606118546, "grad_norm": 0.9173052310943604, "learning_rate": 5e-06, "loss": 0.0922, "step": 5265 }, { "epoch": 1.0068833652007647, "grad_norm": 1.5798940658569336, "learning_rate": 5e-06, "loss": 0.0287, "step": 5266 }, { "epoch": 1.007074569789675, "grad_norm": 1.8742361068725586, "learning_rate": 5e-06, "loss": 0.088, "step": 5267 }, { "epoch": 1.007265774378585, "grad_norm": 2.5780484676361084, "learning_rate": 5e-06, "loss": 0.087, "step": 5268 }, { "epoch": 1.0074569789674952, "grad_norm": 2.258584499359131, "learning_rate": 5e-06, "loss": 0.2834, "step": 5269 }, { "epoch": 1.0076481835564053, "grad_norm": 0.6943368315696716, "learning_rate": 5e-06, "loss": 0.0394, "step": 5270 }, { "epoch": 1.0078393881453156, "grad_norm": 1.9142544269561768, "learning_rate": 5e-06, "loss": 0.0845, "step": 5271 }, { "epoch": 1.0080305927342257, "grad_norm": 0.8416318297386169, "learning_rate": 5e-06, "loss": 0.0664, "step": 5272 }, { "epoch": 1.0082217973231358, "grad_norm": 1.94535493850708, "learning_rate": 5e-06, "loss": 0.1711, "step": 5273 }, { "epoch": 1.0084130019120459, "grad_norm": 1.2606523036956787, "learning_rate": 5e-06, "loss": 0.0534, "step": 5274 }, { "epoch": 1.008604206500956, "grad_norm": 2.3237664699554443, "learning_rate": 5e-06, "loss": 0.2517, "step": 5275 }, { "epoch": 1.0087954110898663, "grad_norm": 2.197056293487549, "learning_rate": 5e-06, "loss": 0.3001, "step": 5276 }, { "epoch": 1.0089866156787763, "grad_norm": 1.2944321632385254, "learning_rate": 5e-06, "loss": 0.103, "step": 5277 }, { "epoch": 1.0091778202676864, "grad_norm": 1.5966204404830933, "learning_rate": 5e-06, "loss": 0.1872, "step": 5278 }, { "epoch": 1.0093690248565965, "grad_norm": 1.0905331373214722, "learning_rate": 5e-06, "loss": 0.0653, "step": 5279 }, { "epoch": 1.0095602294455066, "grad_norm": 1.1597042083740234, "learning_rate": 5e-06, "loss": 0.0651, "step": 5280 }, { "epoch": 1.009751434034417, "grad_norm": 2.1614441871643066, "learning_rate": 5e-06, "loss": 0.1937, "step": 5281 }, { "epoch": 1.009942638623327, "grad_norm": 2.6233420372009277, "learning_rate": 5e-06, "loss": 0.3533, "step": 5282 }, { "epoch": 1.010133843212237, "grad_norm": 1.4536938667297363, "learning_rate": 5e-06, "loss": 0.1249, "step": 5283 }, { "epoch": 1.0103250478011472, "grad_norm": 0.9678513407707214, "learning_rate": 5e-06, "loss": 0.0769, "step": 5284 }, { "epoch": 1.0105162523900573, "grad_norm": 1.2624231576919556, "learning_rate": 5e-06, "loss": 0.0625, "step": 5285 }, { "epoch": 1.0107074569789676, "grad_norm": 1.1822149753570557, "learning_rate": 5e-06, "loss": 0.0367, "step": 5286 }, { "epoch": 1.0108986615678777, "grad_norm": 1.7562121152877808, "learning_rate": 5e-06, "loss": 0.1614, "step": 5287 }, { "epoch": 1.0110898661567878, "grad_norm": 2.224581003189087, "learning_rate": 5e-06, "loss": 0.3048, "step": 5288 }, { "epoch": 1.0112810707456978, "grad_norm": 1.7163106203079224, "learning_rate": 5e-06, "loss": 0.2074, "step": 5289 }, { "epoch": 1.011472275334608, "grad_norm": 1.9225854873657227, "learning_rate": 5e-06, "loss": 0.2347, "step": 5290 }, { "epoch": 1.0116634799235182, "grad_norm": 0.8340242505073547, "learning_rate": 5e-06, "loss": 0.0415, "step": 5291 }, { "epoch": 1.0118546845124283, "grad_norm": 0.9325177073478699, "learning_rate": 5e-06, "loss": 0.0336, "step": 5292 }, { "epoch": 1.0120458891013384, "grad_norm": 1.3594424724578857, "learning_rate": 5e-06, "loss": 0.0575, "step": 5293 }, { "epoch": 1.0122370936902485, "grad_norm": 2.631110429763794, "learning_rate": 5e-06, "loss": 0.4276, "step": 5294 }, { "epoch": 1.0124282982791586, "grad_norm": 1.8750988245010376, "learning_rate": 5e-06, "loss": 0.3076, "step": 5295 }, { "epoch": 1.012619502868069, "grad_norm": 1.2204387187957764, "learning_rate": 5e-06, "loss": 0.074, "step": 5296 }, { "epoch": 1.012810707456979, "grad_norm": 1.7058926820755005, "learning_rate": 5e-06, "loss": 0.1998, "step": 5297 }, { "epoch": 1.013001912045889, "grad_norm": 1.6197351217269897, "learning_rate": 5e-06, "loss": 0.071, "step": 5298 }, { "epoch": 1.0131931166347992, "grad_norm": 1.8550605773925781, "learning_rate": 5e-06, "loss": 0.0786, "step": 5299 }, { "epoch": 1.0133843212237095, "grad_norm": 1.1241230964660645, "learning_rate": 5e-06, "loss": 0.0807, "step": 5300 }, { "epoch": 1.0135755258126196, "grad_norm": 2.6108241081237793, "learning_rate": 5e-06, "loss": 0.2384, "step": 5301 }, { "epoch": 1.0137667304015296, "grad_norm": 1.967355728149414, "learning_rate": 5e-06, "loss": 0.1577, "step": 5302 }, { "epoch": 1.0139579349904397, "grad_norm": 1.6688017845153809, "learning_rate": 5e-06, "loss": 0.0525, "step": 5303 }, { "epoch": 1.0141491395793498, "grad_norm": 1.1315393447875977, "learning_rate": 5e-06, "loss": 0.0706, "step": 5304 }, { "epoch": 1.0143403441682601, "grad_norm": 1.7984042167663574, "learning_rate": 5e-06, "loss": 0.078, "step": 5305 }, { "epoch": 1.0145315487571702, "grad_norm": 1.8164480924606323, "learning_rate": 5e-06, "loss": 0.1293, "step": 5306 }, { "epoch": 1.0147227533460803, "grad_norm": 1.3991438150405884, "learning_rate": 5e-06, "loss": 0.1551, "step": 5307 }, { "epoch": 1.0149139579349904, "grad_norm": 1.8388389348983765, "learning_rate": 5e-06, "loss": 0.0591, "step": 5308 }, { "epoch": 1.0151051625239005, "grad_norm": 1.4206583499908447, "learning_rate": 5e-06, "loss": 0.0782, "step": 5309 }, { "epoch": 1.0152963671128108, "grad_norm": 0.5835526585578918, "learning_rate": 5e-06, "loss": 0.0423, "step": 5310 }, { "epoch": 1.0154875717017209, "grad_norm": 1.2603877782821655, "learning_rate": 5e-06, "loss": 0.0581, "step": 5311 }, { "epoch": 1.015678776290631, "grad_norm": 2.069904327392578, "learning_rate": 5e-06, "loss": 0.2226, "step": 5312 }, { "epoch": 1.015869980879541, "grad_norm": 1.6898385286331177, "learning_rate": 5e-06, "loss": 0.1273, "step": 5313 }, { "epoch": 1.0160611854684511, "grad_norm": 2.38932466506958, "learning_rate": 5e-06, "loss": 0.1309, "step": 5314 }, { "epoch": 1.0162523900573615, "grad_norm": 1.2808679342269897, "learning_rate": 5e-06, "loss": 0.0763, "step": 5315 }, { "epoch": 1.0164435946462715, "grad_norm": 1.2353378534317017, "learning_rate": 5e-06, "loss": 0.0659, "step": 5316 }, { "epoch": 1.0166347992351816, "grad_norm": 1.1403536796569824, "learning_rate": 5e-06, "loss": 0.059, "step": 5317 }, { "epoch": 1.0168260038240917, "grad_norm": 1.8925150632858276, "learning_rate": 5e-06, "loss": 0.1917, "step": 5318 }, { "epoch": 1.0170172084130018, "grad_norm": 1.5492744445800781, "learning_rate": 5e-06, "loss": 0.1937, "step": 5319 }, { "epoch": 1.0172084130019121, "grad_norm": 2.6364314556121826, "learning_rate": 5e-06, "loss": 0.2993, "step": 5320 }, { "epoch": 1.0173996175908222, "grad_norm": 1.9068900346755981, "learning_rate": 5e-06, "loss": 0.2033, "step": 5321 }, { "epoch": 1.0175908221797323, "grad_norm": 1.244210958480835, "learning_rate": 5e-06, "loss": 0.084, "step": 5322 }, { "epoch": 1.0177820267686424, "grad_norm": 1.4095242023468018, "learning_rate": 5e-06, "loss": 0.0952, "step": 5323 }, { "epoch": 1.0179732313575527, "grad_norm": 1.3021609783172607, "learning_rate": 5e-06, "loss": 0.0732, "step": 5324 }, { "epoch": 1.0181644359464628, "grad_norm": 2.593829870223999, "learning_rate": 5e-06, "loss": 0.3372, "step": 5325 }, { "epoch": 1.0183556405353729, "grad_norm": 0.7937406897544861, "learning_rate": 5e-06, "loss": 0.0694, "step": 5326 }, { "epoch": 1.018546845124283, "grad_norm": 1.4523568153381348, "learning_rate": 5e-06, "loss": 0.0889, "step": 5327 }, { "epoch": 1.018738049713193, "grad_norm": 2.000311851501465, "learning_rate": 5e-06, "loss": 0.1579, "step": 5328 }, { "epoch": 1.0189292543021033, "grad_norm": 1.0171622037887573, "learning_rate": 5e-06, "loss": 0.0692, "step": 5329 }, { "epoch": 1.0191204588910134, "grad_norm": 0.9312133193016052, "learning_rate": 5e-06, "loss": 0.0239, "step": 5330 }, { "epoch": 1.0193116634799235, "grad_norm": 2.9251790046691895, "learning_rate": 5e-06, "loss": 0.1988, "step": 5331 }, { "epoch": 1.0195028680688336, "grad_norm": 1.3772673606872559, "learning_rate": 5e-06, "loss": 0.0688, "step": 5332 }, { "epoch": 1.0196940726577437, "grad_norm": 1.559841275215149, "learning_rate": 5e-06, "loss": 0.2109, "step": 5333 }, { "epoch": 1.019885277246654, "grad_norm": 1.385416865348816, "learning_rate": 5e-06, "loss": 0.0531, "step": 5334 }, { "epoch": 1.020076481835564, "grad_norm": 1.2152140140533447, "learning_rate": 5e-06, "loss": 0.0514, "step": 5335 }, { "epoch": 1.0202676864244742, "grad_norm": 1.0661227703094482, "learning_rate": 5e-06, "loss": 0.0496, "step": 5336 }, { "epoch": 1.0204588910133843, "grad_norm": 2.223980188369751, "learning_rate": 5e-06, "loss": 0.2001, "step": 5337 }, { "epoch": 1.0206500956022944, "grad_norm": 2.8563883304595947, "learning_rate": 5e-06, "loss": 0.3093, "step": 5338 }, { "epoch": 1.0208413001912047, "grad_norm": 1.7817109823226929, "learning_rate": 5e-06, "loss": 0.0834, "step": 5339 }, { "epoch": 1.0210325047801148, "grad_norm": 1.39713716506958, "learning_rate": 5e-06, "loss": 0.1341, "step": 5340 }, { "epoch": 1.0212237093690248, "grad_norm": 0.9216711521148682, "learning_rate": 5e-06, "loss": 0.046, "step": 5341 }, { "epoch": 1.021414913957935, "grad_norm": 3.6287028789520264, "learning_rate": 5e-06, "loss": 0.066, "step": 5342 }, { "epoch": 1.021606118546845, "grad_norm": 1.79201078414917, "learning_rate": 5e-06, "loss": 0.1809, "step": 5343 }, { "epoch": 1.0217973231357553, "grad_norm": 1.3624271154403687, "learning_rate": 5e-06, "loss": 0.0847, "step": 5344 }, { "epoch": 1.0219885277246654, "grad_norm": 1.809230089187622, "learning_rate": 5e-06, "loss": 0.0799, "step": 5345 }, { "epoch": 1.0221797323135755, "grad_norm": 1.3011666536331177, "learning_rate": 5e-06, "loss": 0.0849, "step": 5346 }, { "epoch": 1.0223709369024856, "grad_norm": 0.8237725496292114, "learning_rate": 5e-06, "loss": 0.0524, "step": 5347 }, { "epoch": 1.0225621414913957, "grad_norm": 1.9484095573425293, "learning_rate": 5e-06, "loss": 0.0832, "step": 5348 }, { "epoch": 1.022753346080306, "grad_norm": 1.2306699752807617, "learning_rate": 5e-06, "loss": 0.0433, "step": 5349 }, { "epoch": 1.022944550669216, "grad_norm": 2.3627052307128906, "learning_rate": 5e-06, "loss": 0.2586, "step": 5350 }, { "epoch": 1.0231357552581262, "grad_norm": 1.5180178880691528, "learning_rate": 5e-06, "loss": 0.1013, "step": 5351 }, { "epoch": 1.0233269598470363, "grad_norm": 1.259223222732544, "learning_rate": 5e-06, "loss": 0.1714, "step": 5352 }, { "epoch": 1.0235181644359466, "grad_norm": 1.2246848344802856, "learning_rate": 5e-06, "loss": 0.0907, "step": 5353 }, { "epoch": 1.0237093690248567, "grad_norm": 2.1600708961486816, "learning_rate": 5e-06, "loss": 0.1361, "step": 5354 }, { "epoch": 1.0239005736137667, "grad_norm": 1.1809910535812378, "learning_rate": 5e-06, "loss": 0.0521, "step": 5355 }, { "epoch": 1.0240917782026768, "grad_norm": 2.0541279315948486, "learning_rate": 5e-06, "loss": 0.1937, "step": 5356 }, { "epoch": 1.024282982791587, "grad_norm": 1.9363659620285034, "learning_rate": 5e-06, "loss": 0.1494, "step": 5357 }, { "epoch": 1.0244741873804972, "grad_norm": 2.4680871963500977, "learning_rate": 5e-06, "loss": 0.1925, "step": 5358 }, { "epoch": 1.0246653919694073, "grad_norm": 1.0142306089401245, "learning_rate": 5e-06, "loss": 0.0611, "step": 5359 }, { "epoch": 1.0248565965583174, "grad_norm": 1.288448452949524, "learning_rate": 5e-06, "loss": 0.0642, "step": 5360 }, { "epoch": 1.0250478011472275, "grad_norm": 1.4450470209121704, "learning_rate": 5e-06, "loss": 0.0648, "step": 5361 }, { "epoch": 1.0252390057361376, "grad_norm": 1.5391541719436646, "learning_rate": 5e-06, "loss": 0.1351, "step": 5362 }, { "epoch": 1.0254302103250479, "grad_norm": 2.144197940826416, "learning_rate": 5e-06, "loss": 0.3029, "step": 5363 }, { "epoch": 1.025621414913958, "grad_norm": 1.9811673164367676, "learning_rate": 5e-06, "loss": 0.1664, "step": 5364 }, { "epoch": 1.025812619502868, "grad_norm": 0.9573968648910522, "learning_rate": 5e-06, "loss": 0.0651, "step": 5365 }, { "epoch": 1.0260038240917781, "grad_norm": 1.0169429779052734, "learning_rate": 5e-06, "loss": 0.0364, "step": 5366 }, { "epoch": 1.0261950286806882, "grad_norm": 0.7411577701568604, "learning_rate": 5e-06, "loss": 0.0185, "step": 5367 }, { "epoch": 1.0263862332695985, "grad_norm": 1.0757877826690674, "learning_rate": 5e-06, "loss": 0.082, "step": 5368 }, { "epoch": 1.0265774378585086, "grad_norm": 2.224330425262451, "learning_rate": 5e-06, "loss": 0.2543, "step": 5369 }, { "epoch": 1.0267686424474187, "grad_norm": 1.7419105768203735, "learning_rate": 5e-06, "loss": 0.1179, "step": 5370 }, { "epoch": 1.0269598470363288, "grad_norm": 1.4337900876998901, "learning_rate": 5e-06, "loss": 0.0853, "step": 5371 }, { "epoch": 1.027151051625239, "grad_norm": 1.0431238412857056, "learning_rate": 5e-06, "loss": 0.0712, "step": 5372 }, { "epoch": 1.0273422562141492, "grad_norm": 1.8076082468032837, "learning_rate": 5e-06, "loss": 0.0804, "step": 5373 }, { "epoch": 1.0275334608030593, "grad_norm": 1.998808741569519, "learning_rate": 5e-06, "loss": 0.0978, "step": 5374 }, { "epoch": 1.0277246653919694, "grad_norm": 1.9513105154037476, "learning_rate": 5e-06, "loss": 0.1732, "step": 5375 }, { "epoch": 1.0279158699808795, "grad_norm": 0.6905821561813354, "learning_rate": 5e-06, "loss": 0.0502, "step": 5376 }, { "epoch": 1.0281070745697898, "grad_norm": 1.0172103643417358, "learning_rate": 5e-06, "loss": 0.0641, "step": 5377 }, { "epoch": 1.0282982791586999, "grad_norm": 1.166339635848999, "learning_rate": 5e-06, "loss": 0.0766, "step": 5378 }, { "epoch": 1.02848948374761, "grad_norm": 1.3585234880447388, "learning_rate": 5e-06, "loss": 0.0646, "step": 5379 }, { "epoch": 1.02868068833652, "grad_norm": 1.65544593334198, "learning_rate": 5e-06, "loss": 0.1481, "step": 5380 }, { "epoch": 1.0288718929254301, "grad_norm": 2.576430082321167, "learning_rate": 5e-06, "loss": 0.2197, "step": 5381 }, { "epoch": 1.0290630975143404, "grad_norm": 2.2190842628479004, "learning_rate": 5e-06, "loss": 0.1957, "step": 5382 }, { "epoch": 1.0292543021032505, "grad_norm": 1.092215895652771, "learning_rate": 5e-06, "loss": 0.0609, "step": 5383 }, { "epoch": 1.0294455066921606, "grad_norm": 1.6553555727005005, "learning_rate": 5e-06, "loss": 0.085, "step": 5384 }, { "epoch": 1.0296367112810707, "grad_norm": 1.6299556493759155, "learning_rate": 5e-06, "loss": 0.0454, "step": 5385 }, { "epoch": 1.0298279158699808, "grad_norm": 1.2839386463165283, "learning_rate": 5e-06, "loss": 0.068, "step": 5386 }, { "epoch": 1.030019120458891, "grad_norm": 1.5672410726547241, "learning_rate": 5e-06, "loss": 0.0897, "step": 5387 }, { "epoch": 1.0302103250478012, "grad_norm": 1.3381305932998657, "learning_rate": 5e-06, "loss": 0.0716, "step": 5388 }, { "epoch": 1.0304015296367113, "grad_norm": 0.9112756252288818, "learning_rate": 5e-06, "loss": 0.0466, "step": 5389 }, { "epoch": 1.0305927342256214, "grad_norm": 0.6958617568016052, "learning_rate": 5e-06, "loss": 0.0528, "step": 5390 }, { "epoch": 1.0307839388145315, "grad_norm": 1.5371332168579102, "learning_rate": 5e-06, "loss": 0.0767, "step": 5391 }, { "epoch": 1.0309751434034418, "grad_norm": 2.49212646484375, "learning_rate": 5e-06, "loss": 0.1944, "step": 5392 }, { "epoch": 1.0311663479923519, "grad_norm": 1.2017213106155396, "learning_rate": 5e-06, "loss": 0.1326, "step": 5393 }, { "epoch": 1.031357552581262, "grad_norm": 1.103096604347229, "learning_rate": 5e-06, "loss": 0.0552, "step": 5394 }, { "epoch": 1.031548757170172, "grad_norm": 2.4898500442504883, "learning_rate": 5e-06, "loss": 0.1824, "step": 5395 }, { "epoch": 1.0317399617590821, "grad_norm": 1.9159975051879883, "learning_rate": 5e-06, "loss": 0.1253, "step": 5396 }, { "epoch": 1.0319311663479924, "grad_norm": 1.6219099760055542, "learning_rate": 5e-06, "loss": 0.0913, "step": 5397 }, { "epoch": 1.0321223709369025, "grad_norm": 1.076381802558899, "learning_rate": 5e-06, "loss": 0.0744, "step": 5398 }, { "epoch": 1.0323135755258126, "grad_norm": 1.1917436122894287, "learning_rate": 5e-06, "loss": 0.0657, "step": 5399 }, { "epoch": 1.0325047801147227, "grad_norm": 1.8527624607086182, "learning_rate": 5e-06, "loss": 0.16, "step": 5400 }, { "epoch": 1.0326959847036328, "grad_norm": 2.29830002784729, "learning_rate": 5e-06, "loss": 0.1488, "step": 5401 }, { "epoch": 1.032887189292543, "grad_norm": 2.4546074867248535, "learning_rate": 5e-06, "loss": 0.2168, "step": 5402 }, { "epoch": 1.0330783938814532, "grad_norm": 0.7161755561828613, "learning_rate": 5e-06, "loss": 0.0753, "step": 5403 }, { "epoch": 1.0332695984703633, "grad_norm": 1.9103176593780518, "learning_rate": 5e-06, "loss": 0.0824, "step": 5404 }, { "epoch": 1.0334608030592733, "grad_norm": 2.567988634109497, "learning_rate": 5e-06, "loss": 0.1142, "step": 5405 }, { "epoch": 1.0336520076481837, "grad_norm": 2.4080469608306885, "learning_rate": 5e-06, "loss": 0.3055, "step": 5406 }, { "epoch": 1.0338432122370937, "grad_norm": 2.5664243698120117, "learning_rate": 5e-06, "loss": 0.3235, "step": 5407 }, { "epoch": 1.0340344168260038, "grad_norm": 0.7243027091026306, "learning_rate": 5e-06, "loss": 0.1029, "step": 5408 }, { "epoch": 1.034225621414914, "grad_norm": 2.28975510597229, "learning_rate": 5e-06, "loss": 0.089, "step": 5409 }, { "epoch": 1.034416826003824, "grad_norm": 1.4908415079116821, "learning_rate": 5e-06, "loss": 0.0705, "step": 5410 }, { "epoch": 1.0346080305927343, "grad_norm": 0.9343728423118591, "learning_rate": 5e-06, "loss": 0.0416, "step": 5411 }, { "epoch": 1.0347992351816444, "grad_norm": 2.183262586593628, "learning_rate": 5e-06, "loss": 0.1471, "step": 5412 }, { "epoch": 1.0349904397705545, "grad_norm": 1.1275417804718018, "learning_rate": 5e-06, "loss": 0.0581, "step": 5413 }, { "epoch": 1.0351816443594646, "grad_norm": 3.412479877471924, "learning_rate": 5e-06, "loss": 0.4229, "step": 5414 }, { "epoch": 1.0353728489483747, "grad_norm": 0.81615149974823, "learning_rate": 5e-06, "loss": 0.0412, "step": 5415 }, { "epoch": 1.035564053537285, "grad_norm": 1.7186781167984009, "learning_rate": 5e-06, "loss": 0.1121, "step": 5416 }, { "epoch": 1.035755258126195, "grad_norm": 3.903059482574463, "learning_rate": 5e-06, "loss": 0.0223, "step": 5417 }, { "epoch": 1.0359464627151052, "grad_norm": 1.3878508806228638, "learning_rate": 5e-06, "loss": 0.069, "step": 5418 }, { "epoch": 1.0361376673040152, "grad_norm": 2.2635209560394287, "learning_rate": 5e-06, "loss": 0.1977, "step": 5419 }, { "epoch": 1.0363288718929253, "grad_norm": 1.8340933322906494, "learning_rate": 5e-06, "loss": 0.1436, "step": 5420 }, { "epoch": 1.0365200764818356, "grad_norm": 1.2530016899108887, "learning_rate": 5e-06, "loss": 0.1269, "step": 5421 }, { "epoch": 1.0367112810707457, "grad_norm": 0.9224182367324829, "learning_rate": 5e-06, "loss": 0.0661, "step": 5422 }, { "epoch": 1.0369024856596558, "grad_norm": 1.306754231452942, "learning_rate": 5e-06, "loss": 0.0599, "step": 5423 }, { "epoch": 1.037093690248566, "grad_norm": 1.311036467552185, "learning_rate": 5e-06, "loss": 0.0821, "step": 5424 }, { "epoch": 1.0372848948374762, "grad_norm": 1.1835675239562988, "learning_rate": 5e-06, "loss": 0.0955, "step": 5425 }, { "epoch": 1.0374760994263863, "grad_norm": 0.7402036190032959, "learning_rate": 5e-06, "loss": 0.0427, "step": 5426 }, { "epoch": 1.0376673040152964, "grad_norm": 2.02817964553833, "learning_rate": 5e-06, "loss": 0.1171, "step": 5427 }, { "epoch": 1.0378585086042065, "grad_norm": 2.148780107498169, "learning_rate": 5e-06, "loss": 0.1064, "step": 5428 }, { "epoch": 1.0380497131931166, "grad_norm": 1.8620522022247314, "learning_rate": 5e-06, "loss": 0.1644, "step": 5429 }, { "epoch": 1.0382409177820269, "grad_norm": 0.7214341759681702, "learning_rate": 5e-06, "loss": 0.0191, "step": 5430 }, { "epoch": 1.038432122370937, "grad_norm": 1.828162670135498, "learning_rate": 5e-06, "loss": 0.1798, "step": 5431 }, { "epoch": 1.038623326959847, "grad_norm": 2.594289541244507, "learning_rate": 5e-06, "loss": 0.2456, "step": 5432 }, { "epoch": 1.0388145315487571, "grad_norm": 0.8717902302742004, "learning_rate": 5e-06, "loss": 0.0624, "step": 5433 }, { "epoch": 1.0390057361376672, "grad_norm": 2.1278491020202637, "learning_rate": 5e-06, "loss": 0.1108, "step": 5434 }, { "epoch": 1.0391969407265775, "grad_norm": 1.1939607858657837, "learning_rate": 5e-06, "loss": 0.0389, "step": 5435 }, { "epoch": 1.0393881453154876, "grad_norm": 2.5707457065582275, "learning_rate": 5e-06, "loss": 0.2028, "step": 5436 }, { "epoch": 1.0395793499043977, "grad_norm": 1.869125247001648, "learning_rate": 5e-06, "loss": 0.2081, "step": 5437 }, { "epoch": 1.0397705544933078, "grad_norm": 3.4773709774017334, "learning_rate": 5e-06, "loss": 0.4955, "step": 5438 }, { "epoch": 1.0399617590822179, "grad_norm": 1.6218843460083008, "learning_rate": 5e-06, "loss": 0.0909, "step": 5439 }, { "epoch": 1.0401529636711282, "grad_norm": 2.7059316635131836, "learning_rate": 5e-06, "loss": 0.3133, "step": 5440 }, { "epoch": 1.0403441682600383, "grad_norm": 1.0128536224365234, "learning_rate": 5e-06, "loss": 0.1095, "step": 5441 }, { "epoch": 1.0405353728489484, "grad_norm": 6.157577037811279, "learning_rate": 5e-06, "loss": 0.2846, "step": 5442 }, { "epoch": 1.0407265774378585, "grad_norm": 1.409836769104004, "learning_rate": 5e-06, "loss": 0.0739, "step": 5443 }, { "epoch": 1.0409177820267685, "grad_norm": 2.545416831970215, "learning_rate": 5e-06, "loss": 0.3916, "step": 5444 }, { "epoch": 1.0411089866156789, "grad_norm": 1.2188429832458496, "learning_rate": 5e-06, "loss": 0.0897, "step": 5445 }, { "epoch": 1.041300191204589, "grad_norm": 3.023515224456787, "learning_rate": 5e-06, "loss": 0.4011, "step": 5446 }, { "epoch": 1.041491395793499, "grad_norm": 2.3562259674072266, "learning_rate": 5e-06, "loss": 0.1399, "step": 5447 }, { "epoch": 1.0416826003824091, "grad_norm": 1.0878733396530151, "learning_rate": 5e-06, "loss": 0.0712, "step": 5448 }, { "epoch": 1.0418738049713192, "grad_norm": 2.7172553539276123, "learning_rate": 5e-06, "loss": 0.1926, "step": 5449 }, { "epoch": 1.0420650095602295, "grad_norm": 2.0290565490722656, "learning_rate": 5e-06, "loss": 0.2468, "step": 5450 }, { "epoch": 1.0422562141491396, "grad_norm": 0.8205150961875916, "learning_rate": 5e-06, "loss": 0.0483, "step": 5451 }, { "epoch": 1.0424474187380497, "grad_norm": 1.924567699432373, "learning_rate": 5e-06, "loss": 0.2065, "step": 5452 }, { "epoch": 1.0426386233269598, "grad_norm": 0.8093776702880859, "learning_rate": 5e-06, "loss": 0.0619, "step": 5453 }, { "epoch": 1.0428298279158699, "grad_norm": 1.8981060981750488, "learning_rate": 5e-06, "loss": 0.1608, "step": 5454 }, { "epoch": 1.0430210325047802, "grad_norm": 0.724181056022644, "learning_rate": 5e-06, "loss": 0.0284, "step": 5455 }, { "epoch": 1.0432122370936903, "grad_norm": 1.9111442565917969, "learning_rate": 5e-06, "loss": 0.1779, "step": 5456 }, { "epoch": 1.0434034416826004, "grad_norm": 1.8289096355438232, "learning_rate": 5e-06, "loss": 0.1876, "step": 5457 }, { "epoch": 1.0435946462715104, "grad_norm": 2.535822629928589, "learning_rate": 5e-06, "loss": 0.2191, "step": 5458 }, { "epoch": 1.0437858508604207, "grad_norm": 1.2734262943267822, "learning_rate": 5e-06, "loss": 0.0492, "step": 5459 }, { "epoch": 1.0439770554493308, "grad_norm": 0.6029118895530701, "learning_rate": 5e-06, "loss": 0.0392, "step": 5460 }, { "epoch": 1.044168260038241, "grad_norm": 1.2994762659072876, "learning_rate": 5e-06, "loss": 0.058, "step": 5461 }, { "epoch": 1.044359464627151, "grad_norm": 1.6217557191848755, "learning_rate": 5e-06, "loss": 0.1581, "step": 5462 }, { "epoch": 1.044550669216061, "grad_norm": 1.316620111465454, "learning_rate": 5e-06, "loss": 0.0736, "step": 5463 }, { "epoch": 1.0447418738049714, "grad_norm": 1.5537855625152588, "learning_rate": 5e-06, "loss": 0.1094, "step": 5464 }, { "epoch": 1.0449330783938815, "grad_norm": 0.6778096556663513, "learning_rate": 5e-06, "loss": 0.0312, "step": 5465 }, { "epoch": 1.0451242829827916, "grad_norm": 1.1639436483383179, "learning_rate": 5e-06, "loss": 0.0931, "step": 5466 }, { "epoch": 1.0453154875717017, "grad_norm": 1.2169827222824097, "learning_rate": 5e-06, "loss": 0.035, "step": 5467 }, { "epoch": 1.0455066921606118, "grad_norm": 3.1080920696258545, "learning_rate": 5e-06, "loss": 0.2154, "step": 5468 }, { "epoch": 1.045697896749522, "grad_norm": 1.4423012733459473, "learning_rate": 5e-06, "loss": 0.1196, "step": 5469 }, { "epoch": 1.0458891013384322, "grad_norm": 1.3293002843856812, "learning_rate": 5e-06, "loss": 0.112, "step": 5470 }, { "epoch": 1.0460803059273422, "grad_norm": 4.677468299865723, "learning_rate": 5e-06, "loss": 0.1826, "step": 5471 }, { "epoch": 1.0462715105162523, "grad_norm": 1.4991474151611328, "learning_rate": 5e-06, "loss": 0.1735, "step": 5472 }, { "epoch": 1.0464627151051624, "grad_norm": 0.8098369836807251, "learning_rate": 5e-06, "loss": 0.0329, "step": 5473 }, { "epoch": 1.0466539196940727, "grad_norm": 1.0252405405044556, "learning_rate": 5e-06, "loss": 0.0512, "step": 5474 }, { "epoch": 1.0468451242829828, "grad_norm": 2.2641191482543945, "learning_rate": 5e-06, "loss": 0.263, "step": 5475 }, { "epoch": 1.047036328871893, "grad_norm": 2.4479317665100098, "learning_rate": 5e-06, "loss": 0.1826, "step": 5476 }, { "epoch": 1.047227533460803, "grad_norm": 1.0218003988265991, "learning_rate": 5e-06, "loss": 0.0605, "step": 5477 }, { "epoch": 1.0474187380497133, "grad_norm": 1.1280349493026733, "learning_rate": 5e-06, "loss": 0.0742, "step": 5478 }, { "epoch": 1.0476099426386234, "grad_norm": 2.0644752979278564, "learning_rate": 5e-06, "loss": 0.1082, "step": 5479 }, { "epoch": 1.0478011472275335, "grad_norm": 1.3084979057312012, "learning_rate": 5e-06, "loss": 0.0601, "step": 5480 }, { "epoch": 1.0479923518164436, "grad_norm": 1.2070064544677734, "learning_rate": 5e-06, "loss": 0.0784, "step": 5481 }, { "epoch": 1.0481835564053537, "grad_norm": 1.7970086336135864, "learning_rate": 5e-06, "loss": 0.1671, "step": 5482 }, { "epoch": 1.048374760994264, "grad_norm": 2.143139362335205, "learning_rate": 5e-06, "loss": 0.1785, "step": 5483 }, { "epoch": 1.048565965583174, "grad_norm": 1.9227676391601562, "learning_rate": 5e-06, "loss": 0.1499, "step": 5484 }, { "epoch": 1.0487571701720841, "grad_norm": 1.6543242931365967, "learning_rate": 5e-06, "loss": 0.0493, "step": 5485 }, { "epoch": 1.0489483747609942, "grad_norm": 0.5971057415008545, "learning_rate": 5e-06, "loss": 0.0124, "step": 5486 }, { "epoch": 1.0491395793499043, "grad_norm": 2.2405505180358887, "learning_rate": 5e-06, "loss": 0.2293, "step": 5487 }, { "epoch": 1.0493307839388146, "grad_norm": 2.6099843978881836, "learning_rate": 5e-06, "loss": 0.2646, "step": 5488 }, { "epoch": 1.0495219885277247, "grad_norm": 1.8190488815307617, "learning_rate": 5e-06, "loss": 0.1664, "step": 5489 }, { "epoch": 1.0497131931166348, "grad_norm": 2.8577523231506348, "learning_rate": 5e-06, "loss": 0.2463, "step": 5490 }, { "epoch": 1.049904397705545, "grad_norm": 1.3947685956954956, "learning_rate": 5e-06, "loss": 0.115, "step": 5491 }, { "epoch": 1.050095602294455, "grad_norm": 1.2213077545166016, "learning_rate": 5e-06, "loss": 0.0604, "step": 5492 }, { "epoch": 1.0502868068833653, "grad_norm": 1.0234959125518799, "learning_rate": 5e-06, "loss": 0.0541, "step": 5493 }, { "epoch": 1.0504780114722754, "grad_norm": 2.8558273315429688, "learning_rate": 5e-06, "loss": 0.266, "step": 5494 }, { "epoch": 1.0506692160611855, "grad_norm": 1.4367798566818237, "learning_rate": 5e-06, "loss": 0.1352, "step": 5495 }, { "epoch": 1.0508604206500956, "grad_norm": 1.1142265796661377, "learning_rate": 5e-06, "loss": 0.0389, "step": 5496 }, { "epoch": 1.0510516252390056, "grad_norm": 2.527897596359253, "learning_rate": 5e-06, "loss": 0.1458, "step": 5497 }, { "epoch": 1.051242829827916, "grad_norm": 1.8113329410552979, "learning_rate": 5e-06, "loss": 0.1046, "step": 5498 }, { "epoch": 1.051434034416826, "grad_norm": 1.4811005592346191, "learning_rate": 5e-06, "loss": 0.1069, "step": 5499 }, { "epoch": 1.0516252390057361, "grad_norm": 1.7655647993087769, "learning_rate": 5e-06, "loss": 0.1356, "step": 5500 }, { "epoch": 1.0516252390057361, "eval_runtime": 834.4534, "eval_samples_per_second": 1.838, "eval_steps_per_second": 0.23, "step": 5500 }, { "epoch": 1.0518164435946462, "grad_norm": 1.7707321643829346, "learning_rate": 5e-06, "loss": 0.1367, "step": 5501 }, { "epoch": 1.0520076481835563, "grad_norm": 1.482149362564087, "learning_rate": 5e-06, "loss": 0.0581, "step": 5502 }, { "epoch": 1.0521988527724666, "grad_norm": 0.7621336579322815, "learning_rate": 5e-06, "loss": 0.057, "step": 5503 }, { "epoch": 1.0523900573613767, "grad_norm": 1.2675679922103882, "learning_rate": 5e-06, "loss": 0.0503, "step": 5504 }, { "epoch": 1.0525812619502868, "grad_norm": 1.968985676765442, "learning_rate": 5e-06, "loss": 0.0581, "step": 5505 }, { "epoch": 1.0527724665391969, "grad_norm": 2.8859829902648926, "learning_rate": 5e-06, "loss": 0.2671, "step": 5506 }, { "epoch": 1.0529636711281072, "grad_norm": 0.8013049960136414, "learning_rate": 5e-06, "loss": 0.045, "step": 5507 }, { "epoch": 1.0531548757170173, "grad_norm": 1.266425609588623, "learning_rate": 5e-06, "loss": 0.0874, "step": 5508 }, { "epoch": 1.0533460803059274, "grad_norm": 2.2374284267425537, "learning_rate": 5e-06, "loss": 0.0577, "step": 5509 }, { "epoch": 1.0535372848948374, "grad_norm": 1.1194673776626587, "learning_rate": 5e-06, "loss": 0.0354, "step": 5510 }, { "epoch": 1.0537284894837475, "grad_norm": 1.0510578155517578, "learning_rate": 5e-06, "loss": 0.03, "step": 5511 }, { "epoch": 1.0539196940726578, "grad_norm": 1.7931699752807617, "learning_rate": 5e-06, "loss": 0.1108, "step": 5512 }, { "epoch": 1.054110898661568, "grad_norm": 1.9386789798736572, "learning_rate": 5e-06, "loss": 0.0997, "step": 5513 }, { "epoch": 1.054302103250478, "grad_norm": 0.9754608869552612, "learning_rate": 5e-06, "loss": 0.0676, "step": 5514 }, { "epoch": 1.054493307839388, "grad_norm": 1.582482099533081, "learning_rate": 5e-06, "loss": 0.0941, "step": 5515 }, { "epoch": 1.0546845124282982, "grad_norm": 0.8954532742500305, "learning_rate": 5e-06, "loss": 0.0351, "step": 5516 }, { "epoch": 1.0548757170172085, "grad_norm": 1.9438811540603638, "learning_rate": 5e-06, "loss": 0.1878, "step": 5517 }, { "epoch": 1.0550669216061186, "grad_norm": 2.3192455768585205, "learning_rate": 5e-06, "loss": 0.1108, "step": 5518 }, { "epoch": 1.0552581261950287, "grad_norm": 2.396188974380493, "learning_rate": 5e-06, "loss": 0.2317, "step": 5519 }, { "epoch": 1.0554493307839388, "grad_norm": 1.4901149272918701, "learning_rate": 5e-06, "loss": 0.1238, "step": 5520 }, { "epoch": 1.0556405353728489, "grad_norm": 1.5792475938796997, "learning_rate": 5e-06, "loss": 0.1507, "step": 5521 }, { "epoch": 1.0558317399617592, "grad_norm": 1.3586615324020386, "learning_rate": 5e-06, "loss": 0.1084, "step": 5522 }, { "epoch": 1.0560229445506693, "grad_norm": 1.5009024143218994, "learning_rate": 5e-06, "loss": 0.0418, "step": 5523 }, { "epoch": 1.0562141491395793, "grad_norm": 1.5317716598510742, "learning_rate": 5e-06, "loss": 0.0772, "step": 5524 }, { "epoch": 1.0564053537284894, "grad_norm": 1.119912028312683, "learning_rate": 5e-06, "loss": 0.1205, "step": 5525 }, { "epoch": 1.0565965583173995, "grad_norm": 1.4832195043563843, "learning_rate": 5e-06, "loss": 0.1263, "step": 5526 }, { "epoch": 1.0567877629063098, "grad_norm": 1.1946576833724976, "learning_rate": 5e-06, "loss": 0.0627, "step": 5527 }, { "epoch": 1.05697896749522, "grad_norm": 1.3606984615325928, "learning_rate": 5e-06, "loss": 0.0645, "step": 5528 }, { "epoch": 1.05717017208413, "grad_norm": 0.7441090941429138, "learning_rate": 5e-06, "loss": 0.0384, "step": 5529 }, { "epoch": 1.05736137667304, "grad_norm": 1.1623265743255615, "learning_rate": 5e-06, "loss": 0.0436, "step": 5530 }, { "epoch": 1.0575525812619504, "grad_norm": 1.458216667175293, "learning_rate": 5e-06, "loss": 0.1655, "step": 5531 }, { "epoch": 1.0577437858508605, "grad_norm": 1.6108118295669556, "learning_rate": 5e-06, "loss": 0.1881, "step": 5532 }, { "epoch": 1.0579349904397706, "grad_norm": 1.7551403045654297, "learning_rate": 5e-06, "loss": 0.1642, "step": 5533 }, { "epoch": 1.0581261950286807, "grad_norm": 1.0185822248458862, "learning_rate": 5e-06, "loss": 0.055, "step": 5534 }, { "epoch": 1.0583173996175907, "grad_norm": 0.6421904563903809, "learning_rate": 5e-06, "loss": 0.0271, "step": 5535 }, { "epoch": 1.058508604206501, "grad_norm": 1.085301160812378, "learning_rate": 5e-06, "loss": 0.0458, "step": 5536 }, { "epoch": 1.0586998087954111, "grad_norm": 1.3988016843795776, "learning_rate": 5e-06, "loss": 0.0921, "step": 5537 }, { "epoch": 1.0588910133843212, "grad_norm": 3.1126415729522705, "learning_rate": 5e-06, "loss": 0.2017, "step": 5538 }, { "epoch": 1.0590822179732313, "grad_norm": 1.5114021301269531, "learning_rate": 5e-06, "loss": 0.0989, "step": 5539 }, { "epoch": 1.0592734225621414, "grad_norm": 1.0168532133102417, "learning_rate": 5e-06, "loss": 0.0519, "step": 5540 }, { "epoch": 1.0594646271510517, "grad_norm": 0.8101119995117188, "learning_rate": 5e-06, "loss": 0.0505, "step": 5541 }, { "epoch": 1.0596558317399618, "grad_norm": 1.7537641525268555, "learning_rate": 5e-06, "loss": 0.0264, "step": 5542 }, { "epoch": 1.059847036328872, "grad_norm": 1.9035402536392212, "learning_rate": 5e-06, "loss": 0.168, "step": 5543 }, { "epoch": 1.060038240917782, "grad_norm": 2.1393039226531982, "learning_rate": 5e-06, "loss": 0.2744, "step": 5544 }, { "epoch": 1.060229445506692, "grad_norm": 1.2142505645751953, "learning_rate": 5e-06, "loss": 0.0952, "step": 5545 }, { "epoch": 1.0604206500956024, "grad_norm": 1.868961215019226, "learning_rate": 5e-06, "loss": 0.0702, "step": 5546 }, { "epoch": 1.0606118546845125, "grad_norm": 1.7776259183883667, "learning_rate": 5e-06, "loss": 0.198, "step": 5547 }, { "epoch": 1.0608030592734226, "grad_norm": 0.7262550592422485, "learning_rate": 5e-06, "loss": 0.0306, "step": 5548 }, { "epoch": 1.0609942638623326, "grad_norm": 1.554850697517395, "learning_rate": 5e-06, "loss": 0.0685, "step": 5549 }, { "epoch": 1.0611854684512427, "grad_norm": 2.132505178451538, "learning_rate": 5e-06, "loss": 0.3059, "step": 5550 }, { "epoch": 1.061376673040153, "grad_norm": 0.8302657008171082, "learning_rate": 5e-06, "loss": 0.0978, "step": 5551 }, { "epoch": 1.0615678776290631, "grad_norm": 1.9379445314407349, "learning_rate": 5e-06, "loss": 0.1882, "step": 5552 }, { "epoch": 1.0617590822179732, "grad_norm": 1.474511742591858, "learning_rate": 5e-06, "loss": 0.1236, "step": 5553 }, { "epoch": 1.0619502868068833, "grad_norm": 2.032550573348999, "learning_rate": 5e-06, "loss": 0.1122, "step": 5554 }, { "epoch": 1.0621414913957934, "grad_norm": 1.345967411994934, "learning_rate": 5e-06, "loss": 0.0447, "step": 5555 }, { "epoch": 1.0623326959847037, "grad_norm": 2.1029436588287354, "learning_rate": 5e-06, "loss": 0.2457, "step": 5556 }, { "epoch": 1.0625239005736138, "grad_norm": 1.188673734664917, "learning_rate": 5e-06, "loss": 0.0634, "step": 5557 }, { "epoch": 1.0627151051625239, "grad_norm": 0.6115692257881165, "learning_rate": 5e-06, "loss": 0.0683, "step": 5558 }, { "epoch": 1.062906309751434, "grad_norm": 1.1898462772369385, "learning_rate": 5e-06, "loss": 0.042, "step": 5559 }, { "epoch": 1.063097514340344, "grad_norm": 2.9459636211395264, "learning_rate": 5e-06, "loss": 0.1648, "step": 5560 }, { "epoch": 1.0632887189292544, "grad_norm": 1.1826893091201782, "learning_rate": 5e-06, "loss": 0.0368, "step": 5561 }, { "epoch": 1.0634799235181644, "grad_norm": 2.073742389678955, "learning_rate": 5e-06, "loss": 0.2756, "step": 5562 }, { "epoch": 1.0636711281070745, "grad_norm": 1.3164788484573364, "learning_rate": 5e-06, "loss": 0.1287, "step": 5563 }, { "epoch": 1.0638623326959846, "grad_norm": 1.7094725370407104, "learning_rate": 5e-06, "loss": 0.2081, "step": 5564 }, { "epoch": 1.064053537284895, "grad_norm": 1.2266261577606201, "learning_rate": 5e-06, "loss": 0.0791, "step": 5565 }, { "epoch": 1.064244741873805, "grad_norm": 0.5611611008644104, "learning_rate": 5e-06, "loss": 0.0304, "step": 5566 }, { "epoch": 1.064435946462715, "grad_norm": 1.0190863609313965, "learning_rate": 5e-06, "loss": 0.0428, "step": 5567 }, { "epoch": 1.0646271510516252, "grad_norm": 1.5948188304901123, "learning_rate": 5e-06, "loss": 0.1038, "step": 5568 }, { "epoch": 1.0648183556405353, "grad_norm": 1.694000005722046, "learning_rate": 5e-06, "loss": 0.1146, "step": 5569 }, { "epoch": 1.0650095602294456, "grad_norm": 2.386840581893921, "learning_rate": 5e-06, "loss": 0.1727, "step": 5570 }, { "epoch": 1.0652007648183557, "grad_norm": 2.1206300258636475, "learning_rate": 5e-06, "loss": 0.1712, "step": 5571 }, { "epoch": 1.0653919694072658, "grad_norm": 1.195825457572937, "learning_rate": 5e-06, "loss": 0.0482, "step": 5572 }, { "epoch": 1.0655831739961759, "grad_norm": 0.9524969458580017, "learning_rate": 5e-06, "loss": 0.0435, "step": 5573 }, { "epoch": 1.065774378585086, "grad_norm": 1.50506591796875, "learning_rate": 5e-06, "loss": 0.1009, "step": 5574 }, { "epoch": 1.0659655831739963, "grad_norm": 2.291487216949463, "learning_rate": 5e-06, "loss": 0.2958, "step": 5575 }, { "epoch": 1.0661567877629063, "grad_norm": 1.8258750438690186, "learning_rate": 5e-06, "loss": 0.2111, "step": 5576 }, { "epoch": 1.0663479923518164, "grad_norm": 1.87051260471344, "learning_rate": 5e-06, "loss": 0.1976, "step": 5577 }, { "epoch": 1.0665391969407265, "grad_norm": 2.5715811252593994, "learning_rate": 5e-06, "loss": 0.1482, "step": 5578 }, { "epoch": 1.0667304015296368, "grad_norm": 0.7487394213676453, "learning_rate": 5e-06, "loss": 0.0376, "step": 5579 }, { "epoch": 1.066921606118547, "grad_norm": 2.3859152793884277, "learning_rate": 5e-06, "loss": 0.0333, "step": 5580 }, { "epoch": 1.067112810707457, "grad_norm": 1.7255477905273438, "learning_rate": 5e-06, "loss": 0.072, "step": 5581 }, { "epoch": 1.067304015296367, "grad_norm": 2.3104193210601807, "learning_rate": 5e-06, "loss": 0.3214, "step": 5582 }, { "epoch": 1.0674952198852772, "grad_norm": 1.6485390663146973, "learning_rate": 5e-06, "loss": 0.1561, "step": 5583 }, { "epoch": 1.0676864244741875, "grad_norm": 0.9040722846984863, "learning_rate": 5e-06, "loss": 0.0759, "step": 5584 }, { "epoch": 1.0678776290630976, "grad_norm": 0.904682993888855, "learning_rate": 5e-06, "loss": 0.0422, "step": 5585 }, { "epoch": 1.0680688336520077, "grad_norm": 1.8263367414474487, "learning_rate": 5e-06, "loss": 0.0747, "step": 5586 }, { "epoch": 1.0682600382409178, "grad_norm": 2.2079832553863525, "learning_rate": 5e-06, "loss": 0.3012, "step": 5587 }, { "epoch": 1.0684512428298278, "grad_norm": 1.0028715133666992, "learning_rate": 5e-06, "loss": 0.0642, "step": 5588 }, { "epoch": 1.0686424474187382, "grad_norm": 2.1110336780548096, "learning_rate": 5e-06, "loss": 0.1711, "step": 5589 }, { "epoch": 1.0688336520076482, "grad_norm": 1.1553751230239868, "learning_rate": 5e-06, "loss": 0.0648, "step": 5590 }, { "epoch": 1.0690248565965583, "grad_norm": 1.4524354934692383, "learning_rate": 5e-06, "loss": 0.0671, "step": 5591 }, { "epoch": 1.0692160611854684, "grad_norm": 1.1088978052139282, "learning_rate": 5e-06, "loss": 0.0488, "step": 5592 }, { "epoch": 1.0694072657743785, "grad_norm": 1.0848307609558105, "learning_rate": 5e-06, "loss": 0.0353, "step": 5593 }, { "epoch": 1.0695984703632888, "grad_norm": 1.9532465934753418, "learning_rate": 5e-06, "loss": 0.2058, "step": 5594 }, { "epoch": 1.069789674952199, "grad_norm": 2.134166955947876, "learning_rate": 5e-06, "loss": 0.1638, "step": 5595 }, { "epoch": 1.069980879541109, "grad_norm": 0.9889646172523499, "learning_rate": 5e-06, "loss": 0.076, "step": 5596 }, { "epoch": 1.070172084130019, "grad_norm": 4.0119452476501465, "learning_rate": 5e-06, "loss": 0.2381, "step": 5597 }, { "epoch": 1.0703632887189292, "grad_norm": 1.1719545125961304, "learning_rate": 5e-06, "loss": 0.0532, "step": 5598 }, { "epoch": 1.0705544933078395, "grad_norm": 0.7966266870498657, "learning_rate": 5e-06, "loss": 0.0488, "step": 5599 }, { "epoch": 1.0707456978967496, "grad_norm": 2.092799186706543, "learning_rate": 5e-06, "loss": 0.2781, "step": 5600 }, { "epoch": 1.0709369024856596, "grad_norm": 1.770919680595398, "learning_rate": 5e-06, "loss": 0.2382, "step": 5601 }, { "epoch": 1.0711281070745697, "grad_norm": 1.2337734699249268, "learning_rate": 5e-06, "loss": 0.1008, "step": 5602 }, { "epoch": 1.0713193116634798, "grad_norm": 1.2285529375076294, "learning_rate": 5e-06, "loss": 0.0923, "step": 5603 }, { "epoch": 1.0715105162523901, "grad_norm": 0.8008958101272583, "learning_rate": 5e-06, "loss": 0.0347, "step": 5604 }, { "epoch": 1.0717017208413002, "grad_norm": 1.2167035341262817, "learning_rate": 5e-06, "loss": 0.0495, "step": 5605 }, { "epoch": 1.0718929254302103, "grad_norm": 2.4168663024902344, "learning_rate": 5e-06, "loss": 0.3622, "step": 5606 }, { "epoch": 1.0720841300191204, "grad_norm": 2.050773859024048, "learning_rate": 5e-06, "loss": 0.3046, "step": 5607 }, { "epoch": 1.0722753346080305, "grad_norm": 2.666419267654419, "learning_rate": 5e-06, "loss": 0.1967, "step": 5608 }, { "epoch": 1.0724665391969408, "grad_norm": 0.6062523722648621, "learning_rate": 5e-06, "loss": 0.0457, "step": 5609 }, { "epoch": 1.0726577437858509, "grad_norm": 1.7348066568374634, "learning_rate": 5e-06, "loss": 0.0553, "step": 5610 }, { "epoch": 1.072848948374761, "grad_norm": 2.2407853603363037, "learning_rate": 5e-06, "loss": 0.0718, "step": 5611 }, { "epoch": 1.073040152963671, "grad_norm": 2.270958662033081, "learning_rate": 5e-06, "loss": 0.1383, "step": 5612 }, { "epoch": 1.0732313575525811, "grad_norm": 2.6858391761779785, "learning_rate": 5e-06, "loss": 0.3042, "step": 5613 }, { "epoch": 1.0734225621414915, "grad_norm": 2.1360089778900146, "learning_rate": 5e-06, "loss": 0.2963, "step": 5614 }, { "epoch": 1.0736137667304015, "grad_norm": 1.7626913785934448, "learning_rate": 5e-06, "loss": 0.1656, "step": 5615 }, { "epoch": 1.0738049713193116, "grad_norm": 0.7246912717819214, "learning_rate": 5e-06, "loss": 0.0466, "step": 5616 }, { "epoch": 1.0739961759082217, "grad_norm": 1.5797908306121826, "learning_rate": 5e-06, "loss": 0.0513, "step": 5617 }, { "epoch": 1.074187380497132, "grad_norm": 1.0937782526016235, "learning_rate": 5e-06, "loss": 0.066, "step": 5618 }, { "epoch": 1.0743785850860421, "grad_norm": 1.6355293989181519, "learning_rate": 5e-06, "loss": 0.2058, "step": 5619 }, { "epoch": 1.0745697896749522, "grad_norm": 1.8476656675338745, "learning_rate": 5e-06, "loss": 0.2098, "step": 5620 }, { "epoch": 1.0747609942638623, "grad_norm": 1.877700686454773, "learning_rate": 5e-06, "loss": 0.0347, "step": 5621 }, { "epoch": 1.0749521988527724, "grad_norm": 1.5795397758483887, "learning_rate": 5e-06, "loss": 0.092, "step": 5622 }, { "epoch": 1.0751434034416827, "grad_norm": 1.5501025915145874, "learning_rate": 5e-06, "loss": 0.1344, "step": 5623 }, { "epoch": 1.0753346080305928, "grad_norm": 1.7347816228866577, "learning_rate": 5e-06, "loss": 0.0974, "step": 5624 }, { "epoch": 1.0755258126195029, "grad_norm": 2.559631109237671, "learning_rate": 5e-06, "loss": 0.3785, "step": 5625 }, { "epoch": 1.075717017208413, "grad_norm": 1.182547688484192, "learning_rate": 5e-06, "loss": 0.0788, "step": 5626 }, { "epoch": 1.075908221797323, "grad_norm": 1.3121072053909302, "learning_rate": 5e-06, "loss": 0.0665, "step": 5627 }, { "epoch": 1.0760994263862333, "grad_norm": 1.6444857120513916, "learning_rate": 5e-06, "loss": 0.0654, "step": 5628 }, { "epoch": 1.0762906309751434, "grad_norm": 1.5076884031295776, "learning_rate": 5e-06, "loss": 0.0763, "step": 5629 }, { "epoch": 1.0764818355640535, "grad_norm": 1.1843534708023071, "learning_rate": 5e-06, "loss": 0.037, "step": 5630 }, { "epoch": 1.0766730401529636, "grad_norm": 1.5834217071533203, "learning_rate": 5e-06, "loss": 0.1003, "step": 5631 }, { "epoch": 1.076864244741874, "grad_norm": 2.579793691635132, "learning_rate": 5e-06, "loss": 0.2568, "step": 5632 }, { "epoch": 1.077055449330784, "grad_norm": 1.1158941984176636, "learning_rate": 5e-06, "loss": 0.0999, "step": 5633 }, { "epoch": 1.077246653919694, "grad_norm": 1.8821864128112793, "learning_rate": 5e-06, "loss": 0.1314, "step": 5634 }, { "epoch": 1.0774378585086042, "grad_norm": 1.1883410215377808, "learning_rate": 5e-06, "loss": 0.0513, "step": 5635 }, { "epoch": 1.0776290630975143, "grad_norm": 2.1233952045440674, "learning_rate": 5e-06, "loss": 0.0766, "step": 5636 }, { "epoch": 1.0778202676864246, "grad_norm": 2.0192222595214844, "learning_rate": 5e-06, "loss": 0.2886, "step": 5637 }, { "epoch": 1.0780114722753347, "grad_norm": 1.116824746131897, "learning_rate": 5e-06, "loss": 0.081, "step": 5638 }, { "epoch": 1.0782026768642448, "grad_norm": 1.6397048234939575, "learning_rate": 5e-06, "loss": 0.0968, "step": 5639 }, { "epoch": 1.0783938814531548, "grad_norm": 0.804574191570282, "learning_rate": 5e-06, "loss": 0.0296, "step": 5640 }, { "epoch": 1.078585086042065, "grad_norm": 0.6931762099266052, "learning_rate": 5e-06, "loss": 0.0159, "step": 5641 }, { "epoch": 1.0787762906309752, "grad_norm": 2.571634531021118, "learning_rate": 5e-06, "loss": 0.1144, "step": 5642 }, { "epoch": 1.0789674952198853, "grad_norm": 1.3164654970169067, "learning_rate": 5e-06, "loss": 0.0838, "step": 5643 }, { "epoch": 1.0791586998087954, "grad_norm": 1.9440586566925049, "learning_rate": 5e-06, "loss": 0.2427, "step": 5644 }, { "epoch": 1.0793499043977055, "grad_norm": 1.7131543159484863, "learning_rate": 5e-06, "loss": 0.1543, "step": 5645 }, { "epoch": 1.0795411089866156, "grad_norm": 0.7819735407829285, "learning_rate": 5e-06, "loss": 0.062, "step": 5646 }, { "epoch": 1.079732313575526, "grad_norm": 1.6629984378814697, "learning_rate": 5e-06, "loss": 0.0666, "step": 5647 }, { "epoch": 1.079923518164436, "grad_norm": 1.143560528755188, "learning_rate": 5e-06, "loss": 0.0335, "step": 5648 }, { "epoch": 1.080114722753346, "grad_norm": 1.9708220958709717, "learning_rate": 5e-06, "loss": 0.1641, "step": 5649 }, { "epoch": 1.0803059273422562, "grad_norm": 1.5629487037658691, "learning_rate": 5e-06, "loss": 0.2152, "step": 5650 }, { "epoch": 1.0804971319311663, "grad_norm": 1.2981854677200317, "learning_rate": 5e-06, "loss": 0.0818, "step": 5651 }, { "epoch": 1.0806883365200766, "grad_norm": 2.244119644165039, "learning_rate": 5e-06, "loss": 0.2363, "step": 5652 }, { "epoch": 1.0808795411089867, "grad_norm": 0.7663077116012573, "learning_rate": 5e-06, "loss": 0.0666, "step": 5653 }, { "epoch": 1.0810707456978967, "grad_norm": 0.9932133555412292, "learning_rate": 5e-06, "loss": 0.0601, "step": 5654 }, { "epoch": 1.0812619502868068, "grad_norm": 0.8187389969825745, "learning_rate": 5e-06, "loss": 0.0233, "step": 5655 }, { "epoch": 1.081453154875717, "grad_norm": 1.5282580852508545, "learning_rate": 5e-06, "loss": 0.1785, "step": 5656 }, { "epoch": 1.0816443594646272, "grad_norm": 1.7210948467254639, "learning_rate": 5e-06, "loss": 0.1786, "step": 5657 }, { "epoch": 1.0818355640535373, "grad_norm": 0.6434665322303772, "learning_rate": 5e-06, "loss": 0.0408, "step": 5658 }, { "epoch": 1.0820267686424474, "grad_norm": 1.558206558227539, "learning_rate": 5e-06, "loss": 0.1026, "step": 5659 }, { "epoch": 1.0822179732313575, "grad_norm": 1.6600244045257568, "learning_rate": 5e-06, "loss": 0.0921, "step": 5660 }, { "epoch": 1.0824091778202676, "grad_norm": 1.1153446435928345, "learning_rate": 5e-06, "loss": 0.024, "step": 5661 }, { "epoch": 1.0826003824091779, "grad_norm": 2.0310747623443604, "learning_rate": 5e-06, "loss": 0.1647, "step": 5662 }, { "epoch": 1.082791586998088, "grad_norm": 1.8215316534042358, "learning_rate": 5e-06, "loss": 0.1737, "step": 5663 }, { "epoch": 1.082982791586998, "grad_norm": 1.404730200767517, "learning_rate": 5e-06, "loss": 0.0599, "step": 5664 }, { "epoch": 1.0831739961759081, "grad_norm": 5.3708176612854, "learning_rate": 5e-06, "loss": 0.0709, "step": 5665 }, { "epoch": 1.0833652007648185, "grad_norm": 2.5323567390441895, "learning_rate": 5e-06, "loss": 0.1791, "step": 5666 }, { "epoch": 1.0835564053537285, "grad_norm": 0.777966320514679, "learning_rate": 5e-06, "loss": 0.0265, "step": 5667 }, { "epoch": 1.0837476099426386, "grad_norm": 1.7438549995422363, "learning_rate": 5e-06, "loss": 0.0811, "step": 5668 }, { "epoch": 1.0839388145315487, "grad_norm": 1.896105408668518, "learning_rate": 5e-06, "loss": 0.0971, "step": 5669 }, { "epoch": 1.0841300191204588, "grad_norm": 1.2311056852340698, "learning_rate": 5e-06, "loss": 0.0739, "step": 5670 }, { "epoch": 1.0843212237093691, "grad_norm": 1.2870060205459595, "learning_rate": 5e-06, "loss": 0.0791, "step": 5671 }, { "epoch": 1.0845124282982792, "grad_norm": 1.5939801931381226, "learning_rate": 5e-06, "loss": 0.035, "step": 5672 }, { "epoch": 1.0847036328871893, "grad_norm": 1.4833265542984009, "learning_rate": 5e-06, "loss": 0.0654, "step": 5673 }, { "epoch": 1.0848948374760994, "grad_norm": 2.980525255203247, "learning_rate": 5e-06, "loss": 0.3089, "step": 5674 }, { "epoch": 1.0850860420650095, "grad_norm": 2.04789137840271, "learning_rate": 5e-06, "loss": 0.1638, "step": 5675 }, { "epoch": 1.0852772466539198, "grad_norm": 1.1447137594223022, "learning_rate": 5e-06, "loss": 0.1109, "step": 5676 }, { "epoch": 1.0854684512428299, "grad_norm": 2.3989462852478027, "learning_rate": 5e-06, "loss": 0.2504, "step": 5677 }, { "epoch": 1.08565965583174, "grad_norm": 1.2400035858154297, "learning_rate": 5e-06, "loss": 0.0553, "step": 5678 }, { "epoch": 1.08585086042065, "grad_norm": 0.6254027485847473, "learning_rate": 5e-06, "loss": 0.0353, "step": 5679 }, { "epoch": 1.0860420650095601, "grad_norm": 0.8050343990325928, "learning_rate": 5e-06, "loss": 0.0261, "step": 5680 }, { "epoch": 1.0862332695984704, "grad_norm": 2.053995132446289, "learning_rate": 5e-06, "loss": 0.3031, "step": 5681 }, { "epoch": 1.0864244741873805, "grad_norm": 2.055222272872925, "learning_rate": 5e-06, "loss": 0.0618, "step": 5682 }, { "epoch": 1.0866156787762906, "grad_norm": 1.6199257373809814, "learning_rate": 5e-06, "loss": 0.1672, "step": 5683 }, { "epoch": 1.0868068833652007, "grad_norm": 1.113048791885376, "learning_rate": 5e-06, "loss": 0.0616, "step": 5684 }, { "epoch": 1.086998087954111, "grad_norm": 0.8362628817558289, "learning_rate": 5e-06, "loss": 0.0341, "step": 5685 }, { "epoch": 1.087189292543021, "grad_norm": 5.398261070251465, "learning_rate": 5e-06, "loss": 0.1019, "step": 5686 }, { "epoch": 1.0873804971319312, "grad_norm": 1.4154852628707886, "learning_rate": 5e-06, "loss": 0.0897, "step": 5687 }, { "epoch": 1.0875717017208413, "grad_norm": 1.3424711227416992, "learning_rate": 5e-06, "loss": 0.1103, "step": 5688 }, { "epoch": 1.0877629063097514, "grad_norm": 1.525865912437439, "learning_rate": 5e-06, "loss": 0.0945, "step": 5689 }, { "epoch": 1.0879541108986617, "grad_norm": 1.0463107824325562, "learning_rate": 5e-06, "loss": 0.0452, "step": 5690 }, { "epoch": 1.0881453154875718, "grad_norm": 2.542323589324951, "learning_rate": 5e-06, "loss": 0.2381, "step": 5691 }, { "epoch": 1.0883365200764819, "grad_norm": 1.3651113510131836, "learning_rate": 5e-06, "loss": 0.0745, "step": 5692 }, { "epoch": 1.088527724665392, "grad_norm": 1.5302486419677734, "learning_rate": 5e-06, "loss": 0.1264, "step": 5693 }, { "epoch": 1.088718929254302, "grad_norm": 1.7708359956741333, "learning_rate": 5e-06, "loss": 0.1359, "step": 5694 }, { "epoch": 1.0889101338432123, "grad_norm": 1.8701834678649902, "learning_rate": 5e-06, "loss": 0.2137, "step": 5695 }, { "epoch": 1.0891013384321224, "grad_norm": 2.5269713401794434, "learning_rate": 5e-06, "loss": 0.0765, "step": 5696 }, { "epoch": 1.0892925430210325, "grad_norm": 1.7278193235397339, "learning_rate": 5e-06, "loss": 0.0885, "step": 5697 }, { "epoch": 1.0894837476099426, "grad_norm": 1.453526496887207, "learning_rate": 5e-06, "loss": 0.0511, "step": 5698 }, { "epoch": 1.0896749521988527, "grad_norm": 5.5870819091796875, "learning_rate": 5e-06, "loss": 0.2771, "step": 5699 }, { "epoch": 1.089866156787763, "grad_norm": 1.4349433183670044, "learning_rate": 5e-06, "loss": 0.1705, "step": 5700 }, { "epoch": 1.090057361376673, "grad_norm": 1.717117190361023, "learning_rate": 5e-06, "loss": 0.1773, "step": 5701 }, { "epoch": 1.0902485659655832, "grad_norm": 1.0715373754501343, "learning_rate": 5e-06, "loss": 0.0745, "step": 5702 }, { "epoch": 1.0904397705544933, "grad_norm": 1.4231574535369873, "learning_rate": 5e-06, "loss": 0.0805, "step": 5703 }, { "epoch": 1.0906309751434033, "grad_norm": 2.196079730987549, "learning_rate": 5e-06, "loss": 0.1501, "step": 5704 }, { "epoch": 1.0908221797323137, "grad_norm": 1.016573429107666, "learning_rate": 5e-06, "loss": 0.0582, "step": 5705 }, { "epoch": 1.0910133843212237, "grad_norm": 1.7154649496078491, "learning_rate": 5e-06, "loss": 0.2169, "step": 5706 }, { "epoch": 1.0912045889101338, "grad_norm": 1.9732598066329956, "learning_rate": 5e-06, "loss": 0.1264, "step": 5707 }, { "epoch": 1.091395793499044, "grad_norm": 1.975980281829834, "learning_rate": 5e-06, "loss": 0.1932, "step": 5708 }, { "epoch": 1.091586998087954, "grad_norm": 1.5534021854400635, "learning_rate": 5e-06, "loss": 0.0715, "step": 5709 }, { "epoch": 1.0917782026768643, "grad_norm": 1.1966547966003418, "learning_rate": 5e-06, "loss": 0.0663, "step": 5710 }, { "epoch": 1.0919694072657744, "grad_norm": 1.5349960327148438, "learning_rate": 5e-06, "loss": 0.0343, "step": 5711 }, { "epoch": 1.0921606118546845, "grad_norm": 1.9171141386032104, "learning_rate": 5e-06, "loss": 0.0983, "step": 5712 }, { "epoch": 1.0923518164435946, "grad_norm": 2.5431854724884033, "learning_rate": 5e-06, "loss": 0.4502, "step": 5713 }, { "epoch": 1.0925430210325047, "grad_norm": 1.5429447889328003, "learning_rate": 5e-06, "loss": 0.1223, "step": 5714 }, { "epoch": 1.092734225621415, "grad_norm": 2.018566131591797, "learning_rate": 5e-06, "loss": 0.2262, "step": 5715 }, { "epoch": 1.092925430210325, "grad_norm": 1.1679967641830444, "learning_rate": 5e-06, "loss": 0.0731, "step": 5716 }, { "epoch": 1.0931166347992352, "grad_norm": 1.0205624103546143, "learning_rate": 5e-06, "loss": 0.0543, "step": 5717 }, { "epoch": 1.0933078393881452, "grad_norm": 1.0302752256393433, "learning_rate": 5e-06, "loss": 0.0479, "step": 5718 }, { "epoch": 1.0934990439770556, "grad_norm": 2.270318031311035, "learning_rate": 5e-06, "loss": 0.251, "step": 5719 }, { "epoch": 1.0936902485659656, "grad_norm": 1.8095113039016724, "learning_rate": 5e-06, "loss": 0.16, "step": 5720 }, { "epoch": 1.0938814531548757, "grad_norm": 0.7488591074943542, "learning_rate": 5e-06, "loss": 0.0514, "step": 5721 }, { "epoch": 1.0940726577437858, "grad_norm": 1.4656505584716797, "learning_rate": 5e-06, "loss": 0.0985, "step": 5722 }, { "epoch": 1.094263862332696, "grad_norm": 0.7714589238166809, "learning_rate": 5e-06, "loss": 0.0423, "step": 5723 }, { "epoch": 1.0944550669216062, "grad_norm": 2.106182813644409, "learning_rate": 5e-06, "loss": 0.2728, "step": 5724 }, { "epoch": 1.0946462715105163, "grad_norm": 2.305906057357788, "learning_rate": 5e-06, "loss": 0.2627, "step": 5725 }, { "epoch": 1.0948374760994264, "grad_norm": 0.6955013275146484, "learning_rate": 5e-06, "loss": 0.0651, "step": 5726 }, { "epoch": 1.0950286806883365, "grad_norm": 1.7944916486740112, "learning_rate": 5e-06, "loss": 0.173, "step": 5727 }, { "epoch": 1.0952198852772466, "grad_norm": 2.637842893600464, "learning_rate": 5e-06, "loss": 0.1017, "step": 5728 }, { "epoch": 1.0954110898661569, "grad_norm": 0.9952343106269836, "learning_rate": 5e-06, "loss": 0.0618, "step": 5729 }, { "epoch": 1.095602294455067, "grad_norm": 1.2145094871520996, "learning_rate": 5e-06, "loss": 0.0363, "step": 5730 }, { "epoch": 1.095793499043977, "grad_norm": 2.777398109436035, "learning_rate": 5e-06, "loss": 0.1873, "step": 5731 }, { "epoch": 1.0959847036328871, "grad_norm": 0.9324513673782349, "learning_rate": 5e-06, "loss": 0.069, "step": 5732 }, { "epoch": 1.0961759082217972, "grad_norm": 1.6317458152770996, "learning_rate": 5e-06, "loss": 0.1003, "step": 5733 }, { "epoch": 1.0963671128107075, "grad_norm": 1.5389405488967896, "learning_rate": 5e-06, "loss": 0.0722, "step": 5734 }, { "epoch": 1.0965583173996176, "grad_norm": 3.062220335006714, "learning_rate": 5e-06, "loss": 0.1294, "step": 5735 }, { "epoch": 1.0967495219885277, "grad_norm": 1.457374930381775, "learning_rate": 5e-06, "loss": 0.0802, "step": 5736 }, { "epoch": 1.0969407265774378, "grad_norm": 2.036764144897461, "learning_rate": 5e-06, "loss": 0.1513, "step": 5737 }, { "epoch": 1.097131931166348, "grad_norm": 2.1664798259735107, "learning_rate": 5e-06, "loss": 0.2764, "step": 5738 }, { "epoch": 1.0973231357552582, "grad_norm": 1.5670701265335083, "learning_rate": 5e-06, "loss": 0.0703, "step": 5739 }, { "epoch": 1.0975143403441683, "grad_norm": 0.9358064532279968, "learning_rate": 5e-06, "loss": 0.0345, "step": 5740 }, { "epoch": 1.0977055449330784, "grad_norm": 1.4006162881851196, "learning_rate": 5e-06, "loss": 0.0829, "step": 5741 }, { "epoch": 1.0978967495219885, "grad_norm": 1.403589129447937, "learning_rate": 5e-06, "loss": 0.061, "step": 5742 }, { "epoch": 1.0980879541108988, "grad_norm": 1.3776750564575195, "learning_rate": 5e-06, "loss": 0.0641, "step": 5743 }, { "epoch": 1.0982791586998089, "grad_norm": 1.898293375968933, "learning_rate": 5e-06, "loss": 0.1924, "step": 5744 }, { "epoch": 1.098470363288719, "grad_norm": 1.39720618724823, "learning_rate": 5e-06, "loss": 0.0802, "step": 5745 }, { "epoch": 1.098661567877629, "grad_norm": 1.3132373094558716, "learning_rate": 5e-06, "loss": 0.063, "step": 5746 }, { "epoch": 1.0988527724665391, "grad_norm": 1.5557122230529785, "learning_rate": 5e-06, "loss": 0.175, "step": 5747 }, { "epoch": 1.0990439770554494, "grad_norm": 2.059504508972168, "learning_rate": 5e-06, "loss": 0.1491, "step": 5748 }, { "epoch": 1.0992351816443595, "grad_norm": 1.1423454284667969, "learning_rate": 5e-06, "loss": 0.0325, "step": 5749 }, { "epoch": 1.0994263862332696, "grad_norm": 2.8969271183013916, "learning_rate": 5e-06, "loss": 0.4175, "step": 5750 }, { "epoch": 1.0996175908221797, "grad_norm": 1.9927480220794678, "learning_rate": 5e-06, "loss": 0.0938, "step": 5751 }, { "epoch": 1.0998087954110898, "grad_norm": 1.3723220825195312, "learning_rate": 5e-06, "loss": 0.1006, "step": 5752 }, { "epoch": 1.1, "grad_norm": 0.7870752811431885, "learning_rate": 5e-06, "loss": 0.0486, "step": 5753 }, { "epoch": 1.1001912045889102, "grad_norm": 0.7222342491149902, "learning_rate": 5e-06, "loss": 0.0125, "step": 5754 }, { "epoch": 1.1003824091778203, "grad_norm": 1.8501266241073608, "learning_rate": 5e-06, "loss": 0.0527, "step": 5755 }, { "epoch": 1.1005736137667304, "grad_norm": 2.527247905731201, "learning_rate": 5e-06, "loss": 0.2819, "step": 5756 }, { "epoch": 1.1007648183556404, "grad_norm": 2.091769218444824, "learning_rate": 5e-06, "loss": 0.2612, "step": 5757 }, { "epoch": 1.1009560229445507, "grad_norm": 1.6816740036010742, "learning_rate": 5e-06, "loss": 0.1683, "step": 5758 }, { "epoch": 1.1011472275334608, "grad_norm": 1.4403516054153442, "learning_rate": 5e-06, "loss": 0.0814, "step": 5759 }, { "epoch": 1.101338432122371, "grad_norm": 0.9857239723205566, "learning_rate": 5e-06, "loss": 0.0488, "step": 5760 }, { "epoch": 1.101529636711281, "grad_norm": 2.1389448642730713, "learning_rate": 5e-06, "loss": 0.0507, "step": 5761 }, { "epoch": 1.101720841300191, "grad_norm": 1.8587839603424072, "learning_rate": 5e-06, "loss": 0.1574, "step": 5762 }, { "epoch": 1.1019120458891014, "grad_norm": 1.851593017578125, "learning_rate": 5e-06, "loss": 0.1472, "step": 5763 }, { "epoch": 1.1021032504780115, "grad_norm": 1.861345648765564, "learning_rate": 5e-06, "loss": 0.1222, "step": 5764 }, { "epoch": 1.1022944550669216, "grad_norm": 1.249029278755188, "learning_rate": 5e-06, "loss": 0.1045, "step": 5765 }, { "epoch": 1.1024856596558317, "grad_norm": 2.417471408843994, "learning_rate": 5e-06, "loss": 0.2049, "step": 5766 }, { "epoch": 1.1026768642447418, "grad_norm": 1.392234206199646, "learning_rate": 5e-06, "loss": 0.0759, "step": 5767 }, { "epoch": 1.102868068833652, "grad_norm": 1.4488508701324463, "learning_rate": 5e-06, "loss": 0.0945, "step": 5768 }, { "epoch": 1.1030592734225622, "grad_norm": 1.731885552406311, "learning_rate": 5e-06, "loss": 0.1366, "step": 5769 }, { "epoch": 1.1032504780114722, "grad_norm": 1.2528032064437866, "learning_rate": 5e-06, "loss": 0.0849, "step": 5770 }, { "epoch": 1.1034416826003823, "grad_norm": 1.366706132888794, "learning_rate": 5e-06, "loss": 0.1138, "step": 5771 }, { "epoch": 1.1036328871892926, "grad_norm": 1.9706212282180786, "learning_rate": 5e-06, "loss": 0.1266, "step": 5772 }, { "epoch": 1.1038240917782027, "grad_norm": 6.170831680297852, "learning_rate": 5e-06, "loss": 0.1795, "step": 5773 }, { "epoch": 1.1040152963671128, "grad_norm": 1.9874886274337769, "learning_rate": 5e-06, "loss": 0.1312, "step": 5774 }, { "epoch": 1.104206500956023, "grad_norm": 2.058870553970337, "learning_rate": 5e-06, "loss": 0.2049, "step": 5775 }, { "epoch": 1.104397705544933, "grad_norm": 1.8878477811813354, "learning_rate": 5e-06, "loss": 0.2064, "step": 5776 }, { "epoch": 1.1045889101338433, "grad_norm": 0.7636244893074036, "learning_rate": 5e-06, "loss": 0.0408, "step": 5777 }, { "epoch": 1.1047801147227534, "grad_norm": 1.490784764289856, "learning_rate": 5e-06, "loss": 0.1765, "step": 5778 }, { "epoch": 1.1049713193116635, "grad_norm": 0.795039713382721, "learning_rate": 5e-06, "loss": 0.0424, "step": 5779 }, { "epoch": 1.1051625239005736, "grad_norm": 1.5659568309783936, "learning_rate": 5e-06, "loss": 0.0788, "step": 5780 }, { "epoch": 1.1053537284894837, "grad_norm": 0.8034716844558716, "learning_rate": 5e-06, "loss": 0.0485, "step": 5781 }, { "epoch": 1.105544933078394, "grad_norm": 1.0298188924789429, "learning_rate": 5e-06, "loss": 0.0677, "step": 5782 }, { "epoch": 1.105736137667304, "grad_norm": 1.8984624147415161, "learning_rate": 5e-06, "loss": 0.1402, "step": 5783 }, { "epoch": 1.1059273422562141, "grad_norm": 1.9967960119247437, "learning_rate": 5e-06, "loss": 0.1297, "step": 5784 }, { "epoch": 1.1061185468451242, "grad_norm": 1.45951247215271, "learning_rate": 5e-06, "loss": 0.0652, "step": 5785 }, { "epoch": 1.1063097514340343, "grad_norm": 1.4071576595306396, "learning_rate": 5e-06, "loss": 0.0426, "step": 5786 }, { "epoch": 1.1065009560229446, "grad_norm": 3.093616247177124, "learning_rate": 5e-06, "loss": 0.2499, "step": 5787 }, { "epoch": 1.1066921606118547, "grad_norm": 1.2860229015350342, "learning_rate": 5e-06, "loss": 0.0862, "step": 5788 }, { "epoch": 1.1068833652007648, "grad_norm": 1.9504213333129883, "learning_rate": 5e-06, "loss": 0.2004, "step": 5789 }, { "epoch": 1.107074569789675, "grad_norm": 2.4501025676727295, "learning_rate": 5e-06, "loss": 0.1979, "step": 5790 }, { "epoch": 1.1072657743785852, "grad_norm": 3.6061577796936035, "learning_rate": 5e-06, "loss": 0.1549, "step": 5791 }, { "epoch": 1.1074569789674953, "grad_norm": 0.46892136335372925, "learning_rate": 5e-06, "loss": 0.0101, "step": 5792 }, { "epoch": 1.1076481835564054, "grad_norm": 1.5113166570663452, "learning_rate": 5e-06, "loss": 0.0745, "step": 5793 }, { "epoch": 1.1078393881453155, "grad_norm": 2.696958303451538, "learning_rate": 5e-06, "loss": 0.3514, "step": 5794 }, { "epoch": 1.1080305927342256, "grad_norm": 2.0832767486572266, "learning_rate": 5e-06, "loss": 0.1931, "step": 5795 }, { "epoch": 1.1082217973231359, "grad_norm": 1.0346436500549316, "learning_rate": 5e-06, "loss": 0.0442, "step": 5796 }, { "epoch": 1.108413001912046, "grad_norm": 2.7505767345428467, "learning_rate": 5e-06, "loss": 0.134, "step": 5797 }, { "epoch": 1.108604206500956, "grad_norm": 1.7536537647247314, "learning_rate": 5e-06, "loss": 0.1032, "step": 5798 }, { "epoch": 1.1087954110898661, "grad_norm": 0.7006078362464905, "learning_rate": 5e-06, "loss": 0.0423, "step": 5799 }, { "epoch": 1.1089866156787762, "grad_norm": 1.4535229206085205, "learning_rate": 5e-06, "loss": 0.0865, "step": 5800 }, { "epoch": 1.1091778202676865, "grad_norm": 2.1496787071228027, "learning_rate": 5e-06, "loss": 0.2779, "step": 5801 }, { "epoch": 1.1093690248565966, "grad_norm": 2.135427474975586, "learning_rate": 5e-06, "loss": 0.2009, "step": 5802 }, { "epoch": 1.1095602294455067, "grad_norm": 2.1016106605529785, "learning_rate": 5e-06, "loss": 0.111, "step": 5803 }, { "epoch": 1.1097514340344168, "grad_norm": 0.7748586535453796, "learning_rate": 5e-06, "loss": 0.064, "step": 5804 }, { "epoch": 1.1099426386233269, "grad_norm": 1.6331055164337158, "learning_rate": 5e-06, "loss": 0.0694, "step": 5805 }, { "epoch": 1.1101338432122372, "grad_norm": 1.2938159704208374, "learning_rate": 5e-06, "loss": 0.0924, "step": 5806 }, { "epoch": 1.1103250478011473, "grad_norm": 1.352454662322998, "learning_rate": 5e-06, "loss": 0.074, "step": 5807 }, { "epoch": 1.1105162523900574, "grad_norm": 1.8348606824874878, "learning_rate": 5e-06, "loss": 0.2089, "step": 5808 }, { "epoch": 1.1107074569789674, "grad_norm": 1.0210434198379517, "learning_rate": 5e-06, "loss": 0.0382, "step": 5809 }, { "epoch": 1.1108986615678775, "grad_norm": 2.106792449951172, "learning_rate": 5e-06, "loss": 0.1903, "step": 5810 }, { "epoch": 1.1110898661567878, "grad_norm": 1.2394472360610962, "learning_rate": 5e-06, "loss": 0.0562, "step": 5811 }, { "epoch": 1.111281070745698, "grad_norm": 1.395771861076355, "learning_rate": 5e-06, "loss": 0.1461, "step": 5812 }, { "epoch": 1.111472275334608, "grad_norm": 3.2685039043426514, "learning_rate": 5e-06, "loss": 0.1339, "step": 5813 }, { "epoch": 1.111663479923518, "grad_norm": 1.9594659805297852, "learning_rate": 5e-06, "loss": 0.2694, "step": 5814 }, { "epoch": 1.1118546845124282, "grad_norm": 1.7652355432510376, "learning_rate": 5e-06, "loss": 0.0963, "step": 5815 }, { "epoch": 1.1120458891013385, "grad_norm": 2.3200769424438477, "learning_rate": 5e-06, "loss": 0.2094, "step": 5816 }, { "epoch": 1.1122370936902486, "grad_norm": 1.5320895910263062, "learning_rate": 5e-06, "loss": 0.0662, "step": 5817 }, { "epoch": 1.1124282982791587, "grad_norm": 0.7863326668739319, "learning_rate": 5e-06, "loss": 0.044, "step": 5818 }, { "epoch": 1.1126195028680688, "grad_norm": 1.5986088514328003, "learning_rate": 5e-06, "loss": 0.2212, "step": 5819 }, { "epoch": 1.1128107074569789, "grad_norm": 0.8004074096679688, "learning_rate": 5e-06, "loss": 0.0729, "step": 5820 }, { "epoch": 1.1130019120458892, "grad_norm": 1.8215408325195312, "learning_rate": 5e-06, "loss": 0.15, "step": 5821 }, { "epoch": 1.1131931166347993, "grad_norm": 0.8063507080078125, "learning_rate": 5e-06, "loss": 0.0511, "step": 5822 }, { "epoch": 1.1133843212237093, "grad_norm": 0.7384288311004639, "learning_rate": 5e-06, "loss": 0.0304, "step": 5823 }, { "epoch": 1.1135755258126194, "grad_norm": 0.910090446472168, "learning_rate": 5e-06, "loss": 0.0327, "step": 5824 }, { "epoch": 1.1137667304015297, "grad_norm": 2.1890387535095215, "learning_rate": 5e-06, "loss": 0.2288, "step": 5825 }, { "epoch": 1.1139579349904398, "grad_norm": 0.9365002512931824, "learning_rate": 5e-06, "loss": 0.074, "step": 5826 }, { "epoch": 1.11414913957935, "grad_norm": 1.6376757621765137, "learning_rate": 5e-06, "loss": 0.1314, "step": 5827 }, { "epoch": 1.11434034416826, "grad_norm": 0.6365956664085388, "learning_rate": 5e-06, "loss": 0.0318, "step": 5828 }, { "epoch": 1.11453154875717, "grad_norm": 2.4572699069976807, "learning_rate": 5e-06, "loss": 0.1096, "step": 5829 }, { "epoch": 1.1147227533460804, "grad_norm": 0.6337371468544006, "learning_rate": 5e-06, "loss": 0.0204, "step": 5830 }, { "epoch": 1.1149139579349905, "grad_norm": 1.9503843784332275, "learning_rate": 5e-06, "loss": 0.2206, "step": 5831 }, { "epoch": 1.1151051625239006, "grad_norm": 2.674628973007202, "learning_rate": 5e-06, "loss": 0.2115, "step": 5832 }, { "epoch": 1.1152963671128107, "grad_norm": 1.2188706398010254, "learning_rate": 5e-06, "loss": 0.0726, "step": 5833 }, { "epoch": 1.1154875717017207, "grad_norm": 4.067685127258301, "learning_rate": 5e-06, "loss": 0.1759, "step": 5834 }, { "epoch": 1.115678776290631, "grad_norm": 1.1010500192642212, "learning_rate": 5e-06, "loss": 0.0777, "step": 5835 }, { "epoch": 1.1158699808795411, "grad_norm": 1.6503252983093262, "learning_rate": 5e-06, "loss": 0.101, "step": 5836 }, { "epoch": 1.1160611854684512, "grad_norm": 1.9464383125305176, "learning_rate": 5e-06, "loss": 0.2607, "step": 5837 }, { "epoch": 1.1162523900573613, "grad_norm": 2.022468090057373, "learning_rate": 5e-06, "loss": 0.2771, "step": 5838 }, { "epoch": 1.1164435946462714, "grad_norm": 1.6697824001312256, "learning_rate": 5e-06, "loss": 0.1383, "step": 5839 }, { "epoch": 1.1166347992351817, "grad_norm": 2.806473731994629, "learning_rate": 5e-06, "loss": 0.282, "step": 5840 }, { "epoch": 1.1168260038240918, "grad_norm": 1.8859554529190063, "learning_rate": 5e-06, "loss": 0.1404, "step": 5841 }, { "epoch": 1.117017208413002, "grad_norm": 2.372626543045044, "learning_rate": 5e-06, "loss": 0.1272, "step": 5842 }, { "epoch": 1.117208413001912, "grad_norm": 1.7864800691604614, "learning_rate": 5e-06, "loss": 0.093, "step": 5843 }, { "epoch": 1.1173996175908223, "grad_norm": 2.1923136711120605, "learning_rate": 5e-06, "loss": 0.175, "step": 5844 }, { "epoch": 1.1175908221797324, "grad_norm": 1.0703550577163696, "learning_rate": 5e-06, "loss": 0.0707, "step": 5845 }, { "epoch": 1.1177820267686425, "grad_norm": 0.8329720497131348, "learning_rate": 5e-06, "loss": 0.073, "step": 5846 }, { "epoch": 1.1179732313575526, "grad_norm": 1.4551035165786743, "learning_rate": 5e-06, "loss": 0.0402, "step": 5847 }, { "epoch": 1.1181644359464626, "grad_norm": 1.0404424667358398, "learning_rate": 5e-06, "loss": 0.0406, "step": 5848 }, { "epoch": 1.118355640535373, "grad_norm": 1.1678119897842407, "learning_rate": 5e-06, "loss": 0.0683, "step": 5849 }, { "epoch": 1.118546845124283, "grad_norm": 3.122912883758545, "learning_rate": 5e-06, "loss": 0.496, "step": 5850 }, { "epoch": 1.1187380497131931, "grad_norm": 1.2659235000610352, "learning_rate": 5e-06, "loss": 0.0904, "step": 5851 }, { "epoch": 1.1189292543021032, "grad_norm": 2.43969988822937, "learning_rate": 5e-06, "loss": 0.308, "step": 5852 }, { "epoch": 1.1191204588910133, "grad_norm": 1.8691391944885254, "learning_rate": 5e-06, "loss": 0.1474, "step": 5853 }, { "epoch": 1.1193116634799236, "grad_norm": 2.168302059173584, "learning_rate": 5e-06, "loss": 0.0874, "step": 5854 }, { "epoch": 1.1195028680688337, "grad_norm": 1.7108036279678345, "learning_rate": 5e-06, "loss": 0.0426, "step": 5855 }, { "epoch": 1.1196940726577438, "grad_norm": 2.807419538497925, "learning_rate": 5e-06, "loss": 0.3353, "step": 5856 }, { "epoch": 1.1198852772466539, "grad_norm": 1.1391634941101074, "learning_rate": 5e-06, "loss": 0.0533, "step": 5857 }, { "epoch": 1.120076481835564, "grad_norm": 0.9980553984642029, "learning_rate": 5e-06, "loss": 0.0557, "step": 5858 }, { "epoch": 1.1202676864244743, "grad_norm": 1.4949246644973755, "learning_rate": 5e-06, "loss": 0.1786, "step": 5859 }, { "epoch": 1.1204588910133844, "grad_norm": 0.8271896839141846, "learning_rate": 5e-06, "loss": 0.0383, "step": 5860 }, { "epoch": 1.1206500956022944, "grad_norm": 1.2064502239227295, "learning_rate": 5e-06, "loss": 0.0586, "step": 5861 }, { "epoch": 1.1208413001912045, "grad_norm": 1.5888252258300781, "learning_rate": 5e-06, "loss": 0.1418, "step": 5862 }, { "epoch": 1.1210325047801146, "grad_norm": 1.441205382347107, "learning_rate": 5e-06, "loss": 0.1375, "step": 5863 }, { "epoch": 1.121223709369025, "grad_norm": 1.0657639503479004, "learning_rate": 5e-06, "loss": 0.0368, "step": 5864 }, { "epoch": 1.121414913957935, "grad_norm": 1.1127941608428955, "learning_rate": 5e-06, "loss": 0.0692, "step": 5865 }, { "epoch": 1.121606118546845, "grad_norm": 1.5667933225631714, "learning_rate": 5e-06, "loss": 0.0617, "step": 5866 }, { "epoch": 1.1217973231357552, "grad_norm": 1.4817070960998535, "learning_rate": 5e-06, "loss": 0.0875, "step": 5867 }, { "epoch": 1.1219885277246653, "grad_norm": 2.7602312564849854, "learning_rate": 5e-06, "loss": 0.2847, "step": 5868 }, { "epoch": 1.1221797323135756, "grad_norm": 2.3214738368988037, "learning_rate": 5e-06, "loss": 0.3105, "step": 5869 }, { "epoch": 1.1223709369024857, "grad_norm": 1.2657291889190674, "learning_rate": 5e-06, "loss": 0.0898, "step": 5870 }, { "epoch": 1.1225621414913958, "grad_norm": 1.5459098815917969, "learning_rate": 5e-06, "loss": 0.1353, "step": 5871 }, { "epoch": 1.1227533460803059, "grad_norm": 1.9266481399536133, "learning_rate": 5e-06, "loss": 0.1243, "step": 5872 }, { "epoch": 1.122944550669216, "grad_norm": 1.1586101055145264, "learning_rate": 5e-06, "loss": 0.0829, "step": 5873 }, { "epoch": 1.1231357552581263, "grad_norm": 1.7313776016235352, "learning_rate": 5e-06, "loss": 0.0654, "step": 5874 }, { "epoch": 1.1233269598470363, "grad_norm": 2.0912880897521973, "learning_rate": 5e-06, "loss": 0.1978, "step": 5875 }, { "epoch": 1.1235181644359464, "grad_norm": 2.500584840774536, "learning_rate": 5e-06, "loss": 0.2266, "step": 5876 }, { "epoch": 1.1237093690248565, "grad_norm": 1.7160296440124512, "learning_rate": 5e-06, "loss": 0.153, "step": 5877 }, { "epoch": 1.1239005736137668, "grad_norm": 0.9689812660217285, "learning_rate": 5e-06, "loss": 0.0418, "step": 5878 }, { "epoch": 1.124091778202677, "grad_norm": 1.4203402996063232, "learning_rate": 5e-06, "loss": 0.0807, "step": 5879 }, { "epoch": 1.124282982791587, "grad_norm": 3.106449842453003, "learning_rate": 5e-06, "loss": 0.0981, "step": 5880 }, { "epoch": 1.124474187380497, "grad_norm": 2.113834857940674, "learning_rate": 5e-06, "loss": 0.2381, "step": 5881 }, { "epoch": 1.1246653919694072, "grad_norm": 1.5136511325836182, "learning_rate": 5e-06, "loss": 0.1507, "step": 5882 }, { "epoch": 1.1248565965583175, "grad_norm": 2.4438841342926025, "learning_rate": 5e-06, "loss": 0.1835, "step": 5883 }, { "epoch": 1.1250478011472276, "grad_norm": 1.4762717485427856, "learning_rate": 5e-06, "loss": 0.0685, "step": 5884 }, { "epoch": 1.1252390057361377, "grad_norm": 0.3290434181690216, "learning_rate": 5e-06, "loss": 0.013, "step": 5885 }, { "epoch": 1.1254302103250478, "grad_norm": 1.6333457231521606, "learning_rate": 5e-06, "loss": 0.048, "step": 5886 }, { "epoch": 1.1256214149139578, "grad_norm": 2.1950032711029053, "learning_rate": 5e-06, "loss": 0.192, "step": 5887 }, { "epoch": 1.1258126195028682, "grad_norm": 0.8317578434944153, "learning_rate": 5e-06, "loss": 0.0635, "step": 5888 }, { "epoch": 1.1260038240917782, "grad_norm": 1.20002281665802, "learning_rate": 5e-06, "loss": 0.0849, "step": 5889 }, { "epoch": 1.1261950286806883, "grad_norm": 1.6884593963623047, "learning_rate": 5e-06, "loss": 0.0486, "step": 5890 }, { "epoch": 1.1263862332695984, "grad_norm": 2.1091883182525635, "learning_rate": 5e-06, "loss": 0.1591, "step": 5891 }, { "epoch": 1.1265774378585087, "grad_norm": 1.4711389541625977, "learning_rate": 5e-06, "loss": 0.0747, "step": 5892 }, { "epoch": 1.1267686424474188, "grad_norm": 2.467928409576416, "learning_rate": 5e-06, "loss": 0.3312, "step": 5893 }, { "epoch": 1.126959847036329, "grad_norm": 1.4449323415756226, "learning_rate": 5e-06, "loss": 0.1376, "step": 5894 }, { "epoch": 1.127151051625239, "grad_norm": 2.1099445819854736, "learning_rate": 5e-06, "loss": 0.1635, "step": 5895 }, { "epoch": 1.127342256214149, "grad_norm": 2.1566996574401855, "learning_rate": 5e-06, "loss": 0.1596, "step": 5896 }, { "epoch": 1.1275334608030594, "grad_norm": 1.25776207447052, "learning_rate": 5e-06, "loss": 0.0662, "step": 5897 }, { "epoch": 1.1277246653919695, "grad_norm": 1.7440266609191895, "learning_rate": 5e-06, "loss": 0.1401, "step": 5898 }, { "epoch": 1.1279158699808796, "grad_norm": 1.6530354022979736, "learning_rate": 5e-06, "loss": 0.0837, "step": 5899 }, { "epoch": 1.1281070745697896, "grad_norm": 2.073582649230957, "learning_rate": 5e-06, "loss": 0.209, "step": 5900 }, { "epoch": 1.1282982791586997, "grad_norm": 1.2013795375823975, "learning_rate": 5e-06, "loss": 0.1344, "step": 5901 }, { "epoch": 1.12848948374761, "grad_norm": 0.7417505979537964, "learning_rate": 5e-06, "loss": 0.061, "step": 5902 }, { "epoch": 1.1286806883365201, "grad_norm": 0.7994497418403625, "learning_rate": 5e-06, "loss": 0.0461, "step": 5903 }, { "epoch": 1.1288718929254302, "grad_norm": 0.9240248203277588, "learning_rate": 5e-06, "loss": 0.0244, "step": 5904 }, { "epoch": 1.1290630975143403, "grad_norm": 1.514775276184082, "learning_rate": 5e-06, "loss": 0.0728, "step": 5905 }, { "epoch": 1.1292543021032504, "grad_norm": 2.3339710235595703, "learning_rate": 5e-06, "loss": 0.2318, "step": 5906 }, { "epoch": 1.1294455066921607, "grad_norm": 1.739105463027954, "learning_rate": 5e-06, "loss": 0.1055, "step": 5907 }, { "epoch": 1.1296367112810708, "grad_norm": 1.3210444450378418, "learning_rate": 5e-06, "loss": 0.1018, "step": 5908 }, { "epoch": 1.1298279158699809, "grad_norm": 1.95649254322052, "learning_rate": 5e-06, "loss": 0.127, "step": 5909 }, { "epoch": 1.130019120458891, "grad_norm": 1.2586040496826172, "learning_rate": 5e-06, "loss": 0.0602, "step": 5910 }, { "epoch": 1.130210325047801, "grad_norm": 0.6684005260467529, "learning_rate": 5e-06, "loss": 0.0093, "step": 5911 }, { "epoch": 1.1304015296367114, "grad_norm": 1.8613158464431763, "learning_rate": 5e-06, "loss": 0.166, "step": 5912 }, { "epoch": 1.1305927342256215, "grad_norm": 1.138454794883728, "learning_rate": 5e-06, "loss": 0.0658, "step": 5913 }, { "epoch": 1.1307839388145315, "grad_norm": 1.3337939977645874, "learning_rate": 5e-06, "loss": 0.1279, "step": 5914 }, { "epoch": 1.1309751434034416, "grad_norm": 1.7277913093566895, "learning_rate": 5e-06, "loss": 0.0672, "step": 5915 }, { "epoch": 1.1311663479923517, "grad_norm": 1.699109673500061, "learning_rate": 5e-06, "loss": 0.0633, "step": 5916 }, { "epoch": 1.131357552581262, "grad_norm": 2.2800168991088867, "learning_rate": 5e-06, "loss": 0.1858, "step": 5917 }, { "epoch": 1.1315487571701721, "grad_norm": 2.4426426887512207, "learning_rate": 5e-06, "loss": 0.1289, "step": 5918 }, { "epoch": 1.1317399617590822, "grad_norm": 1.9887288808822632, "learning_rate": 5e-06, "loss": 0.2119, "step": 5919 }, { "epoch": 1.1319311663479923, "grad_norm": 2.3811888694763184, "learning_rate": 5e-06, "loss": 0.2509, "step": 5920 }, { "epoch": 1.1321223709369024, "grad_norm": 2.353497266769409, "learning_rate": 5e-06, "loss": 0.2354, "step": 5921 }, { "epoch": 1.1323135755258127, "grad_norm": 0.9172454476356506, "learning_rate": 5e-06, "loss": 0.0662, "step": 5922 }, { "epoch": 1.1325047801147228, "grad_norm": 2.1203384399414062, "learning_rate": 5e-06, "loss": 0.085, "step": 5923 }, { "epoch": 1.1326959847036329, "grad_norm": 2.3185367584228516, "learning_rate": 5e-06, "loss": 0.0868, "step": 5924 }, { "epoch": 1.132887189292543, "grad_norm": 2.7123560905456543, "learning_rate": 5e-06, "loss": 0.3535, "step": 5925 }, { "epoch": 1.133078393881453, "grad_norm": 2.161722183227539, "learning_rate": 5e-06, "loss": 0.2252, "step": 5926 }, { "epoch": 1.1332695984703633, "grad_norm": 1.7932413816452026, "learning_rate": 5e-06, "loss": 0.1072, "step": 5927 }, { "epoch": 1.1334608030592734, "grad_norm": 1.5227441787719727, "learning_rate": 5e-06, "loss": 0.0524, "step": 5928 }, { "epoch": 1.1336520076481835, "grad_norm": 1.1363903284072876, "learning_rate": 5e-06, "loss": 0.065, "step": 5929 }, { "epoch": 1.1338432122370936, "grad_norm": 2.1632423400878906, "learning_rate": 5e-06, "loss": 0.0894, "step": 5930 }, { "epoch": 1.1340344168260037, "grad_norm": 0.6506508588790894, "learning_rate": 5e-06, "loss": 0.0442, "step": 5931 }, { "epoch": 1.134225621414914, "grad_norm": 3.3030409812927246, "learning_rate": 5e-06, "loss": 0.3152, "step": 5932 }, { "epoch": 1.134416826003824, "grad_norm": 1.9436111450195312, "learning_rate": 5e-06, "loss": 0.0738, "step": 5933 }, { "epoch": 1.1346080305927342, "grad_norm": 3.1385626792907715, "learning_rate": 5e-06, "loss": 0.3247, "step": 5934 }, { "epoch": 1.1347992351816443, "grad_norm": 3.9077584743499756, "learning_rate": 5e-06, "loss": 0.0565, "step": 5935 }, { "epoch": 1.1349904397705546, "grad_norm": 1.5972917079925537, "learning_rate": 5e-06, "loss": 0.0843, "step": 5936 }, { "epoch": 1.1351816443594647, "grad_norm": 1.5931931734085083, "learning_rate": 5e-06, "loss": 0.1674, "step": 5937 }, { "epoch": 1.1353728489483748, "grad_norm": 1.6486834287643433, "learning_rate": 5e-06, "loss": 0.1208, "step": 5938 }, { "epoch": 1.1355640535372848, "grad_norm": 2.0563056468963623, "learning_rate": 5e-06, "loss": 0.2244, "step": 5939 }, { "epoch": 1.135755258126195, "grad_norm": 1.0756323337554932, "learning_rate": 5e-06, "loss": 0.0842, "step": 5940 }, { "epoch": 1.1359464627151052, "grad_norm": 1.615752935409546, "learning_rate": 5e-06, "loss": 0.1011, "step": 5941 }, { "epoch": 1.1361376673040153, "grad_norm": 1.9002844095230103, "learning_rate": 5e-06, "loss": 0.0837, "step": 5942 }, { "epoch": 1.1363288718929254, "grad_norm": 1.3055564165115356, "learning_rate": 5e-06, "loss": 0.0799, "step": 5943 }, { "epoch": 1.1365200764818355, "grad_norm": 2.0098092555999756, "learning_rate": 5e-06, "loss": 0.2034, "step": 5944 }, { "epoch": 1.1367112810707458, "grad_norm": 1.3458549976348877, "learning_rate": 5e-06, "loss": 0.157, "step": 5945 }, { "epoch": 1.136902485659656, "grad_norm": 1.6178194284439087, "learning_rate": 5e-06, "loss": 0.1039, "step": 5946 }, { "epoch": 1.137093690248566, "grad_norm": 2.4097132682800293, "learning_rate": 5e-06, "loss": 0.0646, "step": 5947 }, { "epoch": 1.137284894837476, "grad_norm": 1.505638599395752, "learning_rate": 5e-06, "loss": 0.0537, "step": 5948 }, { "epoch": 1.1374760994263862, "grad_norm": 1.2082053422927856, "learning_rate": 5e-06, "loss": 0.047, "step": 5949 }, { "epoch": 1.1376673040152965, "grad_norm": 1.2429389953613281, "learning_rate": 5e-06, "loss": 0.0812, "step": 5950 }, { "epoch": 1.1378585086042066, "grad_norm": 2.585109233856201, "learning_rate": 5e-06, "loss": 0.069, "step": 5951 }, { "epoch": 1.1380497131931167, "grad_norm": 2.0243451595306396, "learning_rate": 5e-06, "loss": 0.1475, "step": 5952 }, { "epoch": 1.1382409177820267, "grad_norm": 1.217773199081421, "learning_rate": 5e-06, "loss": 0.0684, "step": 5953 }, { "epoch": 1.1384321223709368, "grad_norm": 0.9870648384094238, "learning_rate": 5e-06, "loss": 0.0429, "step": 5954 }, { "epoch": 1.1386233269598471, "grad_norm": 1.2159374952316284, "learning_rate": 5e-06, "loss": 0.0458, "step": 5955 }, { "epoch": 1.1388145315487572, "grad_norm": 1.7573827505111694, "learning_rate": 5e-06, "loss": 0.2002, "step": 5956 }, { "epoch": 1.1390057361376673, "grad_norm": 1.0106160640716553, "learning_rate": 5e-06, "loss": 0.1065, "step": 5957 }, { "epoch": 1.1391969407265774, "grad_norm": 1.294363260269165, "learning_rate": 5e-06, "loss": 0.0982, "step": 5958 }, { "epoch": 1.1393881453154875, "grad_norm": 1.239507794380188, "learning_rate": 5e-06, "loss": 0.1389, "step": 5959 }, { "epoch": 1.1395793499043978, "grad_norm": 1.3383917808532715, "learning_rate": 5e-06, "loss": 0.0797, "step": 5960 }, { "epoch": 1.1397705544933079, "grad_norm": 1.403836965560913, "learning_rate": 5e-06, "loss": 0.0702, "step": 5961 }, { "epoch": 1.139961759082218, "grad_norm": 2.712651491165161, "learning_rate": 5e-06, "loss": 0.0612, "step": 5962 }, { "epoch": 1.140152963671128, "grad_norm": 2.9551162719726562, "learning_rate": 5e-06, "loss": 0.2241, "step": 5963 }, { "epoch": 1.1403441682600381, "grad_norm": 0.7462462186813354, "learning_rate": 5e-06, "loss": 0.0658, "step": 5964 }, { "epoch": 1.1405353728489485, "grad_norm": 0.8238846659660339, "learning_rate": 5e-06, "loss": 0.0566, "step": 5965 }, { "epoch": 1.1407265774378585, "grad_norm": 1.3812743425369263, "learning_rate": 5e-06, "loss": 0.0468, "step": 5966 }, { "epoch": 1.1409177820267686, "grad_norm": 1.1476503610610962, "learning_rate": 5e-06, "loss": 0.0558, "step": 5967 }, { "epoch": 1.1411089866156787, "grad_norm": 1.7411783933639526, "learning_rate": 5e-06, "loss": 0.1208, "step": 5968 }, { "epoch": 1.1413001912045888, "grad_norm": 2.1010475158691406, "learning_rate": 5e-06, "loss": 0.176, "step": 5969 }, { "epoch": 1.1414913957934991, "grad_norm": 0.9700941443443298, "learning_rate": 5e-06, "loss": 0.0863, "step": 5970 }, { "epoch": 1.1416826003824092, "grad_norm": 1.139196753501892, "learning_rate": 5e-06, "loss": 0.0632, "step": 5971 }, { "epoch": 1.1418738049713193, "grad_norm": 1.9784907102584839, "learning_rate": 5e-06, "loss": 0.1047, "step": 5972 }, { "epoch": 1.1420650095602294, "grad_norm": 1.6605274677276611, "learning_rate": 5e-06, "loss": 0.1016, "step": 5973 }, { "epoch": 1.1422562141491395, "grad_norm": 2.2134573459625244, "learning_rate": 5e-06, "loss": 0.1446, "step": 5974 }, { "epoch": 1.1424474187380498, "grad_norm": 2.1722609996795654, "learning_rate": 5e-06, "loss": 0.3741, "step": 5975 }, { "epoch": 1.1426386233269599, "grad_norm": 2.2831599712371826, "learning_rate": 5e-06, "loss": 0.3514, "step": 5976 }, { "epoch": 1.14282982791587, "grad_norm": 3.3943796157836914, "learning_rate": 5e-06, "loss": 0.4212, "step": 5977 }, { "epoch": 1.14302103250478, "grad_norm": 1.330841302871704, "learning_rate": 5e-06, "loss": 0.0732, "step": 5978 }, { "epoch": 1.1432122370936901, "grad_norm": 1.9645999670028687, "learning_rate": 5e-06, "loss": 0.0853, "step": 5979 }, { "epoch": 1.1434034416826004, "grad_norm": 1.6076546907424927, "learning_rate": 5e-06, "loss": 0.0658, "step": 5980 }, { "epoch": 1.1435946462715105, "grad_norm": 2.182152509689331, "learning_rate": 5e-06, "loss": 0.1959, "step": 5981 }, { "epoch": 1.1437858508604206, "grad_norm": 2.1681408882141113, "learning_rate": 5e-06, "loss": 0.2228, "step": 5982 }, { "epoch": 1.1439770554493307, "grad_norm": 2.4408507347106934, "learning_rate": 5e-06, "loss": 0.2301, "step": 5983 }, { "epoch": 1.144168260038241, "grad_norm": 1.2057291269302368, "learning_rate": 5e-06, "loss": 0.107, "step": 5984 }, { "epoch": 1.144359464627151, "grad_norm": 0.7931608557701111, "learning_rate": 5e-06, "loss": 0.0782, "step": 5985 }, { "epoch": 1.1445506692160612, "grad_norm": 1.4083235263824463, "learning_rate": 5e-06, "loss": 0.0425, "step": 5986 }, { "epoch": 1.1447418738049713, "grad_norm": 3.754728317260742, "learning_rate": 5e-06, "loss": 0.32, "step": 5987 }, { "epoch": 1.1449330783938814, "grad_norm": 2.447702407836914, "learning_rate": 5e-06, "loss": 0.2783, "step": 5988 }, { "epoch": 1.1451242829827917, "grad_norm": 1.0956696271896362, "learning_rate": 5e-06, "loss": 0.1021, "step": 5989 }, { "epoch": 1.1453154875717018, "grad_norm": 0.831962525844574, "learning_rate": 5e-06, "loss": 0.0399, "step": 5990 }, { "epoch": 1.1455066921606119, "grad_norm": 1.3386517763137817, "learning_rate": 5e-06, "loss": 0.0901, "step": 5991 }, { "epoch": 1.145697896749522, "grad_norm": 2.1000680923461914, "learning_rate": 5e-06, "loss": 0.0823, "step": 5992 }, { "epoch": 1.145889101338432, "grad_norm": 1.4930262565612793, "learning_rate": 5e-06, "loss": 0.0767, "step": 5993 }, { "epoch": 1.1460803059273423, "grad_norm": 1.3432178497314453, "learning_rate": 5e-06, "loss": 0.1057, "step": 5994 }, { "epoch": 1.1462715105162524, "grad_norm": 1.8772666454315186, "learning_rate": 5e-06, "loss": 0.14, "step": 5995 }, { "epoch": 1.1464627151051625, "grad_norm": 0.8968759179115295, "learning_rate": 5e-06, "loss": 0.0477, "step": 5996 }, { "epoch": 1.1466539196940726, "grad_norm": 1.3898628950119019, "learning_rate": 5e-06, "loss": 0.067, "step": 5997 }, { "epoch": 1.146845124282983, "grad_norm": 0.6423681974411011, "learning_rate": 5e-06, "loss": 0.0118, "step": 5998 }, { "epoch": 1.147036328871893, "grad_norm": 1.6495819091796875, "learning_rate": 5e-06, "loss": 0.1907, "step": 5999 }, { "epoch": 1.147227533460803, "grad_norm": 1.3852123022079468, "learning_rate": 5e-06, "loss": 0.0823, "step": 6000 }, { "epoch": 1.147227533460803, "eval_runtime": 801.3896, "eval_samples_per_second": 1.914, "eval_steps_per_second": 0.24, "step": 6000 }, { "epoch": 1.1474187380497132, "grad_norm": 1.957743763923645, "learning_rate": 5e-06, "loss": 0.1578, "step": 6001 }, { "epoch": 1.1476099426386233, "grad_norm": 2.059922218322754, "learning_rate": 5e-06, "loss": 0.15, "step": 6002 }, { "epoch": 1.1478011472275336, "grad_norm": 1.291016936302185, "learning_rate": 5e-06, "loss": 0.0634, "step": 6003 }, { "epoch": 1.1479923518164437, "grad_norm": 1.9278820753097534, "learning_rate": 5e-06, "loss": 0.2147, "step": 6004 }, { "epoch": 1.1481835564053537, "grad_norm": 1.496976613998413, "learning_rate": 5e-06, "loss": 0.0778, "step": 6005 }, { "epoch": 1.1483747609942638, "grad_norm": 1.579020619392395, "learning_rate": 5e-06, "loss": 0.0946, "step": 6006 }, { "epoch": 1.148565965583174, "grad_norm": 1.9187579154968262, "learning_rate": 5e-06, "loss": 0.1447, "step": 6007 }, { "epoch": 1.1487571701720842, "grad_norm": 1.770951271057129, "learning_rate": 5e-06, "loss": 0.1904, "step": 6008 }, { "epoch": 1.1489483747609943, "grad_norm": 1.5869866609573364, "learning_rate": 5e-06, "loss": 0.1424, "step": 6009 }, { "epoch": 1.1491395793499044, "grad_norm": 2.293226718902588, "learning_rate": 5e-06, "loss": 0.0983, "step": 6010 }, { "epoch": 1.1493307839388145, "grad_norm": 1.4532525539398193, "learning_rate": 5e-06, "loss": 0.0784, "step": 6011 }, { "epoch": 1.1495219885277246, "grad_norm": 1.6799381971359253, "learning_rate": 5e-06, "loss": 0.0964, "step": 6012 }, { "epoch": 1.149713193116635, "grad_norm": 1.4979642629623413, "learning_rate": 5e-06, "loss": 0.0636, "step": 6013 }, { "epoch": 1.149904397705545, "grad_norm": 2.968336582183838, "learning_rate": 5e-06, "loss": 0.3399, "step": 6014 }, { "epoch": 1.150095602294455, "grad_norm": 2.201011896133423, "learning_rate": 5e-06, "loss": 0.1948, "step": 6015 }, { "epoch": 1.1502868068833652, "grad_norm": 1.8662936687469482, "learning_rate": 5e-06, "loss": 0.0909, "step": 6016 }, { "epoch": 1.1504780114722752, "grad_norm": 1.6262683868408203, "learning_rate": 5e-06, "loss": 0.153, "step": 6017 }, { "epoch": 1.1506692160611856, "grad_norm": 1.8292471170425415, "learning_rate": 5e-06, "loss": 0.1355, "step": 6018 }, { "epoch": 1.1508604206500956, "grad_norm": 1.0284324884414673, "learning_rate": 5e-06, "loss": 0.0604, "step": 6019 }, { "epoch": 1.1510516252390057, "grad_norm": 2.682225465774536, "learning_rate": 5e-06, "loss": 0.2026, "step": 6020 }, { "epoch": 1.1512428298279158, "grad_norm": 0.996833086013794, "learning_rate": 5e-06, "loss": 0.06, "step": 6021 }, { "epoch": 1.151434034416826, "grad_norm": 1.5025444030761719, "learning_rate": 5e-06, "loss": 0.0522, "step": 6022 }, { "epoch": 1.1516252390057362, "grad_norm": 1.4137760400772095, "learning_rate": 5e-06, "loss": 0.079, "step": 6023 }, { "epoch": 1.1518164435946463, "grad_norm": 2.3708930015563965, "learning_rate": 5e-06, "loss": 0.1059, "step": 6024 }, { "epoch": 1.1520076481835564, "grad_norm": 1.4922761917114258, "learning_rate": 5e-06, "loss": 0.1294, "step": 6025 }, { "epoch": 1.1521988527724665, "grad_norm": 0.9097602367401123, "learning_rate": 5e-06, "loss": 0.0681, "step": 6026 }, { "epoch": 1.1523900573613766, "grad_norm": 1.8793368339538574, "learning_rate": 5e-06, "loss": 0.1962, "step": 6027 }, { "epoch": 1.1525812619502869, "grad_norm": 3.2023956775665283, "learning_rate": 5e-06, "loss": 0.0744, "step": 6028 }, { "epoch": 1.152772466539197, "grad_norm": 1.3651431798934937, "learning_rate": 5e-06, "loss": 0.0934, "step": 6029 }, { "epoch": 1.152963671128107, "grad_norm": 2.276491641998291, "learning_rate": 5e-06, "loss": 0.1804, "step": 6030 }, { "epoch": 1.1531548757170171, "grad_norm": 1.6140533685684204, "learning_rate": 5e-06, "loss": 0.1421, "step": 6031 }, { "epoch": 1.1533460803059272, "grad_norm": 1.3192780017852783, "learning_rate": 5e-06, "loss": 0.0654, "step": 6032 }, { "epoch": 1.1535372848948375, "grad_norm": 2.071134567260742, "learning_rate": 5e-06, "loss": 0.2245, "step": 6033 }, { "epoch": 1.1537284894837476, "grad_norm": 1.0834347009658813, "learning_rate": 5e-06, "loss": 0.0735, "step": 6034 }, { "epoch": 1.1539196940726577, "grad_norm": 5.82857084274292, "learning_rate": 5e-06, "loss": 0.1003, "step": 6035 }, { "epoch": 1.1541108986615678, "grad_norm": 1.3361073732376099, "learning_rate": 5e-06, "loss": 0.0476, "step": 6036 }, { "epoch": 1.154302103250478, "grad_norm": 2.2594399452209473, "learning_rate": 5e-06, "loss": 0.1533, "step": 6037 }, { "epoch": 1.1544933078393882, "grad_norm": 1.668912649154663, "learning_rate": 5e-06, "loss": 0.1638, "step": 6038 }, { "epoch": 1.1546845124282983, "grad_norm": 1.6249210834503174, "learning_rate": 5e-06, "loss": 0.0849, "step": 6039 }, { "epoch": 1.1548757170172084, "grad_norm": 1.301710605621338, "learning_rate": 5e-06, "loss": 0.081, "step": 6040 }, { "epoch": 1.1550669216061185, "grad_norm": 0.9434700608253479, "learning_rate": 5e-06, "loss": 0.0392, "step": 6041 }, { "epoch": 1.1552581261950288, "grad_norm": 0.9116209745407104, "learning_rate": 5e-06, "loss": 0.0235, "step": 6042 }, { "epoch": 1.1554493307839389, "grad_norm": 0.9934965968132019, "learning_rate": 5e-06, "loss": 0.0644, "step": 6043 }, { "epoch": 1.155640535372849, "grad_norm": 1.8826981782913208, "learning_rate": 5e-06, "loss": 0.2933, "step": 6044 }, { "epoch": 1.155831739961759, "grad_norm": 1.0682721138000488, "learning_rate": 5e-06, "loss": 0.0483, "step": 6045 }, { "epoch": 1.1560229445506693, "grad_norm": 1.3939552307128906, "learning_rate": 5e-06, "loss": 0.0893, "step": 6046 }, { "epoch": 1.1562141491395794, "grad_norm": 0.7219886183738708, "learning_rate": 5e-06, "loss": 0.0433, "step": 6047 }, { "epoch": 1.1564053537284895, "grad_norm": 0.8082029819488525, "learning_rate": 5e-06, "loss": 0.0271, "step": 6048 }, { "epoch": 1.1565965583173996, "grad_norm": 0.8083641529083252, "learning_rate": 5e-06, "loss": 0.0503, "step": 6049 }, { "epoch": 1.1567877629063097, "grad_norm": 1.0870314836502075, "learning_rate": 5e-06, "loss": 0.0769, "step": 6050 }, { "epoch": 1.15697896749522, "grad_norm": 0.9009522795677185, "learning_rate": 5e-06, "loss": 0.0477, "step": 6051 }, { "epoch": 1.15717017208413, "grad_norm": 0.9931294322013855, "learning_rate": 5e-06, "loss": 0.0635, "step": 6052 }, { "epoch": 1.1573613766730402, "grad_norm": 1.5237797498703003, "learning_rate": 5e-06, "loss": 0.0961, "step": 6053 }, { "epoch": 1.1575525812619503, "grad_norm": 1.4311531782150269, "learning_rate": 5e-06, "loss": 0.0464, "step": 6054 }, { "epoch": 1.1577437858508604, "grad_norm": 2.5143849849700928, "learning_rate": 5e-06, "loss": 0.2268, "step": 6055 }, { "epoch": 1.1579349904397707, "grad_norm": 1.216488003730774, "learning_rate": 5e-06, "loss": 0.0839, "step": 6056 }, { "epoch": 1.1581261950286807, "grad_norm": 2.2803962230682373, "learning_rate": 5e-06, "loss": 0.3013, "step": 6057 }, { "epoch": 1.1583173996175908, "grad_norm": 0.7981217503547668, "learning_rate": 5e-06, "loss": 0.0436, "step": 6058 }, { "epoch": 1.158508604206501, "grad_norm": 1.780248761177063, "learning_rate": 5e-06, "loss": 0.1795, "step": 6059 }, { "epoch": 1.158699808795411, "grad_norm": 5.465449810028076, "learning_rate": 5e-06, "loss": 0.1149, "step": 6060 }, { "epoch": 1.1588910133843213, "grad_norm": 1.3702939748764038, "learning_rate": 5e-06, "loss": 0.0291, "step": 6061 }, { "epoch": 1.1590822179732314, "grad_norm": 2.034011125564575, "learning_rate": 5e-06, "loss": 0.2144, "step": 6062 }, { "epoch": 1.1592734225621415, "grad_norm": 1.4905356168746948, "learning_rate": 5e-06, "loss": 0.1838, "step": 6063 }, { "epoch": 1.1594646271510516, "grad_norm": 2.3215534687042236, "learning_rate": 5e-06, "loss": 0.1431, "step": 6064 }, { "epoch": 1.1596558317399617, "grad_norm": 0.8124056458473206, "learning_rate": 5e-06, "loss": 0.0456, "step": 6065 }, { "epoch": 1.159847036328872, "grad_norm": 1.8607640266418457, "learning_rate": 5e-06, "loss": 0.131, "step": 6066 }, { "epoch": 1.160038240917782, "grad_norm": 1.4982339143753052, "learning_rate": 5e-06, "loss": 0.0986, "step": 6067 }, { "epoch": 1.1602294455066922, "grad_norm": 2.7232859134674072, "learning_rate": 5e-06, "loss": 0.2743, "step": 6068 }, { "epoch": 1.1604206500956022, "grad_norm": 2.740915536880493, "learning_rate": 5e-06, "loss": 0.4415, "step": 6069 }, { "epoch": 1.1606118546845123, "grad_norm": 1.1084314584732056, "learning_rate": 5e-06, "loss": 0.0626, "step": 6070 }, { "epoch": 1.1608030592734226, "grad_norm": 3.3199756145477295, "learning_rate": 5e-06, "loss": 0.2035, "step": 6071 }, { "epoch": 1.1609942638623327, "grad_norm": 0.5316591262817383, "learning_rate": 5e-06, "loss": 0.0395, "step": 6072 }, { "epoch": 1.1611854684512428, "grad_norm": 0.8929632306098938, "learning_rate": 5e-06, "loss": 0.0379, "step": 6073 }, { "epoch": 1.161376673040153, "grad_norm": 1.1094083786010742, "learning_rate": 5e-06, "loss": 0.0883, "step": 6074 }, { "epoch": 1.161567877629063, "grad_norm": 3.199720621109009, "learning_rate": 5e-06, "loss": 0.4122, "step": 6075 }, { "epoch": 1.1617590822179733, "grad_norm": 2.4385669231414795, "learning_rate": 5e-06, "loss": 0.3021, "step": 6076 }, { "epoch": 1.1619502868068834, "grad_norm": 1.3131438493728638, "learning_rate": 5e-06, "loss": 0.0734, "step": 6077 }, { "epoch": 1.1621414913957935, "grad_norm": 2.2846639156341553, "learning_rate": 5e-06, "loss": 0.162, "step": 6078 }, { "epoch": 1.1623326959847036, "grad_norm": 1.474809169769287, "learning_rate": 5e-06, "loss": 0.0868, "step": 6079 }, { "epoch": 1.1625239005736137, "grad_norm": 1.7511301040649414, "learning_rate": 5e-06, "loss": 0.0742, "step": 6080 }, { "epoch": 1.162715105162524, "grad_norm": 1.1560462713241577, "learning_rate": 5e-06, "loss": 0.1053, "step": 6081 }, { "epoch": 1.162906309751434, "grad_norm": 1.9263542890548706, "learning_rate": 5e-06, "loss": 0.151, "step": 6082 }, { "epoch": 1.1630975143403441, "grad_norm": 1.997341275215149, "learning_rate": 5e-06, "loss": 0.0731, "step": 6083 }, { "epoch": 1.1632887189292542, "grad_norm": 2.5195326805114746, "learning_rate": 5e-06, "loss": 0.1253, "step": 6084 }, { "epoch": 1.1634799235181643, "grad_norm": 1.3920161724090576, "learning_rate": 5e-06, "loss": 0.0689, "step": 6085 }, { "epoch": 1.1636711281070746, "grad_norm": 1.848002552986145, "learning_rate": 5e-06, "loss": 0.0813, "step": 6086 }, { "epoch": 1.1638623326959847, "grad_norm": 1.6859074831008911, "learning_rate": 5e-06, "loss": 0.2863, "step": 6087 }, { "epoch": 1.1640535372848948, "grad_norm": 2.795480251312256, "learning_rate": 5e-06, "loss": 0.3935, "step": 6088 }, { "epoch": 1.164244741873805, "grad_norm": 2.764321804046631, "learning_rate": 5e-06, "loss": 0.268, "step": 6089 }, { "epoch": 1.1644359464627152, "grad_norm": 0.9207529425621033, "learning_rate": 5e-06, "loss": 0.0395, "step": 6090 }, { "epoch": 1.1646271510516253, "grad_norm": 2.6257948875427246, "learning_rate": 5e-06, "loss": 0.2537, "step": 6091 }, { "epoch": 1.1648183556405354, "grad_norm": 1.9078866243362427, "learning_rate": 5e-06, "loss": 0.1349, "step": 6092 }, { "epoch": 1.1650095602294455, "grad_norm": 6.173279285430908, "learning_rate": 5e-06, "loss": 0.1253, "step": 6093 }, { "epoch": 1.1652007648183555, "grad_norm": 2.563831090927124, "learning_rate": 5e-06, "loss": 0.1948, "step": 6094 }, { "epoch": 1.1653919694072659, "grad_norm": 1.040425181388855, "learning_rate": 5e-06, "loss": 0.1154, "step": 6095 }, { "epoch": 1.165583173996176, "grad_norm": 0.45935672521591187, "learning_rate": 5e-06, "loss": 0.0246, "step": 6096 }, { "epoch": 1.165774378585086, "grad_norm": 1.1373692750930786, "learning_rate": 5e-06, "loss": 0.0963, "step": 6097 }, { "epoch": 1.1659655831739961, "grad_norm": 2.903848886489868, "learning_rate": 5e-06, "loss": 0.1996, "step": 6098 }, { "epoch": 1.1661567877629064, "grad_norm": 1.5372850894927979, "learning_rate": 5e-06, "loss": 0.075, "step": 6099 }, { "epoch": 1.1663479923518165, "grad_norm": 2.0959360599517822, "learning_rate": 5e-06, "loss": 0.2309, "step": 6100 }, { "epoch": 1.1665391969407266, "grad_norm": 3.3226559162139893, "learning_rate": 5e-06, "loss": 0.233, "step": 6101 }, { "epoch": 1.1667304015296367, "grad_norm": 1.1195683479309082, "learning_rate": 5e-06, "loss": 0.0845, "step": 6102 }, { "epoch": 1.1669216061185468, "grad_norm": 1.387656569480896, "learning_rate": 5e-06, "loss": 0.0717, "step": 6103 }, { "epoch": 1.167112810707457, "grad_norm": 1.3395695686340332, "learning_rate": 5e-06, "loss": 0.0479, "step": 6104 }, { "epoch": 1.1673040152963672, "grad_norm": 1.6233667135238647, "learning_rate": 5e-06, "loss": 0.0703, "step": 6105 }, { "epoch": 1.1674952198852773, "grad_norm": 2.596073627471924, "learning_rate": 5e-06, "loss": 0.1554, "step": 6106 }, { "epoch": 1.1676864244741874, "grad_norm": 1.9220967292785645, "learning_rate": 5e-06, "loss": 0.2205, "step": 6107 }, { "epoch": 1.1678776290630974, "grad_norm": 1.6835957765579224, "learning_rate": 5e-06, "loss": 0.1507, "step": 6108 }, { "epoch": 1.1680688336520078, "grad_norm": 1.530069351196289, "learning_rate": 5e-06, "loss": 0.0651, "step": 6109 }, { "epoch": 1.1682600382409178, "grad_norm": 0.737148106098175, "learning_rate": 5e-06, "loss": 0.0299, "step": 6110 }, { "epoch": 1.168451242829828, "grad_norm": 1.4814426898956299, "learning_rate": 5e-06, "loss": 0.0882, "step": 6111 }, { "epoch": 1.168642447418738, "grad_norm": 0.973702609539032, "learning_rate": 5e-06, "loss": 0.0567, "step": 6112 }, { "epoch": 1.168833652007648, "grad_norm": 2.017190933227539, "learning_rate": 5e-06, "loss": 0.2373, "step": 6113 }, { "epoch": 1.1690248565965584, "grad_norm": 1.150895595550537, "learning_rate": 5e-06, "loss": 0.0863, "step": 6114 }, { "epoch": 1.1692160611854685, "grad_norm": 2.035149335861206, "learning_rate": 5e-06, "loss": 0.0599, "step": 6115 }, { "epoch": 1.1694072657743786, "grad_norm": 1.2829310894012451, "learning_rate": 5e-06, "loss": 0.0662, "step": 6116 }, { "epoch": 1.1695984703632887, "grad_norm": 0.9252585172653198, "learning_rate": 5e-06, "loss": 0.0334, "step": 6117 }, { "epoch": 1.1697896749521988, "grad_norm": 2.3354899883270264, "learning_rate": 5e-06, "loss": 0.1792, "step": 6118 }, { "epoch": 1.169980879541109, "grad_norm": 1.619714617729187, "learning_rate": 5e-06, "loss": 0.2189, "step": 6119 }, { "epoch": 1.1701720841300192, "grad_norm": 1.9781533479690552, "learning_rate": 5e-06, "loss": 0.179, "step": 6120 }, { "epoch": 1.1703632887189293, "grad_norm": 1.8020018339157104, "learning_rate": 5e-06, "loss": 0.2103, "step": 6121 }, { "epoch": 1.1705544933078393, "grad_norm": 1.7455902099609375, "learning_rate": 5e-06, "loss": 0.1837, "step": 6122 }, { "epoch": 1.1707456978967494, "grad_norm": 1.8223216533660889, "learning_rate": 5e-06, "loss": 0.0997, "step": 6123 }, { "epoch": 1.1709369024856597, "grad_norm": 1.6536144018173218, "learning_rate": 5e-06, "loss": 0.0922, "step": 6124 }, { "epoch": 1.1711281070745698, "grad_norm": 2.5655362606048584, "learning_rate": 5e-06, "loss": 0.3305, "step": 6125 }, { "epoch": 1.17131931166348, "grad_norm": 1.325499415397644, "learning_rate": 5e-06, "loss": 0.105, "step": 6126 }, { "epoch": 1.17151051625239, "grad_norm": 1.0477885007858276, "learning_rate": 5e-06, "loss": 0.082, "step": 6127 }, { "epoch": 1.1717017208413, "grad_norm": 1.0732998847961426, "learning_rate": 5e-06, "loss": 0.0464, "step": 6128 }, { "epoch": 1.1718929254302104, "grad_norm": 1.3447692394256592, "learning_rate": 5e-06, "loss": 0.0626, "step": 6129 }, { "epoch": 1.1720841300191205, "grad_norm": 2.3556089401245117, "learning_rate": 5e-06, "loss": 0.0672, "step": 6130 }, { "epoch": 1.1722753346080306, "grad_norm": 2.144721031188965, "learning_rate": 5e-06, "loss": 0.2276, "step": 6131 }, { "epoch": 1.1724665391969407, "grad_norm": 1.2007111310958862, "learning_rate": 5e-06, "loss": 0.0624, "step": 6132 }, { "epoch": 1.1726577437858507, "grad_norm": 1.4135570526123047, "learning_rate": 5e-06, "loss": 0.082, "step": 6133 }, { "epoch": 1.172848948374761, "grad_norm": 1.207506537437439, "learning_rate": 5e-06, "loss": 0.0531, "step": 6134 }, { "epoch": 1.1730401529636711, "grad_norm": 0.7473751306533813, "learning_rate": 5e-06, "loss": 0.0336, "step": 6135 }, { "epoch": 1.1732313575525812, "grad_norm": 1.5870224237442017, "learning_rate": 5e-06, "loss": 0.0666, "step": 6136 }, { "epoch": 1.1734225621414913, "grad_norm": 2.2782132625579834, "learning_rate": 5e-06, "loss": 0.2869, "step": 6137 }, { "epoch": 1.1736137667304014, "grad_norm": 2.2009999752044678, "learning_rate": 5e-06, "loss": 0.1322, "step": 6138 }, { "epoch": 1.1738049713193117, "grad_norm": 2.3219454288482666, "learning_rate": 5e-06, "loss": 0.2503, "step": 6139 }, { "epoch": 1.1739961759082218, "grad_norm": 1.4381366968154907, "learning_rate": 5e-06, "loss": 0.0768, "step": 6140 }, { "epoch": 1.174187380497132, "grad_norm": 2.0978338718414307, "learning_rate": 5e-06, "loss": 0.1613, "step": 6141 }, { "epoch": 1.174378585086042, "grad_norm": 2.1573426723480225, "learning_rate": 5e-06, "loss": 0.083, "step": 6142 }, { "epoch": 1.1745697896749523, "grad_norm": 1.2375134229660034, "learning_rate": 5e-06, "loss": 0.0635, "step": 6143 }, { "epoch": 1.1747609942638624, "grad_norm": 1.8515455722808838, "learning_rate": 5e-06, "loss": 0.2427, "step": 6144 }, { "epoch": 1.1749521988527725, "grad_norm": 1.246639609336853, "learning_rate": 5e-06, "loss": 0.0717, "step": 6145 }, { "epoch": 1.1751434034416826, "grad_norm": 1.7270658016204834, "learning_rate": 5e-06, "loss": 0.0917, "step": 6146 }, { "epoch": 1.1753346080305926, "grad_norm": 1.3657946586608887, "learning_rate": 5e-06, "loss": 0.1012, "step": 6147 }, { "epoch": 1.175525812619503, "grad_norm": 1.2972500324249268, "learning_rate": 5e-06, "loss": 0.0726, "step": 6148 }, { "epoch": 1.175717017208413, "grad_norm": 2.3606135845184326, "learning_rate": 5e-06, "loss": 0.1246, "step": 6149 }, { "epoch": 1.1759082217973231, "grad_norm": 1.981109619140625, "learning_rate": 5e-06, "loss": 0.2327, "step": 6150 }, { "epoch": 1.1760994263862332, "grad_norm": 2.298509120941162, "learning_rate": 5e-06, "loss": 0.172, "step": 6151 }, { "epoch": 1.1762906309751435, "grad_norm": 2.005854606628418, "learning_rate": 5e-06, "loss": 0.1567, "step": 6152 }, { "epoch": 1.1764818355640536, "grad_norm": 0.9500100612640381, "learning_rate": 5e-06, "loss": 0.0708, "step": 6153 }, { "epoch": 1.1766730401529637, "grad_norm": 1.3746157884597778, "learning_rate": 5e-06, "loss": 0.0448, "step": 6154 }, { "epoch": 1.1768642447418738, "grad_norm": 0.5043768882751465, "learning_rate": 5e-06, "loss": 0.0102, "step": 6155 }, { "epoch": 1.1770554493307839, "grad_norm": 1.2846918106079102, "learning_rate": 5e-06, "loss": 0.1466, "step": 6156 }, { "epoch": 1.1772466539196942, "grad_norm": 2.024791955947876, "learning_rate": 5e-06, "loss": 0.2269, "step": 6157 }, { "epoch": 1.1774378585086043, "grad_norm": 0.7490910887718201, "learning_rate": 5e-06, "loss": 0.0589, "step": 6158 }, { "epoch": 1.1776290630975144, "grad_norm": 1.5239231586456299, "learning_rate": 5e-06, "loss": 0.1146, "step": 6159 }, { "epoch": 1.1778202676864244, "grad_norm": 1.2455819845199585, "learning_rate": 5e-06, "loss": 0.0616, "step": 6160 }, { "epoch": 1.1780114722753345, "grad_norm": 1.4771140813827515, "learning_rate": 5e-06, "loss": 0.0756, "step": 6161 }, { "epoch": 1.1782026768642448, "grad_norm": 2.541170120239258, "learning_rate": 5e-06, "loss": 0.1628, "step": 6162 }, { "epoch": 1.178393881453155, "grad_norm": 2.1921584606170654, "learning_rate": 5e-06, "loss": 0.2259, "step": 6163 }, { "epoch": 1.178585086042065, "grad_norm": 1.606255292892456, "learning_rate": 5e-06, "loss": 0.0649, "step": 6164 }, { "epoch": 1.178776290630975, "grad_norm": 3.125506639480591, "learning_rate": 5e-06, "loss": 0.2574, "step": 6165 }, { "epoch": 1.1789674952198852, "grad_norm": 1.2741800546646118, "learning_rate": 5e-06, "loss": 0.0475, "step": 6166 }, { "epoch": 1.1791586998087955, "grad_norm": 2.3493738174438477, "learning_rate": 5e-06, "loss": 0.059, "step": 6167 }, { "epoch": 1.1793499043977056, "grad_norm": 1.9525309801101685, "learning_rate": 5e-06, "loss": 0.1746, "step": 6168 }, { "epoch": 1.1795411089866157, "grad_norm": 2.7668604850769043, "learning_rate": 5e-06, "loss": 0.309, "step": 6169 }, { "epoch": 1.1797323135755258, "grad_norm": 2.444951057434082, "learning_rate": 5e-06, "loss": 0.2414, "step": 6170 }, { "epoch": 1.1799235181644359, "grad_norm": 1.980841040611267, "learning_rate": 5e-06, "loss": 0.0647, "step": 6171 }, { "epoch": 1.1801147227533462, "grad_norm": 1.0069383382797241, "learning_rate": 5e-06, "loss": 0.0172, "step": 6172 }, { "epoch": 1.1803059273422563, "grad_norm": 1.7199982404708862, "learning_rate": 5e-06, "loss": 0.1625, "step": 6173 }, { "epoch": 1.1804971319311663, "grad_norm": 1.0968600511550903, "learning_rate": 5e-06, "loss": 0.0659, "step": 6174 }, { "epoch": 1.1806883365200764, "grad_norm": 2.4999165534973145, "learning_rate": 5e-06, "loss": 0.3111, "step": 6175 }, { "epoch": 1.1808795411089865, "grad_norm": 1.1034135818481445, "learning_rate": 5e-06, "loss": 0.0808, "step": 6176 }, { "epoch": 1.1810707456978968, "grad_norm": 2.0255119800567627, "learning_rate": 5e-06, "loss": 0.1643, "step": 6177 }, { "epoch": 1.181261950286807, "grad_norm": 1.7352789640426636, "learning_rate": 5e-06, "loss": 0.1539, "step": 6178 }, { "epoch": 1.181453154875717, "grad_norm": 0.796521782875061, "learning_rate": 5e-06, "loss": 0.0369, "step": 6179 }, { "epoch": 1.181644359464627, "grad_norm": 2.0245773792266846, "learning_rate": 5e-06, "loss": 0.0833, "step": 6180 }, { "epoch": 1.1818355640535372, "grad_norm": 1.772589921951294, "learning_rate": 5e-06, "loss": 0.2376, "step": 6181 }, { "epoch": 1.1820267686424475, "grad_norm": 2.6077778339385986, "learning_rate": 5e-06, "loss": 0.3178, "step": 6182 }, { "epoch": 1.1822179732313576, "grad_norm": 1.4697093963623047, "learning_rate": 5e-06, "loss": 0.0894, "step": 6183 }, { "epoch": 1.1824091778202677, "grad_norm": 1.1189794540405273, "learning_rate": 5e-06, "loss": 0.0712, "step": 6184 }, { "epoch": 1.1826003824091778, "grad_norm": 1.7616618871688843, "learning_rate": 5e-06, "loss": 0.1014, "step": 6185 }, { "epoch": 1.1827915869980878, "grad_norm": 1.4431734085083008, "learning_rate": 5e-06, "loss": 0.0509, "step": 6186 }, { "epoch": 1.1829827915869982, "grad_norm": 1.4921287298202515, "learning_rate": 5e-06, "loss": 0.0924, "step": 6187 }, { "epoch": 1.1831739961759082, "grad_norm": 3.1014909744262695, "learning_rate": 5e-06, "loss": 0.3719, "step": 6188 }, { "epoch": 1.1833652007648183, "grad_norm": 1.158164620399475, "learning_rate": 5e-06, "loss": 0.0296, "step": 6189 }, { "epoch": 1.1835564053537284, "grad_norm": 1.8710097074508667, "learning_rate": 5e-06, "loss": 0.1908, "step": 6190 }, { "epoch": 1.1837476099426385, "grad_norm": 1.8682113885879517, "learning_rate": 5e-06, "loss": 0.1711, "step": 6191 }, { "epoch": 1.1839388145315488, "grad_norm": 0.9372738003730774, "learning_rate": 5e-06, "loss": 0.0528, "step": 6192 }, { "epoch": 1.184130019120459, "grad_norm": 1.8785148859024048, "learning_rate": 5e-06, "loss": 0.3054, "step": 6193 }, { "epoch": 1.184321223709369, "grad_norm": 1.2328784465789795, "learning_rate": 5e-06, "loss": 0.082, "step": 6194 }, { "epoch": 1.184512428298279, "grad_norm": 1.1626081466674805, "learning_rate": 5e-06, "loss": 0.0991, "step": 6195 }, { "epoch": 1.1847036328871894, "grad_norm": 2.7644124031066895, "learning_rate": 5e-06, "loss": 0.2075, "step": 6196 }, { "epoch": 1.1848948374760995, "grad_norm": 1.14145827293396, "learning_rate": 5e-06, "loss": 0.0526, "step": 6197 }, { "epoch": 1.1850860420650096, "grad_norm": 1.1169764995574951, "learning_rate": 5e-06, "loss": 0.0239, "step": 6198 }, { "epoch": 1.1852772466539196, "grad_norm": 2.4677770137786865, "learning_rate": 5e-06, "loss": 0.0712, "step": 6199 }, { "epoch": 1.1854684512428297, "grad_norm": 2.896726369857788, "learning_rate": 5e-06, "loss": 0.1501, "step": 6200 }, { "epoch": 1.18565965583174, "grad_norm": 1.0080317258834839, "learning_rate": 5e-06, "loss": 0.0729, "step": 6201 }, { "epoch": 1.1858508604206501, "grad_norm": 1.406341791152954, "learning_rate": 5e-06, "loss": 0.1634, "step": 6202 }, { "epoch": 1.1860420650095602, "grad_norm": 2.233196258544922, "learning_rate": 5e-06, "loss": 0.168, "step": 6203 }, { "epoch": 1.1862332695984703, "grad_norm": 0.7236731052398682, "learning_rate": 5e-06, "loss": 0.0436, "step": 6204 }, { "epoch": 1.1864244741873806, "grad_norm": 1.104647159576416, "learning_rate": 5e-06, "loss": 0.0803, "step": 6205 }, { "epoch": 1.1866156787762907, "grad_norm": 1.6566107273101807, "learning_rate": 5e-06, "loss": 0.1847, "step": 6206 }, { "epoch": 1.1868068833652008, "grad_norm": 2.225419282913208, "learning_rate": 5e-06, "loss": 0.1847, "step": 6207 }, { "epoch": 1.1869980879541109, "grad_norm": 1.3101707696914673, "learning_rate": 5e-06, "loss": 0.0773, "step": 6208 }, { "epoch": 1.187189292543021, "grad_norm": 1.1030720472335815, "learning_rate": 5e-06, "loss": 0.0684, "step": 6209 }, { "epoch": 1.1873804971319313, "grad_norm": 1.0501564741134644, "learning_rate": 5e-06, "loss": 0.0591, "step": 6210 }, { "epoch": 1.1875717017208414, "grad_norm": 1.0577927827835083, "learning_rate": 5e-06, "loss": 0.0496, "step": 6211 }, { "epoch": 1.1877629063097515, "grad_norm": 2.4910309314727783, "learning_rate": 5e-06, "loss": 0.3094, "step": 6212 }, { "epoch": 1.1879541108986615, "grad_norm": 1.5930724143981934, "learning_rate": 5e-06, "loss": 0.1574, "step": 6213 }, { "epoch": 1.1881453154875716, "grad_norm": 1.2082481384277344, "learning_rate": 5e-06, "loss": 0.0612, "step": 6214 }, { "epoch": 1.188336520076482, "grad_norm": 1.5550799369812012, "learning_rate": 5e-06, "loss": 0.0683, "step": 6215 }, { "epoch": 1.188527724665392, "grad_norm": 1.548312783241272, "learning_rate": 5e-06, "loss": 0.0548, "step": 6216 }, { "epoch": 1.1887189292543021, "grad_norm": 1.306344747543335, "learning_rate": 5e-06, "loss": 0.0559, "step": 6217 }, { "epoch": 1.1889101338432122, "grad_norm": 1.2293124198913574, "learning_rate": 5e-06, "loss": 0.065, "step": 6218 }, { "epoch": 1.1891013384321223, "grad_norm": 3.0053391456604004, "learning_rate": 5e-06, "loss": 0.4573, "step": 6219 }, { "epoch": 1.1892925430210326, "grad_norm": 2.9051673412323, "learning_rate": 5e-06, "loss": 0.0862, "step": 6220 }, { "epoch": 1.1894837476099427, "grad_norm": 1.1124510765075684, "learning_rate": 5e-06, "loss": 0.0373, "step": 6221 }, { "epoch": 1.1896749521988528, "grad_norm": 6.297783374786377, "learning_rate": 5e-06, "loss": 0.1291, "step": 6222 }, { "epoch": 1.1898661567877629, "grad_norm": 3.238295316696167, "learning_rate": 5e-06, "loss": 0.1066, "step": 6223 }, { "epoch": 1.190057361376673, "grad_norm": 1.506626844406128, "learning_rate": 5e-06, "loss": 0.0639, "step": 6224 }, { "epoch": 1.1902485659655833, "grad_norm": 1.8110731840133667, "learning_rate": 5e-06, "loss": 0.1739, "step": 6225 }, { "epoch": 1.1904397705544933, "grad_norm": 1.8189138174057007, "learning_rate": 5e-06, "loss": 0.2069, "step": 6226 }, { "epoch": 1.1906309751434034, "grad_norm": 1.4828901290893555, "learning_rate": 5e-06, "loss": 0.155, "step": 6227 }, { "epoch": 1.1908221797323135, "grad_norm": 1.3421870470046997, "learning_rate": 5e-06, "loss": 0.0718, "step": 6228 }, { "epoch": 1.1910133843212236, "grad_norm": 1.5782335996627808, "learning_rate": 5e-06, "loss": 0.1227, "step": 6229 }, { "epoch": 1.191204588910134, "grad_norm": 0.5779447555541992, "learning_rate": 5e-06, "loss": 0.0196, "step": 6230 }, { "epoch": 1.191395793499044, "grad_norm": 3.198089599609375, "learning_rate": 5e-06, "loss": 0.4032, "step": 6231 }, { "epoch": 1.191586998087954, "grad_norm": 2.1662838459014893, "learning_rate": 5e-06, "loss": 0.1112, "step": 6232 }, { "epoch": 1.1917782026768642, "grad_norm": 0.6701384782791138, "learning_rate": 5e-06, "loss": 0.0796, "step": 6233 }, { "epoch": 1.1919694072657743, "grad_norm": 3.48876953125, "learning_rate": 5e-06, "loss": 0.3998, "step": 6234 }, { "epoch": 1.1921606118546846, "grad_norm": 1.7155407667160034, "learning_rate": 5e-06, "loss": 0.0601, "step": 6235 }, { "epoch": 1.1923518164435947, "grad_norm": 1.6817251443862915, "learning_rate": 5e-06, "loss": 0.0674, "step": 6236 }, { "epoch": 1.1925430210325048, "grad_norm": 1.244370460510254, "learning_rate": 5e-06, "loss": 0.0725, "step": 6237 }, { "epoch": 1.1927342256214148, "grad_norm": 0.7383753657341003, "learning_rate": 5e-06, "loss": 0.0387, "step": 6238 }, { "epoch": 1.192925430210325, "grad_norm": 3.243826150894165, "learning_rate": 5e-06, "loss": 0.2708, "step": 6239 }, { "epoch": 1.1931166347992352, "grad_norm": 1.0814484357833862, "learning_rate": 5e-06, "loss": 0.0429, "step": 6240 }, { "epoch": 1.1933078393881453, "grad_norm": 1.800237774848938, "learning_rate": 5e-06, "loss": 0.1282, "step": 6241 }, { "epoch": 1.1934990439770554, "grad_norm": 1.2890852689743042, "learning_rate": 5e-06, "loss": 0.0736, "step": 6242 }, { "epoch": 1.1936902485659655, "grad_norm": 1.504570722579956, "learning_rate": 5e-06, "loss": 0.1157, "step": 6243 }, { "epoch": 1.1938814531548756, "grad_norm": 3.0813817977905273, "learning_rate": 5e-06, "loss": 0.2378, "step": 6244 }, { "epoch": 1.194072657743786, "grad_norm": 2.5581462383270264, "learning_rate": 5e-06, "loss": 0.2248, "step": 6245 }, { "epoch": 1.194263862332696, "grad_norm": 1.9704996347427368, "learning_rate": 5e-06, "loss": 0.1308, "step": 6246 }, { "epoch": 1.194455066921606, "grad_norm": 1.2662582397460938, "learning_rate": 5e-06, "loss": 0.0757, "step": 6247 }, { "epoch": 1.1946462715105162, "grad_norm": 1.3485127687454224, "learning_rate": 5e-06, "loss": 0.0613, "step": 6248 }, { "epoch": 1.1948374760994265, "grad_norm": 1.4152460098266602, "learning_rate": 5e-06, "loss": 0.0603, "step": 6249 }, { "epoch": 1.1950286806883366, "grad_norm": 2.333681583404541, "learning_rate": 5e-06, "loss": 0.1836, "step": 6250 }, { "epoch": 1.1952198852772467, "grad_norm": 1.0232439041137695, "learning_rate": 5e-06, "loss": 0.0979, "step": 6251 }, { "epoch": 1.1954110898661567, "grad_norm": 1.8188303709030151, "learning_rate": 5e-06, "loss": 0.088, "step": 6252 }, { "epoch": 1.1956022944550668, "grad_norm": 2.2055118083953857, "learning_rate": 5e-06, "loss": 0.1996, "step": 6253 }, { "epoch": 1.1957934990439771, "grad_norm": 1.0319563150405884, "learning_rate": 5e-06, "loss": 0.0685, "step": 6254 }, { "epoch": 1.1959847036328872, "grad_norm": 1.0295190811157227, "learning_rate": 5e-06, "loss": 0.0491, "step": 6255 }, { "epoch": 1.1961759082217973, "grad_norm": 1.7119534015655518, "learning_rate": 5e-06, "loss": 0.1958, "step": 6256 }, { "epoch": 1.1963671128107074, "grad_norm": 2.6020867824554443, "learning_rate": 5e-06, "loss": 0.3227, "step": 6257 }, { "epoch": 1.1965583173996177, "grad_norm": 0.8010663390159607, "learning_rate": 5e-06, "loss": 0.0648, "step": 6258 }, { "epoch": 1.1967495219885278, "grad_norm": 1.836167335510254, "learning_rate": 5e-06, "loss": 0.1377, "step": 6259 }, { "epoch": 1.1969407265774379, "grad_norm": 2.1391921043395996, "learning_rate": 5e-06, "loss": 0.2105, "step": 6260 }, { "epoch": 1.197131931166348, "grad_norm": 1.2611051797866821, "learning_rate": 5e-06, "loss": 0.0711, "step": 6261 }, { "epoch": 1.197323135755258, "grad_norm": 1.799573540687561, "learning_rate": 5e-06, "loss": 0.1808, "step": 6262 }, { "epoch": 1.1975143403441684, "grad_norm": 1.2242969274520874, "learning_rate": 5e-06, "loss": 0.1472, "step": 6263 }, { "epoch": 1.1977055449330785, "grad_norm": 1.190840721130371, "learning_rate": 5e-06, "loss": 0.0509, "step": 6264 }, { "epoch": 1.1978967495219885, "grad_norm": 1.2675566673278809, "learning_rate": 5e-06, "loss": 0.0601, "step": 6265 }, { "epoch": 1.1980879541108986, "grad_norm": 1.62624990940094, "learning_rate": 5e-06, "loss": 0.0855, "step": 6266 }, { "epoch": 1.1982791586998087, "grad_norm": 0.9653342366218567, "learning_rate": 5e-06, "loss": 0.0298, "step": 6267 }, { "epoch": 1.198470363288719, "grad_norm": 2.015363931655884, "learning_rate": 5e-06, "loss": 0.1699, "step": 6268 }, { "epoch": 1.1986615678776291, "grad_norm": 3.233055591583252, "learning_rate": 5e-06, "loss": 0.3099, "step": 6269 }, { "epoch": 1.1988527724665392, "grad_norm": 3.2268948554992676, "learning_rate": 5e-06, "loss": 0.4068, "step": 6270 }, { "epoch": 1.1990439770554493, "grad_norm": 2.4663195610046387, "learning_rate": 5e-06, "loss": 0.1524, "step": 6271 }, { "epoch": 1.1992351816443594, "grad_norm": 1.4187419414520264, "learning_rate": 5e-06, "loss": 0.0471, "step": 6272 }, { "epoch": 1.1994263862332697, "grad_norm": 0.7417177557945251, "learning_rate": 5e-06, "loss": 0.0204, "step": 6273 }, { "epoch": 1.1996175908221798, "grad_norm": 1.6655139923095703, "learning_rate": 5e-06, "loss": 0.0895, "step": 6274 }, { "epoch": 1.1998087954110899, "grad_norm": 2.4011824131011963, "learning_rate": 5e-06, "loss": 0.2601, "step": 6275 }, { "epoch": 1.2, "grad_norm": 1.6423674821853638, "learning_rate": 5e-06, "loss": 0.1036, "step": 6276 }, { "epoch": 1.20019120458891, "grad_norm": 1.1955245733261108, "learning_rate": 5e-06, "loss": 0.1115, "step": 6277 }, { "epoch": 1.2003824091778204, "grad_norm": 2.667212724685669, "learning_rate": 5e-06, "loss": 0.2089, "step": 6278 }, { "epoch": 1.2005736137667304, "grad_norm": 1.371573567390442, "learning_rate": 5e-06, "loss": 0.0467, "step": 6279 }, { "epoch": 1.2007648183556405, "grad_norm": 0.9084519147872925, "learning_rate": 5e-06, "loss": 0.0358, "step": 6280 }, { "epoch": 1.2009560229445506, "grad_norm": 2.8557240962982178, "learning_rate": 5e-06, "loss": 0.3219, "step": 6281 }, { "epoch": 1.2011472275334607, "grad_norm": 3.1929121017456055, "learning_rate": 5e-06, "loss": 0.369, "step": 6282 }, { "epoch": 1.201338432122371, "grad_norm": 1.8422592878341675, "learning_rate": 5e-06, "loss": 0.1406, "step": 6283 }, { "epoch": 1.201529636711281, "grad_norm": 1.644729733467102, "learning_rate": 5e-06, "loss": 0.0833, "step": 6284 }, { "epoch": 1.2017208413001912, "grad_norm": 1.2755012512207031, "learning_rate": 5e-06, "loss": 0.0553, "step": 6285 }, { "epoch": 1.2019120458891013, "grad_norm": 0.9884834885597229, "learning_rate": 5e-06, "loss": 0.034, "step": 6286 }, { "epoch": 1.2021032504780114, "grad_norm": 1.7062479257583618, "learning_rate": 5e-06, "loss": 0.1823, "step": 6287 }, { "epoch": 1.2022944550669217, "grad_norm": 1.1571954488754272, "learning_rate": 5e-06, "loss": 0.0542, "step": 6288 }, { "epoch": 1.2024856596558318, "grad_norm": 2.0687966346740723, "learning_rate": 5e-06, "loss": 0.2359, "step": 6289 }, { "epoch": 1.2026768642447419, "grad_norm": 1.4236472845077515, "learning_rate": 5e-06, "loss": 0.0533, "step": 6290 }, { "epoch": 1.202868068833652, "grad_norm": 1.979692816734314, "learning_rate": 5e-06, "loss": 0.0762, "step": 6291 }, { "epoch": 1.203059273422562, "grad_norm": 0.6412269473075867, "learning_rate": 5e-06, "loss": 0.0188, "step": 6292 }, { "epoch": 1.2032504780114723, "grad_norm": 2.0749592781066895, "learning_rate": 5e-06, "loss": 0.1959, "step": 6293 }, { "epoch": 1.2034416826003824, "grad_norm": 1.9312831163406372, "learning_rate": 5e-06, "loss": 0.1836, "step": 6294 }, { "epoch": 1.2036328871892925, "grad_norm": 1.3121813535690308, "learning_rate": 5e-06, "loss": 0.0944, "step": 6295 }, { "epoch": 1.2038240917782026, "grad_norm": 1.8166366815567017, "learning_rate": 5e-06, "loss": 0.1763, "step": 6296 }, { "epoch": 1.2040152963671127, "grad_norm": 2.4472837448120117, "learning_rate": 5e-06, "loss": 0.2342, "step": 6297 }, { "epoch": 1.204206500956023, "grad_norm": 1.2451283931732178, "learning_rate": 5e-06, "loss": 0.0518, "step": 6298 }, { "epoch": 1.204397705544933, "grad_norm": 1.7306233644485474, "learning_rate": 5e-06, "loss": 0.0941, "step": 6299 }, { "epoch": 1.2045889101338432, "grad_norm": 1.8587627410888672, "learning_rate": 5e-06, "loss": 0.1325, "step": 6300 }, { "epoch": 1.2047801147227533, "grad_norm": 1.9993771314620972, "learning_rate": 5e-06, "loss": 0.2328, "step": 6301 }, { "epoch": 1.2049713193116636, "grad_norm": 1.5691461563110352, "learning_rate": 5e-06, "loss": 0.1121, "step": 6302 }, { "epoch": 1.2051625239005737, "grad_norm": 1.0496647357940674, "learning_rate": 5e-06, "loss": 0.0498, "step": 6303 }, { "epoch": 1.2053537284894837, "grad_norm": 6.206069469451904, "learning_rate": 5e-06, "loss": 0.0789, "step": 6304 }, { "epoch": 1.2055449330783938, "grad_norm": 1.0517714023590088, "learning_rate": 5e-06, "loss": 0.0533, "step": 6305 }, { "epoch": 1.205736137667304, "grad_norm": 1.230154275894165, "learning_rate": 5e-06, "loss": 0.0728, "step": 6306 }, { "epoch": 1.2059273422562142, "grad_norm": 1.2463685274124146, "learning_rate": 5e-06, "loss": 0.0973, "step": 6307 }, { "epoch": 1.2061185468451243, "grad_norm": 2.0279324054718018, "learning_rate": 5e-06, "loss": 0.1416, "step": 6308 }, { "epoch": 1.2063097514340344, "grad_norm": 1.0040415525436401, "learning_rate": 5e-06, "loss": 0.0787, "step": 6309 }, { "epoch": 1.2065009560229445, "grad_norm": 0.8246625065803528, "learning_rate": 5e-06, "loss": 0.0398, "step": 6310 }, { "epoch": 1.2066921606118548, "grad_norm": 1.2535613775253296, "learning_rate": 5e-06, "loss": 0.0478, "step": 6311 }, { "epoch": 1.206883365200765, "grad_norm": 1.6264280080795288, "learning_rate": 5e-06, "loss": 0.0806, "step": 6312 }, { "epoch": 1.207074569789675, "grad_norm": 2.006242275238037, "learning_rate": 5e-06, "loss": 0.1946, "step": 6313 }, { "epoch": 1.207265774378585, "grad_norm": 1.1853926181793213, "learning_rate": 5e-06, "loss": 0.0983, "step": 6314 }, { "epoch": 1.2074569789674952, "grad_norm": 1.7839405536651611, "learning_rate": 5e-06, "loss": 0.1751, "step": 6315 }, { "epoch": 1.2076481835564055, "grad_norm": 1.2711913585662842, "learning_rate": 5e-06, "loss": 0.0652, "step": 6316 }, { "epoch": 1.2078393881453156, "grad_norm": 3.0699965953826904, "learning_rate": 5e-06, "loss": 0.0587, "step": 6317 }, { "epoch": 1.2080305927342256, "grad_norm": 1.580519199371338, "learning_rate": 5e-06, "loss": 0.1163, "step": 6318 }, { "epoch": 1.2082217973231357, "grad_norm": 1.9091516733169556, "learning_rate": 5e-06, "loss": 0.2536, "step": 6319 }, { "epoch": 1.2084130019120458, "grad_norm": 1.5716497898101807, "learning_rate": 5e-06, "loss": 0.1696, "step": 6320 }, { "epoch": 1.2086042065009561, "grad_norm": 1.1605068445205688, "learning_rate": 5e-06, "loss": 0.0474, "step": 6321 }, { "epoch": 1.2087954110898662, "grad_norm": 1.2060127258300781, "learning_rate": 5e-06, "loss": 0.0683, "step": 6322 }, { "epoch": 1.2089866156787763, "grad_norm": 1.9277080297470093, "learning_rate": 5e-06, "loss": 0.196, "step": 6323 }, { "epoch": 1.2091778202676864, "grad_norm": 2.57090163230896, "learning_rate": 5e-06, "loss": 0.0813, "step": 6324 }, { "epoch": 1.2093690248565965, "grad_norm": 2.1226699352264404, "learning_rate": 5e-06, "loss": 0.249, "step": 6325 }, { "epoch": 1.2095602294455068, "grad_norm": 1.5362803936004639, "learning_rate": 5e-06, "loss": 0.183, "step": 6326 }, { "epoch": 1.2097514340344169, "grad_norm": 2.6527233123779297, "learning_rate": 5e-06, "loss": 0.2657, "step": 6327 }, { "epoch": 1.209942638623327, "grad_norm": 1.9042359590530396, "learning_rate": 5e-06, "loss": 0.1758, "step": 6328 }, { "epoch": 1.210133843212237, "grad_norm": 0.6970901489257812, "learning_rate": 5e-06, "loss": 0.0262, "step": 6329 }, { "epoch": 1.2103250478011471, "grad_norm": 1.6701407432556152, "learning_rate": 5e-06, "loss": 0.0508, "step": 6330 }, { "epoch": 1.2105162523900574, "grad_norm": 2.219998836517334, "learning_rate": 5e-06, "loss": 0.376, "step": 6331 }, { "epoch": 1.2107074569789675, "grad_norm": 1.266753077507019, "learning_rate": 5e-06, "loss": 0.078, "step": 6332 }, { "epoch": 1.2108986615678776, "grad_norm": 1.639849066734314, "learning_rate": 5e-06, "loss": 0.1245, "step": 6333 }, { "epoch": 1.2110898661567877, "grad_norm": 1.036638855934143, "learning_rate": 5e-06, "loss": 0.043, "step": 6334 }, { "epoch": 1.2112810707456978, "grad_norm": 1.1263424158096313, "learning_rate": 5e-06, "loss": 0.0321, "step": 6335 }, { "epoch": 1.211472275334608, "grad_norm": 1.8749959468841553, "learning_rate": 5e-06, "loss": 0.0971, "step": 6336 }, { "epoch": 1.2116634799235182, "grad_norm": 0.9465709924697876, "learning_rate": 5e-06, "loss": 0.0425, "step": 6337 }, { "epoch": 1.2118546845124283, "grad_norm": 2.4710988998413086, "learning_rate": 5e-06, "loss": 0.274, "step": 6338 }, { "epoch": 1.2120458891013384, "grad_norm": 1.7354429960250854, "learning_rate": 5e-06, "loss": 0.1027, "step": 6339 }, { "epoch": 1.2122370936902485, "grad_norm": 1.3433597087860107, "learning_rate": 5e-06, "loss": 0.0579, "step": 6340 }, { "epoch": 1.2124282982791588, "grad_norm": 1.5866605043411255, "learning_rate": 5e-06, "loss": 0.0936, "step": 6341 }, { "epoch": 1.2126195028680689, "grad_norm": 2.422170400619507, "learning_rate": 5e-06, "loss": 0.0584, "step": 6342 }, { "epoch": 1.212810707456979, "grad_norm": 2.6649551391601562, "learning_rate": 5e-06, "loss": 0.1525, "step": 6343 }, { "epoch": 1.213001912045889, "grad_norm": 1.7258577346801758, "learning_rate": 5e-06, "loss": 0.1976, "step": 6344 }, { "epoch": 1.2131931166347991, "grad_norm": 2.841470241546631, "learning_rate": 5e-06, "loss": 0.3982, "step": 6345 }, { "epoch": 1.2133843212237094, "grad_norm": 1.485338807106018, "learning_rate": 5e-06, "loss": 0.044, "step": 6346 }, { "epoch": 1.2135755258126195, "grad_norm": 1.383081316947937, "learning_rate": 5e-06, "loss": 0.2076, "step": 6347 }, { "epoch": 1.2137667304015296, "grad_norm": 0.8841148018836975, "learning_rate": 5e-06, "loss": 0.0334, "step": 6348 }, { "epoch": 1.2139579349904397, "grad_norm": 1.4105879068374634, "learning_rate": 5e-06, "loss": 0.043, "step": 6349 }, { "epoch": 1.21414913957935, "grad_norm": 2.799672842025757, "learning_rate": 5e-06, "loss": 0.2811, "step": 6350 }, { "epoch": 1.21434034416826, "grad_norm": 1.106540322303772, "learning_rate": 5e-06, "loss": 0.0508, "step": 6351 }, { "epoch": 1.2145315487571702, "grad_norm": 1.1798187494277954, "learning_rate": 5e-06, "loss": 0.071, "step": 6352 }, { "epoch": 1.2147227533460803, "grad_norm": 1.4219733476638794, "learning_rate": 5e-06, "loss": 0.0609, "step": 6353 }, { "epoch": 1.2149139579349904, "grad_norm": 0.49215996265411377, "learning_rate": 5e-06, "loss": 0.013, "step": 6354 }, { "epoch": 1.2151051625239007, "grad_norm": 1.4540727138519287, "learning_rate": 5e-06, "loss": 0.066, "step": 6355 }, { "epoch": 1.2152963671128107, "grad_norm": 2.5020601749420166, "learning_rate": 5e-06, "loss": 0.16, "step": 6356 }, { "epoch": 1.2154875717017208, "grad_norm": 1.5029916763305664, "learning_rate": 5e-06, "loss": 0.1123, "step": 6357 }, { "epoch": 1.215678776290631, "grad_norm": 1.260717749595642, "learning_rate": 5e-06, "loss": 0.0678, "step": 6358 }, { "epoch": 1.215869980879541, "grad_norm": 1.8185780048370361, "learning_rate": 5e-06, "loss": 0.1582, "step": 6359 }, { "epoch": 1.2160611854684513, "grad_norm": 0.7864851951599121, "learning_rate": 5e-06, "loss": 0.0319, "step": 6360 }, { "epoch": 1.2162523900573614, "grad_norm": 1.0918238162994385, "learning_rate": 5e-06, "loss": 0.0447, "step": 6361 }, { "epoch": 1.2164435946462715, "grad_norm": 2.916642427444458, "learning_rate": 5e-06, "loss": 0.2256, "step": 6362 }, { "epoch": 1.2166347992351816, "grad_norm": 2.603426218032837, "learning_rate": 5e-06, "loss": 0.1929, "step": 6363 }, { "epoch": 1.216826003824092, "grad_norm": 1.888663411140442, "learning_rate": 5e-06, "loss": 0.1862, "step": 6364 }, { "epoch": 1.217017208413002, "grad_norm": 5.557328701019287, "learning_rate": 5e-06, "loss": 0.1513, "step": 6365 }, { "epoch": 1.217208413001912, "grad_norm": 0.6387971043586731, "learning_rate": 5e-06, "loss": 0.0625, "step": 6366 }, { "epoch": 1.2173996175908222, "grad_norm": 0.80973881483078, "learning_rate": 5e-06, "loss": 0.0612, "step": 6367 }, { "epoch": 1.2175908221797322, "grad_norm": 1.4961767196655273, "learning_rate": 5e-06, "loss": 0.0794, "step": 6368 }, { "epoch": 1.2177820267686426, "grad_norm": 1.3402931690216064, "learning_rate": 5e-06, "loss": 0.1085, "step": 6369 }, { "epoch": 1.2179732313575526, "grad_norm": 1.615916132926941, "learning_rate": 5e-06, "loss": 0.1275, "step": 6370 }, { "epoch": 1.2181644359464627, "grad_norm": 0.9041286110877991, "learning_rate": 5e-06, "loss": 0.0461, "step": 6371 }, { "epoch": 1.2183556405353728, "grad_norm": 1.5138685703277588, "learning_rate": 5e-06, "loss": 0.0752, "step": 6372 }, { "epoch": 1.218546845124283, "grad_norm": 2.4668264389038086, "learning_rate": 5e-06, "loss": 0.1996, "step": 6373 }, { "epoch": 1.2187380497131932, "grad_norm": 1.754960060119629, "learning_rate": 5e-06, "loss": 0.0677, "step": 6374 }, { "epoch": 1.2189292543021033, "grad_norm": 1.3340129852294922, "learning_rate": 5e-06, "loss": 0.0818, "step": 6375 }, { "epoch": 1.2191204588910134, "grad_norm": 2.1007907390594482, "learning_rate": 5e-06, "loss": 0.1689, "step": 6376 }, { "epoch": 1.2193116634799235, "grad_norm": 2.982419729232788, "learning_rate": 5e-06, "loss": 0.217, "step": 6377 }, { "epoch": 1.2195028680688336, "grad_norm": 0.9394291043281555, "learning_rate": 5e-06, "loss": 0.079, "step": 6378 }, { "epoch": 1.2196940726577439, "grad_norm": 0.9420931935310364, "learning_rate": 5e-06, "loss": 0.0518, "step": 6379 }, { "epoch": 1.219885277246654, "grad_norm": 1.3374733924865723, "learning_rate": 5e-06, "loss": 0.0267, "step": 6380 }, { "epoch": 1.220076481835564, "grad_norm": 3.1424825191497803, "learning_rate": 5e-06, "loss": 0.2596, "step": 6381 }, { "epoch": 1.2202676864244741, "grad_norm": 1.0249494314193726, "learning_rate": 5e-06, "loss": 0.0696, "step": 6382 }, { "epoch": 1.2204588910133842, "grad_norm": 1.8867278099060059, "learning_rate": 5e-06, "loss": 0.0802, "step": 6383 }, { "epoch": 1.2206500956022945, "grad_norm": 1.559537649154663, "learning_rate": 5e-06, "loss": 0.0624, "step": 6384 }, { "epoch": 1.2208413001912046, "grad_norm": 0.673812747001648, "learning_rate": 5e-06, "loss": 0.0599, "step": 6385 }, { "epoch": 1.2210325047801147, "grad_norm": 0.6307937502861023, "learning_rate": 5e-06, "loss": 0.0229, "step": 6386 }, { "epoch": 1.2212237093690248, "grad_norm": 1.3093122243881226, "learning_rate": 5e-06, "loss": 0.1147, "step": 6387 }, { "epoch": 1.221414913957935, "grad_norm": 2.759263277053833, "learning_rate": 5e-06, "loss": 0.4477, "step": 6388 }, { "epoch": 1.2216061185468452, "grad_norm": 2.4608709812164307, "learning_rate": 5e-06, "loss": 0.313, "step": 6389 }, { "epoch": 1.2217973231357553, "grad_norm": 1.8315072059631348, "learning_rate": 5e-06, "loss": 0.089, "step": 6390 }, { "epoch": 1.2219885277246654, "grad_norm": 0.7214317321777344, "learning_rate": 5e-06, "loss": 0.0515, "step": 6391 }, { "epoch": 1.2221797323135755, "grad_norm": 0.8006981611251831, "learning_rate": 5e-06, "loss": 0.0421, "step": 6392 }, { "epoch": 1.2223709369024855, "grad_norm": 1.7554163932800293, "learning_rate": 5e-06, "loss": 0.147, "step": 6393 }, { "epoch": 1.2225621414913959, "grad_norm": 2.516019344329834, "learning_rate": 5e-06, "loss": 0.3259, "step": 6394 }, { "epoch": 1.222753346080306, "grad_norm": 1.6270321607589722, "learning_rate": 5e-06, "loss": 0.0723, "step": 6395 }, { "epoch": 1.222944550669216, "grad_norm": 0.9091405272483826, "learning_rate": 5e-06, "loss": 0.1058, "step": 6396 }, { "epoch": 1.2231357552581261, "grad_norm": 0.7934021353721619, "learning_rate": 5e-06, "loss": 0.0766, "step": 6397 }, { "epoch": 1.2233269598470362, "grad_norm": 0.9105468988418579, "learning_rate": 5e-06, "loss": 0.0267, "step": 6398 }, { "epoch": 1.2235181644359465, "grad_norm": 1.7011659145355225, "learning_rate": 5e-06, "loss": 0.1063, "step": 6399 }, { "epoch": 1.2237093690248566, "grad_norm": 1.6316510438919067, "learning_rate": 5e-06, "loss": 0.1647, "step": 6400 }, { "epoch": 1.2239005736137667, "grad_norm": 1.9868152141571045, "learning_rate": 5e-06, "loss": 0.1778, "step": 6401 }, { "epoch": 1.2240917782026768, "grad_norm": 1.912008285522461, "learning_rate": 5e-06, "loss": 0.1873, "step": 6402 }, { "epoch": 1.224282982791587, "grad_norm": 1.5919640064239502, "learning_rate": 5e-06, "loss": 0.1538, "step": 6403 }, { "epoch": 1.2244741873804972, "grad_norm": 1.775577425956726, "learning_rate": 5e-06, "loss": 0.0358, "step": 6404 }, { "epoch": 1.2246653919694073, "grad_norm": 0.8517948985099792, "learning_rate": 5e-06, "loss": 0.0253, "step": 6405 }, { "epoch": 1.2248565965583174, "grad_norm": 2.3561553955078125, "learning_rate": 5e-06, "loss": 0.2985, "step": 6406 }, { "epoch": 1.2250478011472274, "grad_norm": 1.6562501192092896, "learning_rate": 5e-06, "loss": 0.2061, "step": 6407 }, { "epoch": 1.2252390057361378, "grad_norm": 1.6132574081420898, "learning_rate": 5e-06, "loss": 0.1679, "step": 6408 }, { "epoch": 1.2254302103250478, "grad_norm": 1.9934953451156616, "learning_rate": 5e-06, "loss": 0.2289, "step": 6409 }, { "epoch": 1.225621414913958, "grad_norm": 1.7691314220428467, "learning_rate": 5e-06, "loss": 0.1696, "step": 6410 }, { "epoch": 1.225812619502868, "grad_norm": 1.1167519092559814, "learning_rate": 5e-06, "loss": 0.0377, "step": 6411 }, { "epoch": 1.2260038240917783, "grad_norm": 1.710638165473938, "learning_rate": 5e-06, "loss": 0.2259, "step": 6412 }, { "epoch": 1.2261950286806884, "grad_norm": 2.857348680496216, "learning_rate": 5e-06, "loss": 0.442, "step": 6413 }, { "epoch": 1.2263862332695985, "grad_norm": 2.343129873275757, "learning_rate": 5e-06, "loss": 0.2763, "step": 6414 }, { "epoch": 1.2265774378585086, "grad_norm": 0.8242167830467224, "learning_rate": 5e-06, "loss": 0.0545, "step": 6415 }, { "epoch": 1.2267686424474187, "grad_norm": 1.5538809299468994, "learning_rate": 5e-06, "loss": 0.0966, "step": 6416 }, { "epoch": 1.226959847036329, "grad_norm": 1.174586534500122, "learning_rate": 5e-06, "loss": 0.0375, "step": 6417 }, { "epoch": 1.227151051625239, "grad_norm": 2.826963186264038, "learning_rate": 5e-06, "loss": 0.3052, "step": 6418 }, { "epoch": 1.2273422562141492, "grad_norm": 2.5396435260772705, "learning_rate": 5e-06, "loss": 0.2724, "step": 6419 }, { "epoch": 1.2275334608030593, "grad_norm": 2.1552212238311768, "learning_rate": 5e-06, "loss": 0.2733, "step": 6420 }, { "epoch": 1.2277246653919693, "grad_norm": 2.812382936477661, "learning_rate": 5e-06, "loss": 0.2159, "step": 6421 }, { "epoch": 1.2279158699808796, "grad_norm": 1.1729477643966675, "learning_rate": 5e-06, "loss": 0.1066, "step": 6422 }, { "epoch": 1.2281070745697897, "grad_norm": 1.4861737489700317, "learning_rate": 5e-06, "loss": 0.1204, "step": 6423 }, { "epoch": 1.2282982791586998, "grad_norm": 0.9737563729286194, "learning_rate": 5e-06, "loss": 0.0359, "step": 6424 }, { "epoch": 1.22848948374761, "grad_norm": 2.244095802307129, "learning_rate": 5e-06, "loss": 0.3383, "step": 6425 }, { "epoch": 1.22868068833652, "grad_norm": 1.4030261039733887, "learning_rate": 5e-06, "loss": 0.0753, "step": 6426 }, { "epoch": 1.2288718929254303, "grad_norm": 1.439172625541687, "learning_rate": 5e-06, "loss": 0.0642, "step": 6427 }, { "epoch": 1.2290630975143404, "grad_norm": 0.6380961537361145, "learning_rate": 5e-06, "loss": 0.0633, "step": 6428 }, { "epoch": 1.2292543021032505, "grad_norm": 0.9303730726242065, "learning_rate": 5e-06, "loss": 0.0676, "step": 6429 }, { "epoch": 1.2294455066921606, "grad_norm": 1.2828363180160522, "learning_rate": 5e-06, "loss": 0.0468, "step": 6430 }, { "epoch": 1.2296367112810707, "grad_norm": 2.6689090728759766, "learning_rate": 5e-06, "loss": 0.3648, "step": 6431 }, { "epoch": 1.229827915869981, "grad_norm": 2.0346622467041016, "learning_rate": 5e-06, "loss": 0.2261, "step": 6432 }, { "epoch": 1.230019120458891, "grad_norm": 1.939487338066101, "learning_rate": 5e-06, "loss": 0.193, "step": 6433 }, { "epoch": 1.2302103250478011, "grad_norm": 1.3966894149780273, "learning_rate": 5e-06, "loss": 0.0372, "step": 6434 }, { "epoch": 1.2304015296367112, "grad_norm": 1.1000654697418213, "learning_rate": 5e-06, "loss": 0.0814, "step": 6435 }, { "epoch": 1.2305927342256213, "grad_norm": 4.047434329986572, "learning_rate": 5e-06, "loss": 0.1489, "step": 6436 }, { "epoch": 1.2307839388145316, "grad_norm": 1.382005214691162, "learning_rate": 5e-06, "loss": 0.0809, "step": 6437 }, { "epoch": 1.2309751434034417, "grad_norm": 1.00590181350708, "learning_rate": 5e-06, "loss": 0.0726, "step": 6438 }, { "epoch": 1.2311663479923518, "grad_norm": 1.4503659009933472, "learning_rate": 5e-06, "loss": 0.1095, "step": 6439 }, { "epoch": 1.231357552581262, "grad_norm": 1.0048933029174805, "learning_rate": 5e-06, "loss": 0.0531, "step": 6440 }, { "epoch": 1.231548757170172, "grad_norm": 1.0850677490234375, "learning_rate": 5e-06, "loss": 0.0485, "step": 6441 }, { "epoch": 1.2317399617590823, "grad_norm": 2.6655921936035156, "learning_rate": 5e-06, "loss": 0.1078, "step": 6442 }, { "epoch": 1.2319311663479924, "grad_norm": 0.8888630867004395, "learning_rate": 5e-06, "loss": 0.0555, "step": 6443 }, { "epoch": 1.2321223709369025, "grad_norm": 2.1980793476104736, "learning_rate": 5e-06, "loss": 0.3322, "step": 6444 }, { "epoch": 1.2323135755258126, "grad_norm": 1.1009891033172607, "learning_rate": 5e-06, "loss": 0.0593, "step": 6445 }, { "epoch": 1.2325047801147226, "grad_norm": 2.154589891433716, "learning_rate": 5e-06, "loss": 0.1893, "step": 6446 }, { "epoch": 1.232695984703633, "grad_norm": 2.7047064304351807, "learning_rate": 5e-06, "loss": 0.178, "step": 6447 }, { "epoch": 1.232887189292543, "grad_norm": 1.372484803199768, "learning_rate": 5e-06, "loss": 0.0953, "step": 6448 }, { "epoch": 1.2330783938814531, "grad_norm": 1.1343607902526855, "learning_rate": 5e-06, "loss": 0.0352, "step": 6449 }, { "epoch": 1.2332695984703632, "grad_norm": 2.3495070934295654, "learning_rate": 5e-06, "loss": 0.2928, "step": 6450 }, { "epoch": 1.2334608030592733, "grad_norm": 1.3010920286178589, "learning_rate": 5e-06, "loss": 0.0774, "step": 6451 }, { "epoch": 1.2336520076481836, "grad_norm": 1.11711585521698, "learning_rate": 5e-06, "loss": 0.0818, "step": 6452 }, { "epoch": 1.2338432122370937, "grad_norm": 2.2713661193847656, "learning_rate": 5e-06, "loss": 0.1201, "step": 6453 }, { "epoch": 1.2340344168260038, "grad_norm": 1.1702678203582764, "learning_rate": 5e-06, "loss": 0.0606, "step": 6454 }, { "epoch": 1.2342256214149139, "grad_norm": 1.4318171739578247, "learning_rate": 5e-06, "loss": 0.0504, "step": 6455 }, { "epoch": 1.2344168260038242, "grad_norm": 1.7486087083816528, "learning_rate": 5e-06, "loss": 0.209, "step": 6456 }, { "epoch": 1.2346080305927343, "grad_norm": 2.538674831390381, "learning_rate": 5e-06, "loss": 0.2618, "step": 6457 }, { "epoch": 1.2347992351816444, "grad_norm": 1.6931205987930298, "learning_rate": 5e-06, "loss": 0.1153, "step": 6458 }, { "epoch": 1.2349904397705544, "grad_norm": 2.5916154384613037, "learning_rate": 5e-06, "loss": 0.2899, "step": 6459 }, { "epoch": 1.2351816443594645, "grad_norm": 0.5500595569610596, "learning_rate": 5e-06, "loss": 0.0244, "step": 6460 }, { "epoch": 1.2353728489483748, "grad_norm": 1.0323785543441772, "learning_rate": 5e-06, "loss": 0.0267, "step": 6461 }, { "epoch": 1.235564053537285, "grad_norm": 2.1304259300231934, "learning_rate": 5e-06, "loss": 0.2838, "step": 6462 }, { "epoch": 1.235755258126195, "grad_norm": 1.0863720178604126, "learning_rate": 5e-06, "loss": 0.0669, "step": 6463 }, { "epoch": 1.235946462715105, "grad_norm": 1.7608684301376343, "learning_rate": 5e-06, "loss": 0.1987, "step": 6464 }, { "epoch": 1.2361376673040154, "grad_norm": 2.235177516937256, "learning_rate": 5e-06, "loss": 0.2417, "step": 6465 }, { "epoch": 1.2363288718929255, "grad_norm": 0.9923659563064575, "learning_rate": 5e-06, "loss": 0.0576, "step": 6466 }, { "epoch": 1.2365200764818356, "grad_norm": 0.47206011414527893, "learning_rate": 5e-06, "loss": 0.0123, "step": 6467 }, { "epoch": 1.2367112810707457, "grad_norm": 1.7471652030944824, "learning_rate": 5e-06, "loss": 0.1048, "step": 6468 }, { "epoch": 1.2369024856596558, "grad_norm": 2.4238178730010986, "learning_rate": 5e-06, "loss": 0.3712, "step": 6469 }, { "epoch": 1.237093690248566, "grad_norm": 1.817383050918579, "learning_rate": 5e-06, "loss": 0.173, "step": 6470 }, { "epoch": 1.2372848948374762, "grad_norm": 0.883569598197937, "learning_rate": 5e-06, "loss": 0.0748, "step": 6471 }, { "epoch": 1.2374760994263863, "grad_norm": 1.4984726905822754, "learning_rate": 5e-06, "loss": 0.1212, "step": 6472 }, { "epoch": 1.2376673040152963, "grad_norm": 1.6612300872802734, "learning_rate": 5e-06, "loss": 0.1338, "step": 6473 }, { "epoch": 1.2378585086042064, "grad_norm": 1.7457292079925537, "learning_rate": 5e-06, "loss": 0.0619, "step": 6474 }, { "epoch": 1.2380497131931167, "grad_norm": 2.305072784423828, "learning_rate": 5e-06, "loss": 0.2619, "step": 6475 }, { "epoch": 1.2382409177820268, "grad_norm": 0.9911337494850159, "learning_rate": 5e-06, "loss": 0.0859, "step": 6476 }, { "epoch": 1.238432122370937, "grad_norm": 1.6552451848983765, "learning_rate": 5e-06, "loss": 0.1726, "step": 6477 }, { "epoch": 1.238623326959847, "grad_norm": 2.003192901611328, "learning_rate": 5e-06, "loss": 0.1219, "step": 6478 }, { "epoch": 1.238814531548757, "grad_norm": 1.888157844543457, "learning_rate": 5e-06, "loss": 0.0987, "step": 6479 }, { "epoch": 1.2390057361376674, "grad_norm": 1.2396515607833862, "learning_rate": 5e-06, "loss": 0.0398, "step": 6480 }, { "epoch": 1.2391969407265775, "grad_norm": 0.947607159614563, "learning_rate": 5e-06, "loss": 0.0804, "step": 6481 }, { "epoch": 1.2393881453154876, "grad_norm": 2.7749950885772705, "learning_rate": 5e-06, "loss": 0.1755, "step": 6482 }, { "epoch": 1.2395793499043977, "grad_norm": 2.7286288738250732, "learning_rate": 5e-06, "loss": 0.3329, "step": 6483 }, { "epoch": 1.2397705544933078, "grad_norm": 2.670870065689087, "learning_rate": 5e-06, "loss": 0.0679, "step": 6484 }, { "epoch": 1.239961759082218, "grad_norm": 0.9692270159721375, "learning_rate": 5e-06, "loss": 0.0543, "step": 6485 }, { "epoch": 1.2401529636711282, "grad_norm": 1.525709867477417, "learning_rate": 5e-06, "loss": 0.0621, "step": 6486 }, { "epoch": 1.2403441682600382, "grad_norm": 2.301687479019165, "learning_rate": 5e-06, "loss": 0.1212, "step": 6487 }, { "epoch": 1.2405353728489483, "grad_norm": 1.8523778915405273, "learning_rate": 5e-06, "loss": 0.1042, "step": 6488 }, { "epoch": 1.2407265774378584, "grad_norm": 1.0919690132141113, "learning_rate": 5e-06, "loss": 0.0533, "step": 6489 }, { "epoch": 1.2409177820267687, "grad_norm": 0.7931455373764038, "learning_rate": 5e-06, "loss": 0.0574, "step": 6490 }, { "epoch": 1.2411089866156788, "grad_norm": 0.8481590747833252, "learning_rate": 5e-06, "loss": 0.0492, "step": 6491 }, { "epoch": 1.241300191204589, "grad_norm": 1.4083970785140991, "learning_rate": 5e-06, "loss": 0.0413, "step": 6492 }, { "epoch": 1.241491395793499, "grad_norm": 0.9851438999176025, "learning_rate": 5e-06, "loss": 0.0501, "step": 6493 }, { "epoch": 1.241682600382409, "grad_norm": 2.721176862716675, "learning_rate": 5e-06, "loss": 0.4178, "step": 6494 }, { "epoch": 1.2418738049713194, "grad_norm": 1.2428650856018066, "learning_rate": 5e-06, "loss": 0.1112, "step": 6495 }, { "epoch": 1.2420650095602295, "grad_norm": 3.327238082885742, "learning_rate": 5e-06, "loss": 0.3819, "step": 6496 }, { "epoch": 1.2422562141491396, "grad_norm": 1.2107603549957275, "learning_rate": 5e-06, "loss": 0.0878, "step": 6497 }, { "epoch": 1.2424474187380496, "grad_norm": 1.4254592657089233, "learning_rate": 5e-06, "loss": 0.0726, "step": 6498 }, { "epoch": 1.2426386233269597, "grad_norm": 0.8053144216537476, "learning_rate": 5e-06, "loss": 0.0288, "step": 6499 }, { "epoch": 1.24282982791587, "grad_norm": 2.3608970642089844, "learning_rate": 5e-06, "loss": 0.2719, "step": 6500 }, { "epoch": 1.24282982791587, "eval_runtime": 801.4109, "eval_samples_per_second": 1.914, "eval_steps_per_second": 0.24, "step": 6500 } ], "logging_steps": 1.0, "max_steps": 26150, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.592210007442588e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }