diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18492 @@ +{ + "best_metric": 0.7225942611694336, + "best_model_checkpoint": "/data/sora/Projects/safe-sora/outputs/cost/reward-harmlessness/checkpoint-1216", + "epoch": 4.0, + "eval_steps": 76, + "global_step": 1520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "step": 0, + "train_accuracy": 0.4375 + }, + { + "epoch": 0.002631578947368421, + "grad_norm": 82.71702575683594, + "learning_rate": 4.347826086956522e-07, + "loss": 2.3203, + "step": 1 + }, + { + "epoch": 0.002631578947368421, + "step": 1, + "train_accuracy": 0.453125 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 33.55039978027344, + "learning_rate": 8.695652173913044e-07, + "loss": 2.1104, + "step": 2 + }, + { + "epoch": 0.005263157894736842, + "step": 2, + "train_accuracy": 0.40625 + }, + { + "epoch": 0.007894736842105263, + "grad_norm": 120.34129333496094, + "learning_rate": 1.3043478260869566e-06, + "loss": 2.4414, + "step": 3 + }, + { + "epoch": 0.007894736842105263, + "step": 3, + "train_accuracy": 0.3125 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 75.6687240600586, + "learning_rate": 1.7391304347826088e-06, + "loss": 2.4277, + "step": 4 + }, + { + "epoch": 0.010526315789473684, + "step": 4, + "train_accuracy": 0.46875 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 60.28215026855469, + "learning_rate": 2.173913043478261e-06, + "loss": 2.3184, + "step": 5 + }, + { + "epoch": 0.013157894736842105, + "step": 5, + "train_accuracy": 0.4375 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 86.08032989501953, + "learning_rate": 2.6086956521739132e-06, + "loss": 2.3105, + "step": 6 + }, + { + "epoch": 0.015789473684210527, + "step": 6, + "train_accuracy": 0.53125 + }, + { + "epoch": 0.018421052631578946, + "grad_norm": 243.07888793945312, + "learning_rate": 3.043478260869566e-06, + "loss": 2.499, + "step": 7 + }, + { + "epoch": 0.018421052631578946, + "step": 7, + "train_accuracy": 0.515625 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 176.44801330566406, + "learning_rate": 3.4782608695652175e-06, + "loss": 2.2891, + "step": 8 + }, + { + "epoch": 0.021052631578947368, + "step": 8, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.02368421052631579, + "grad_norm": 28.586137771606445, + "learning_rate": 3.91304347826087e-06, + "loss": 2.1494, + "step": 9 + }, + { + "epoch": 0.02368421052631579, + "step": 9, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 21.67037582397461, + "learning_rate": 4.347826086956522e-06, + "loss": 1.9541, + "step": 10 + }, + { + "epoch": 0.02631578947368421, + "step": 10, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.02894736842105263, + "grad_norm": 111.27401733398438, + "learning_rate": 4.782608695652174e-06, + "loss": 2.1279, + "step": 11 + }, + { + "epoch": 0.02894736842105263, + "step": 11, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 106.67390441894531, + "learning_rate": 5.2173913043478265e-06, + "loss": 2.0986, + "step": 12 + }, + { + "epoch": 0.031578947368421054, + "step": 12, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.034210526315789476, + "grad_norm": 58.74238586425781, + "learning_rate": 5.652173913043479e-06, + "loss": 1.8252, + "step": 13 + }, + { + "epoch": 0.034210526315789476, + "step": 13, + "train_accuracy": 0.625 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 78.9658203125, + "learning_rate": 6.086956521739132e-06, + "loss": 1.8164, + "step": 14 + }, + { + "epoch": 0.03684210526315789, + "step": 14, + "train_accuracy": 0.625 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 8.593488693237305, + "learning_rate": 6.521739130434783e-06, + "loss": 1.7021, + "step": 15 + }, + { + "epoch": 0.039473684210526314, + "step": 15, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 91.2029037475586, + "learning_rate": 6.956521739130435e-06, + "loss": 2.0029, + "step": 16 + }, + { + "epoch": 0.042105263157894736, + "step": 16, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.04473684210526316, + "grad_norm": 67.36499786376953, + "learning_rate": 7.391304347826087e-06, + "loss": 1.6465, + "step": 17 + }, + { + "epoch": 0.04473684210526316, + "step": 17, + "train_accuracy": 0.625 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 105.28680419921875, + "learning_rate": 7.82608695652174e-06, + "loss": 1.7539, + "step": 18 + }, + { + "epoch": 0.04736842105263158, + "step": 18, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.05, + "grad_norm": 9.923203468322754, + "learning_rate": 8.260869565217392e-06, + "loss": 1.7881, + "step": 19 + }, + { + "epoch": 0.05, + "step": 19, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 58.246089935302734, + "learning_rate": 8.695652173913044e-06, + "loss": 1.877, + "step": 20 + }, + { + "epoch": 0.05263157894736842, + "step": 20, + "train_accuracy": 0.546875 + }, + { + "epoch": 0.05526315789473684, + "grad_norm": 41.38032913208008, + "learning_rate": 9.130434782608697e-06, + "loss": 1.8271, + "step": 21 + }, + { + "epoch": 0.05526315789473684, + "step": 21, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 67.59843444824219, + "learning_rate": 9.565217391304349e-06, + "loss": 1.9014, + "step": 22 + }, + { + "epoch": 0.05789473684210526, + "step": 22, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.060526315789473685, + "grad_norm": 19.102436065673828, + "learning_rate": 1e-05, + "loss": 1.3848, + "step": 23 + }, + { + "epoch": 0.060526315789473685, + "step": 23, + "train_accuracy": 0.546875 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 9.836153984069824, + "learning_rate": 1.0434782608695653e-05, + "loss": 1.7988, + "step": 24 + }, + { + "epoch": 0.06315789473684211, + "step": 24, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 18.77242088317871, + "learning_rate": 1.0869565217391305e-05, + "loss": 1.5508, + "step": 25 + }, + { + "epoch": 0.06578947368421052, + "step": 25, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 33.33855438232422, + "learning_rate": 1.1304347826086957e-05, + "loss": 1.7383, + "step": 26 + }, + { + "epoch": 0.06842105263157895, + "step": 26, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.07105263157894737, + "grad_norm": 27.66554832458496, + "learning_rate": 1.1739130434782611e-05, + "loss": 1.5332, + "step": 27 + }, + { + "epoch": 0.07105263157894737, + "step": 27, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 47.21514892578125, + "learning_rate": 1.2173913043478263e-05, + "loss": 1.7275, + "step": 28 + }, + { + "epoch": 0.07368421052631578, + "step": 28, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.07631578947368421, + "grad_norm": 5.483724594116211, + "learning_rate": 1.2608695652173915e-05, + "loss": 1.6904, + "step": 29 + }, + { + "epoch": 0.07631578947368421, + "step": 29, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 20.393482208251953, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.6572, + "step": 30 + }, + { + "epoch": 0.07894736842105263, + "step": 30, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.08157894736842106, + "grad_norm": 47.08179473876953, + "learning_rate": 1.3478260869565218e-05, + "loss": 1.8018, + "step": 31 + }, + { + "epoch": 0.08157894736842106, + "step": 31, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 28.04905891418457, + "learning_rate": 1.391304347826087e-05, + "loss": 1.5576, + "step": 32 + }, + { + "epoch": 0.08421052631578947, + "step": 32, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.0868421052631579, + "grad_norm": 8.057256698608398, + "learning_rate": 1.4347826086956522e-05, + "loss": 1.5527, + "step": 33 + }, + { + "epoch": 0.0868421052631579, + "step": 33, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 4.107302665710449, + "learning_rate": 1.4782608695652174e-05, + "loss": 1.4678, + "step": 34 + }, + { + "epoch": 0.08947368421052632, + "step": 34, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 24.317855834960938, + "learning_rate": 1.5217391304347828e-05, + "loss": 1.5107, + "step": 35 + }, + { + "epoch": 0.09210526315789473, + "step": 35, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 56.14430236816406, + "learning_rate": 1.565217391304348e-05, + "loss": 1.7305, + "step": 36 + }, + { + "epoch": 0.09473684210526316, + "step": 36, + "train_accuracy": 0.75 + }, + { + "epoch": 0.09736842105263158, + "grad_norm": 5.203139305114746, + "learning_rate": 1.6086956521739132e-05, + "loss": 1.4668, + "step": 37 + }, + { + "epoch": 0.09736842105263158, + "step": 37, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.1, + "grad_norm": 16.80422019958496, + "learning_rate": 1.6521739130434785e-05, + "loss": 1.4746, + "step": 38 + }, + { + "epoch": 0.1, + "step": 38, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.10263157894736842, + "grad_norm": 32.82760238647461, + "learning_rate": 1.6956521739130437e-05, + "loss": 1.5713, + "step": 39 + }, + { + "epoch": 0.10263157894736842, + "step": 39, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 34.93852996826172, + "learning_rate": 1.739130434782609e-05, + "loss": 1.7637, + "step": 40 + }, + { + "epoch": 0.10526315789473684, + "step": 40, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.10789473684210527, + "grad_norm": 6.973869800567627, + "learning_rate": 1.782608695652174e-05, + "loss": 1.4702, + "step": 41 + }, + { + "epoch": 0.10789473684210527, + "step": 41, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 9.916731834411621, + "learning_rate": 1.8260869565217393e-05, + "loss": 1.666, + "step": 42 + }, + { + "epoch": 0.11052631578947368, + "step": 42, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.11315789473684211, + "grad_norm": 28.54862403869629, + "learning_rate": 1.8695652173913045e-05, + "loss": 1.7041, + "step": 43 + }, + { + "epoch": 0.11315789473684211, + "step": 43, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 13.393380165100098, + "learning_rate": 1.9130434782608697e-05, + "loss": 1.4229, + "step": 44 + }, + { + "epoch": 0.11578947368421053, + "step": 44, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 8.206025123596191, + "learning_rate": 1.956521739130435e-05, + "loss": 1.5791, + "step": 45 + }, + { + "epoch": 0.11842105263157894, + "step": 45, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 19.74668312072754, + "learning_rate": 2e-05, + "loss": 1.5811, + "step": 46 + }, + { + "epoch": 0.12105263157894737, + "step": 46, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.12368421052631579, + "grad_norm": 17.365779876708984, + "learning_rate": 1.9999977286993863e-05, + "loss": 1.6475, + "step": 47 + }, + { + "epoch": 0.12368421052631579, + "step": 47, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 15.396943092346191, + "learning_rate": 1.9999909148078624e-05, + "loss": 1.5986, + "step": 48 + }, + { + "epoch": 0.12631578947368421, + "step": 48, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.12894736842105264, + "grad_norm": 4.93903923034668, + "learning_rate": 1.9999795583563814e-05, + "loss": 1.4766, + "step": 49 + }, + { + "epoch": 0.12894736842105264, + "step": 49, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 24.193723678588867, + "learning_rate": 1.9999636593965306e-05, + "loss": 1.5166, + "step": 50 + }, + { + "epoch": 0.13157894736842105, + "step": 50, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.13421052631578947, + "grad_norm": 14.069121360778809, + "learning_rate": 1.999943218000533e-05, + "loss": 1.4102, + "step": 51 + }, + { + "epoch": 0.13421052631578947, + "step": 51, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 6.808143615722656, + "learning_rate": 1.999918234261246e-05, + "loss": 1.6113, + "step": 52 + }, + { + "epoch": 0.1368421052631579, + "step": 52, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.1394736842105263, + "grad_norm": 19.879804611206055, + "learning_rate": 1.9998887082921605e-05, + "loss": 1.5615, + "step": 53 + }, + { + "epoch": 0.1394736842105263, + "step": 53, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 11.96185302734375, + "learning_rate": 1.999854640227401e-05, + "loss": 1.3799, + "step": 54 + }, + { + "epoch": 0.14210526315789473, + "step": 54, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 15.333646774291992, + "learning_rate": 1.9998160302217254e-05, + "loss": 1.2549, + "step": 55 + }, + { + "epoch": 0.14473684210526316, + "step": 55, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 17.457860946655273, + "learning_rate": 1.9997728784505232e-05, + "loss": 1.6514, + "step": 56 + }, + { + "epoch": 0.14736842105263157, + "step": 56, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.15, + "grad_norm": 37.52303695678711, + "learning_rate": 1.999725185109816e-05, + "loss": 1.418, + "step": 57 + }, + { + "epoch": 0.15, + "step": 57, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 15.378766059875488, + "learning_rate": 1.999672950416256e-05, + "loss": 1.5928, + "step": 58 + }, + { + "epoch": 0.15263157894736842, + "step": 58, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.15526315789473685, + "grad_norm": 22.888885498046875, + "learning_rate": 1.9996161746071238e-05, + "loss": 1.3926, + "step": 59 + }, + { + "epoch": 0.15526315789473685, + "step": 59, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 60.397743225097656, + "learning_rate": 1.9995548579403296e-05, + "loss": 2.126, + "step": 60 + }, + { + "epoch": 0.15789473684210525, + "step": 60, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.16052631578947368, + "grad_norm": 5.263916969299316, + "learning_rate": 1.9994890006944105e-05, + "loss": 1.3906, + "step": 61 + }, + { + "epoch": 0.16052631578947368, + "step": 61, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 24.444305419921875, + "learning_rate": 1.99941860316853e-05, + "loss": 1.4248, + "step": 62 + }, + { + "epoch": 0.1631578947368421, + "step": 62, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.16578947368421051, + "grad_norm": 30.14607048034668, + "learning_rate": 1.999343665682476e-05, + "loss": 1.6807, + "step": 63 + }, + { + "epoch": 0.16578947368421051, + "step": 63, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 5.177918910980225, + "learning_rate": 1.999264188576659e-05, + "loss": 1.582, + "step": 64 + }, + { + "epoch": 0.16842105263157894, + "step": 64, + "train_accuracy": 0.75 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 3.7263214588165283, + "learning_rate": 1.9991801722121124e-05, + "loss": 1.4702, + "step": 65 + }, + { + "epoch": 0.17105263157894737, + "step": 65, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 14.105743408203125, + "learning_rate": 1.9990916169704886e-05, + "loss": 1.5986, + "step": 66 + }, + { + "epoch": 0.1736842105263158, + "step": 66, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.1763157894736842, + "grad_norm": 30.84016990661621, + "learning_rate": 1.9989985232540592e-05, + "loss": 1.7539, + "step": 67 + }, + { + "epoch": 0.1763157894736842, + "step": 67, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 5.946470737457275, + "learning_rate": 1.9989008914857115e-05, + "loss": 1.5361, + "step": 68 + }, + { + "epoch": 0.17894736842105263, + "step": 68, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.18157894736842106, + "grad_norm": 21.31287384033203, + "learning_rate": 1.998798722108948e-05, + "loss": 1.4482, + "step": 69 + }, + { + "epoch": 0.18157894736842106, + "step": 69, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 8.728236198425293, + "learning_rate": 1.998692015587883e-05, + "loss": 1.4541, + "step": 70 + }, + { + "epoch": 0.18421052631578946, + "step": 70, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.1868421052631579, + "grad_norm": 24.142559051513672, + "learning_rate": 1.998580772407242e-05, + "loss": 1.458, + "step": 71 + }, + { + "epoch": 0.1868421052631579, + "step": 71, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 8.70908260345459, + "learning_rate": 1.9984649930723586e-05, + "loss": 1.2969, + "step": 72 + }, + { + "epoch": 0.18947368421052632, + "step": 72, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.19210526315789472, + "grad_norm": 18.371227264404297, + "learning_rate": 1.9983446781091715e-05, + "loss": 1.5898, + "step": 73 + }, + { + "epoch": 0.19210526315789472, + "step": 73, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 21.964420318603516, + "learning_rate": 1.9982198280642244e-05, + "loss": 1.5381, + "step": 74 + }, + { + "epoch": 0.19473684210526315, + "step": 74, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 11.249500274658203, + "learning_rate": 1.9980904435046603e-05, + "loss": 1.5513, + "step": 75 + }, + { + "epoch": 0.19736842105263158, + "step": 75, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.2, + "grad_norm": 8.206984519958496, + "learning_rate": 1.9979565250182228e-05, + "loss": 1.5552, + "step": 76 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.6919280886650085, + "eval_max_score": 6.5, + "eval_min_score": -4.59375, + "eval_runtime": 151.8642, + "eval_samples_per_second": 18.681, + "eval_steps_per_second": 0.296, + "step": 76 + }, + { + "epoch": 0.2, + "step": 76, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.2026315789473684, + "grad_norm": 29.587980270385742, + "learning_rate": 1.997818073213249e-05, + "loss": 1.5024, + "step": 77 + }, + { + "epoch": 0.2026315789473684, + "step": 77, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 10.721640586853027, + "learning_rate": 1.9976750887186708e-05, + "loss": 1.4014, + "step": 78 + }, + { + "epoch": 0.20526315789473684, + "step": 78, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.20789473684210527, + "grad_norm": 8.728314399719238, + "learning_rate": 1.9975275721840105e-05, + "loss": 1.3784, + "step": 79 + }, + { + "epoch": 0.20789473684210527, + "step": 79, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 7.245014667510986, + "learning_rate": 1.9973755242793756e-05, + "loss": 1.5894, + "step": 80 + }, + { + "epoch": 0.21052631578947367, + "step": 80, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.2131578947368421, + "grad_norm": 4.6964850425720215, + "learning_rate": 1.9972189456954595e-05, + "loss": 1.4492, + "step": 81 + }, + { + "epoch": 0.2131578947368421, + "step": 81, + "train_accuracy": 0.75 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 5.349942207336426, + "learning_rate": 1.9970578371435367e-05, + "loss": 1.249, + "step": 82 + }, + { + "epoch": 0.21578947368421053, + "step": 82, + "train_accuracy": 0.625 + }, + { + "epoch": 0.21842105263157896, + "grad_norm": 4.20225715637207, + "learning_rate": 1.996892199355459e-05, + "loss": 1.541, + "step": 83 + }, + { + "epoch": 0.21842105263157896, + "step": 83, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 13.42335033416748, + "learning_rate": 1.996722033083652e-05, + "loss": 1.4404, + "step": 84 + }, + { + "epoch": 0.22105263157894736, + "step": 84, + "train_accuracy": 0.75 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 2.8606903553009033, + "learning_rate": 1.9965473391011144e-05, + "loss": 1.4155, + "step": 85 + }, + { + "epoch": 0.2236842105263158, + "step": 85, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 14.601333618164062, + "learning_rate": 1.9963681182014107e-05, + "loss": 1.3247, + "step": 86 + }, + { + "epoch": 0.22631578947368422, + "step": 86, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.22894736842105262, + "grad_norm": 11.857205390930176, + "learning_rate": 1.99618437119867e-05, + "loss": 1.4092, + "step": 87 + }, + { + "epoch": 0.22894736842105262, + "step": 87, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 3.8118960857391357, + "learning_rate": 1.9959960989275816e-05, + "loss": 1.2725, + "step": 88 + }, + { + "epoch": 0.23157894736842105, + "step": 88, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.23421052631578948, + "grad_norm": 24.30862045288086, + "learning_rate": 1.9958033022433916e-05, + "loss": 1.4478, + "step": 89 + }, + { + "epoch": 0.23421052631578948, + "step": 89, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.567001342773438, + "learning_rate": 1.9956059820218982e-05, + "loss": 1.415, + "step": 90 + }, + { + "epoch": 0.23684210526315788, + "step": 90, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.2394736842105263, + "grad_norm": 5.111862659454346, + "learning_rate": 1.9954041391594486e-05, + "loss": 1.6006, + "step": 91 + }, + { + "epoch": 0.2394736842105263, + "step": 91, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 19.393651962280273, + "learning_rate": 1.9951977745729343e-05, + "loss": 1.623, + "step": 92 + }, + { + "epoch": 0.24210526315789474, + "step": 92, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.24473684210526317, + "grad_norm": 28.0972843170166, + "learning_rate": 1.9949868891997877e-05, + "loss": 1.8125, + "step": 93 + }, + { + "epoch": 0.24473684210526317, + "step": 93, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 5.5879106521606445, + "learning_rate": 1.9947714839979765e-05, + "loss": 1.3486, + "step": 94 + }, + { + "epoch": 0.24736842105263157, + "step": 94, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.25, + "grad_norm": 6.477169513702393, + "learning_rate": 1.994551559946001e-05, + "loss": 1.3794, + "step": 95 + }, + { + "epoch": 0.25, + "step": 95, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 25.532249450683594, + "learning_rate": 1.9943271180428883e-05, + "loss": 1.4663, + "step": 96 + }, + { + "epoch": 0.25263157894736843, + "step": 96, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.25526315789473686, + "grad_norm": 4.028436660766602, + "learning_rate": 1.9940981593081884e-05, + "loss": 1.4131, + "step": 97 + }, + { + "epoch": 0.25526315789473686, + "step": 97, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 7.103154182434082, + "learning_rate": 1.9938646847819693e-05, + "loss": 1.208, + "step": 98 + }, + { + "epoch": 0.2578947368421053, + "step": 98, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.26052631578947366, + "grad_norm": 18.36396026611328, + "learning_rate": 1.9936266955248133e-05, + "loss": 1.46, + "step": 99 + }, + { + "epoch": 0.26052631578947366, + "step": 99, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 2.634291648864746, + "learning_rate": 1.9933841926178104e-05, + "loss": 1.4102, + "step": 100 + }, + { + "epoch": 0.2631578947368421, + "step": 100, + "train_accuracy": 0.84375 + }, + { + "epoch": 0.2657894736842105, + "grad_norm": 4.079046249389648, + "learning_rate": 1.9931371771625545e-05, + "loss": 1.1465, + "step": 101 + }, + { + "epoch": 0.2657894736842105, + "step": 101, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 4.469728469848633, + "learning_rate": 1.9928856502811383e-05, + "loss": 1.3628, + "step": 102 + }, + { + "epoch": 0.26842105263157895, + "step": 102, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.2710526315789474, + "grad_norm": 11.426676750183105, + "learning_rate": 1.992629613116148e-05, + "loss": 1.7158, + "step": 103 + }, + { + "epoch": 0.2710526315789474, + "step": 103, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 4.418765544891357, + "learning_rate": 1.992369066830659e-05, + "loss": 1.4443, + "step": 104 + }, + { + "epoch": 0.2736842105263158, + "step": 104, + "train_accuracy": 0.75 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 7.475268840789795, + "learning_rate": 1.992104012608228e-05, + "loss": 1.5762, + "step": 105 + }, + { + "epoch": 0.27631578947368424, + "step": 105, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 10.006223678588867, + "learning_rate": 1.991834451652892e-05, + "loss": 1.4473, + "step": 106 + }, + { + "epoch": 0.2789473684210526, + "step": 106, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.28157894736842104, + "grad_norm": 4.488539695739746, + "learning_rate": 1.9915603851891577e-05, + "loss": 1.3716, + "step": 107 + }, + { + "epoch": 0.28157894736842104, + "step": 107, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 31.295486450195312, + "learning_rate": 1.991281814462001e-05, + "loss": 1.6162, + "step": 108 + }, + { + "epoch": 0.28421052631578947, + "step": 108, + "train_accuracy": 0.75 + }, + { + "epoch": 0.2868421052631579, + "grad_norm": 4.2928290367126465, + "learning_rate": 1.9909987407368565e-05, + "loss": 1.2925, + "step": 109 + }, + { + "epoch": 0.2868421052631579, + "step": 109, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 33.315330505371094, + "learning_rate": 1.9907111652996156e-05, + "loss": 1.6572, + "step": 110 + }, + { + "epoch": 0.2894736842105263, + "step": 110, + "train_accuracy": 0.75 + }, + { + "epoch": 0.29210526315789476, + "grad_norm": 24.655488967895508, + "learning_rate": 1.9904190894566194e-05, + "loss": 1.4414, + "step": 111 + }, + { + "epoch": 0.29210526315789476, + "step": 111, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 15.197490692138672, + "learning_rate": 1.990122514534651e-05, + "loss": 1.522, + "step": 112 + }, + { + "epoch": 0.29473684210526313, + "step": 112, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.29736842105263156, + "grad_norm": 28.140966415405273, + "learning_rate": 1.989821441880933e-05, + "loss": 1.688, + "step": 113 + }, + { + "epoch": 0.29736842105263156, + "step": 113, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.3, + "grad_norm": 20.251832962036133, + "learning_rate": 1.9895158728631176e-05, + "loss": 1.4038, + "step": 114 + }, + { + "epoch": 0.3, + "step": 114, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 4.522186279296875, + "learning_rate": 1.9892058088692834e-05, + "loss": 1.584, + "step": 115 + }, + { + "epoch": 0.3026315789473684, + "step": 115, + "train_accuracy": 0.75 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 31.17378044128418, + "learning_rate": 1.9888912513079276e-05, + "loss": 1.7578, + "step": 116 + }, + { + "epoch": 0.30526315789473685, + "step": 116, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.3078947368421053, + "grad_norm": 25.056249618530273, + "learning_rate": 1.9885722016079594e-05, + "loss": 1.5654, + "step": 117 + }, + { + "epoch": 0.3078947368421053, + "step": 117, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 10.085057258605957, + "learning_rate": 1.9882486612186943e-05, + "loss": 1.3105, + "step": 118 + }, + { + "epoch": 0.3105263157894737, + "step": 118, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.3131578947368421, + "grad_norm": 11.358416557312012, + "learning_rate": 1.9879206316098477e-05, + "loss": 1.416, + "step": 119 + }, + { + "epoch": 0.3131578947368421, + "step": 119, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 13.83195972442627, + "learning_rate": 1.9875881142715272e-05, + "loss": 1.3457, + "step": 120 + }, + { + "epoch": 0.3157894736842105, + "step": 120, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.31842105263157894, + "grad_norm": 23.987102508544922, + "learning_rate": 1.987251110714226e-05, + "loss": 1.4287, + "step": 121 + }, + { + "epoch": 0.31842105263157894, + "step": 121, + "train_accuracy": 0.5625 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 19.9754581451416, + "learning_rate": 1.986909622468818e-05, + "loss": 1.6104, + "step": 122 + }, + { + "epoch": 0.32105263157894737, + "step": 122, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.3236842105263158, + "grad_norm": 10.121444702148438, + "learning_rate": 1.9865636510865466e-05, + "loss": 1.3672, + "step": 123 + }, + { + "epoch": 0.3236842105263158, + "step": 123, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 14.994632720947266, + "learning_rate": 1.986213198139023e-05, + "loss": 1.5693, + "step": 124 + }, + { + "epoch": 0.3263157894736842, + "step": 124, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 19.378555297851562, + "learning_rate": 1.9858582652182146e-05, + "loss": 1.5337, + "step": 125 + }, + { + "epoch": 0.32894736842105265, + "step": 125, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 27.150787353515625, + "learning_rate": 1.9854988539364403e-05, + "loss": 1.4336, + "step": 126 + }, + { + "epoch": 0.33157894736842103, + "step": 126, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.33421052631578946, + "grad_norm": 6.9245100021362305, + "learning_rate": 1.9851349659263624e-05, + "loss": 1.5127, + "step": 127 + }, + { + "epoch": 0.33421052631578946, + "step": 127, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 15.940958976745605, + "learning_rate": 1.9847666028409787e-05, + "loss": 1.3149, + "step": 128 + }, + { + "epoch": 0.3368421052631579, + "step": 128, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.3394736842105263, + "grad_norm": 27.410593032836914, + "learning_rate": 1.984393766353616e-05, + "loss": 1.5811, + "step": 129 + }, + { + "epoch": 0.3394736842105263, + "step": 129, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 19.065568923950195, + "learning_rate": 1.9840164581579217e-05, + "loss": 1.6299, + "step": 130 + }, + { + "epoch": 0.34210526315789475, + "step": 130, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.3447368421052632, + "grad_norm": 3.3670198917388916, + "learning_rate": 1.983634679967857e-05, + "loss": 1.4824, + "step": 131 + }, + { + "epoch": 0.3447368421052632, + "step": 131, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 20.537992477416992, + "learning_rate": 1.9832484335176866e-05, + "loss": 1.4312, + "step": 132 + }, + { + "epoch": 0.3473684210526316, + "step": 132, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.35, + "grad_norm": 18.157855987548828, + "learning_rate": 1.9828577205619757e-05, + "loss": 1.3296, + "step": 133 + }, + { + "epoch": 0.35, + "step": 133, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 10.166598320007324, + "learning_rate": 1.982462542875576e-05, + "loss": 1.3745, + "step": 134 + }, + { + "epoch": 0.3526315789473684, + "step": 134, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 7.348676681518555, + "learning_rate": 1.9820629022536234e-05, + "loss": 1.416, + "step": 135 + }, + { + "epoch": 0.35526315789473684, + "step": 135, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 5.667979717254639, + "learning_rate": 1.9816588005115255e-05, + "loss": 1.1729, + "step": 136 + }, + { + "epoch": 0.35789473684210527, + "step": 136, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.3605263157894737, + "grad_norm": 18.713518142700195, + "learning_rate": 1.9812502394849554e-05, + "loss": 1.665, + "step": 137 + }, + { + "epoch": 0.3605263157894737, + "step": 137, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 17.586336135864258, + "learning_rate": 1.9808372210298425e-05, + "loss": 1.645, + "step": 138 + }, + { + "epoch": 0.3631578947368421, + "step": 138, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.36578947368421055, + "grad_norm": 4.807186603546143, + "learning_rate": 1.980419747022365e-05, + "loss": 1.4297, + "step": 139 + }, + { + "epoch": 0.36578947368421055, + "step": 139, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 11.270689010620117, + "learning_rate": 1.9799978193589407e-05, + "loss": 1.5156, + "step": 140 + }, + { + "epoch": 0.3684210526315789, + "step": 140, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.37105263157894736, + "grad_norm": 6.3195343017578125, + "learning_rate": 1.9795714399562198e-05, + "loss": 1.4634, + "step": 141 + }, + { + "epoch": 0.37105263157894736, + "step": 141, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 8.104639053344727, + "learning_rate": 1.979140610751073e-05, + "loss": 1.2305, + "step": 142 + }, + { + "epoch": 0.3736842105263158, + "step": 142, + "train_accuracy": 0.625 + }, + { + "epoch": 0.3763157894736842, + "grad_norm": 6.707549571990967, + "learning_rate": 1.9787053337005855e-05, + "loss": 1.5098, + "step": 143 + }, + { + "epoch": 0.3763157894736842, + "step": 143, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 4.008877277374268, + "learning_rate": 1.9782656107820476e-05, + "loss": 1.2354, + "step": 144 + }, + { + "epoch": 0.37894736842105264, + "step": 144, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 16.74215316772461, + "learning_rate": 1.9778214439929453e-05, + "loss": 1.314, + "step": 145 + }, + { + "epoch": 0.3815789473684211, + "step": 145, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 6.672449111938477, + "learning_rate": 1.9773728353509512e-05, + "loss": 1.3169, + "step": 146 + }, + { + "epoch": 0.38421052631578945, + "step": 146, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.3868421052631579, + "grad_norm": 3.9568095207214355, + "learning_rate": 1.9769197868939153e-05, + "loss": 1.1396, + "step": 147 + }, + { + "epoch": 0.3868421052631579, + "step": 147, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 3.1683335304260254, + "learning_rate": 1.9764623006798554e-05, + "loss": 1.3745, + "step": 148 + }, + { + "epoch": 0.3894736842105263, + "step": 148, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.39210526315789473, + "grad_norm": 5.442292213439941, + "learning_rate": 1.9760003787869504e-05, + "loss": 1.4702, + "step": 149 + }, + { + "epoch": 0.39210526315789473, + "step": 149, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1763217449188232, + "learning_rate": 1.9755340233135265e-05, + "loss": 1.5659, + "step": 150 + }, + { + "epoch": 0.39473684210526316, + "step": 150, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.3973684210526316, + "grad_norm": 9.710025787353516, + "learning_rate": 1.9750632363780503e-05, + "loss": 1.5723, + "step": 151 + }, + { + "epoch": 0.3973684210526316, + "step": 151, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.4, + "grad_norm": 11.149826049804688, + "learning_rate": 1.9745880201191198e-05, + "loss": 1.6113, + "step": 152 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.6922805905342102, + "eval_max_score": 6.78125, + "eval_min_score": -4.21875, + "eval_runtime": 151.353, + "eval_samples_per_second": 18.744, + "eval_steps_per_second": 0.297, + "step": 152 + }, + { + "epoch": 0.4, + "step": 152, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.4026315789473684, + "grad_norm": 17.558015823364258, + "learning_rate": 1.9741083766954527e-05, + "loss": 1.4136, + "step": 153 + }, + { + "epoch": 0.4026315789473684, + "step": 153, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 6.4166178703308105, + "learning_rate": 1.9736243082858772e-05, + "loss": 1.4238, + "step": 154 + }, + { + "epoch": 0.4052631578947368, + "step": 154, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 16.045656204223633, + "learning_rate": 1.973135817089324e-05, + "loss": 1.3569, + "step": 155 + }, + { + "epoch": 0.40789473684210525, + "step": 155, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 11.161890983581543, + "learning_rate": 1.972642905324813e-05, + "loss": 1.7246, + "step": 156 + }, + { + "epoch": 0.4105263157894737, + "step": 156, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.4131578947368421, + "grad_norm": 10.395303726196289, + "learning_rate": 1.9721455752314468e-05, + "loss": 1.2827, + "step": 157 + }, + { + "epoch": 0.4131578947368421, + "step": 157, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 6.473245143890381, + "learning_rate": 1.9716438290683964e-05, + "loss": 1.667, + "step": 158 + }, + { + "epoch": 0.41578947368421054, + "step": 158, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.41842105263157897, + "grad_norm": 26.937177658081055, + "learning_rate": 1.9711376691148958e-05, + "loss": 1.5586, + "step": 159 + }, + { + "epoch": 0.41842105263157897, + "step": 159, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 8.536986351013184, + "learning_rate": 1.970627097670227e-05, + "loss": 1.4272, + "step": 160 + }, + { + "epoch": 0.42105263157894735, + "step": 160, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.4236842105263158, + "grad_norm": 6.249920845031738, + "learning_rate": 1.9701121170537125e-05, + "loss": 1.5605, + "step": 161 + }, + { + "epoch": 0.4236842105263158, + "step": 161, + "train_accuracy": 0.75 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 20.86353302001953, + "learning_rate": 1.9695927296047044e-05, + "loss": 1.3545, + "step": 162 + }, + { + "epoch": 0.4263157894736842, + "step": 162, + "train_accuracy": 0.625 + }, + { + "epoch": 0.42894736842105263, + "grad_norm": 18.73291778564453, + "learning_rate": 1.969068937682572e-05, + "loss": 1.5581, + "step": 163 + }, + { + "epoch": 0.42894736842105263, + "step": 163, + "train_accuracy": 0.75 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 3.9674601554870605, + "learning_rate": 1.968540743666694e-05, + "loss": 1.3638, + "step": 164 + }, + { + "epoch": 0.43157894736842106, + "step": 164, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 10.880136489868164, + "learning_rate": 1.9680081499564446e-05, + "loss": 1.248, + "step": 165 + }, + { + "epoch": 0.4342105263157895, + "step": 165, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 23.13611602783203, + "learning_rate": 1.967471158971185e-05, + "loss": 1.4531, + "step": 166 + }, + { + "epoch": 0.4368421052631579, + "step": 166, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.4394736842105263, + "grad_norm": 8.650835990905762, + "learning_rate": 1.966929773150251e-05, + "loss": 1.3369, + "step": 167 + }, + { + "epoch": 0.4394736842105263, + "step": 167, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 4.803102493286133, + "learning_rate": 1.966383994952942e-05, + "loss": 1.2661, + "step": 168 + }, + { + "epoch": 0.4421052631578947, + "step": 168, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.44473684210526315, + "grad_norm": 16.57624053955078, + "learning_rate": 1.9658338268585113e-05, + "loss": 1.4502, + "step": 169 + }, + { + "epoch": 0.44473684210526315, + "step": 169, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 11.971793174743652, + "learning_rate": 1.965279271366153e-05, + "loss": 1.3433, + "step": 170 + }, + { + "epoch": 0.4473684210526316, + "step": 170, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.45, + "grad_norm": 5.445674419403076, + "learning_rate": 1.9647203309949913e-05, + "loss": 1.5737, + "step": 171 + }, + { + "epoch": 0.45, + "step": 171, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 10.206953048706055, + "learning_rate": 1.96415700828407e-05, + "loss": 1.3804, + "step": 172 + }, + { + "epoch": 0.45263157894736844, + "step": 172, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.45526315789473687, + "grad_norm": 11.522626876831055, + "learning_rate": 1.963589305792339e-05, + "loss": 1.2734, + "step": 173 + }, + { + "epoch": 0.45526315789473687, + "step": 173, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 4.055401802062988, + "learning_rate": 1.9630172260986447e-05, + "loss": 1.4268, + "step": 174 + }, + { + "epoch": 0.45789473684210524, + "step": 174, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 8.796987533569336, + "learning_rate": 1.9624407718017165e-05, + "loss": 1.3555, + "step": 175 + }, + { + "epoch": 0.4605263157894737, + "step": 175, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 3.7710328102111816, + "learning_rate": 1.961859945520157e-05, + "loss": 1.2656, + "step": 176 + }, + { + "epoch": 0.4631578947368421, + "step": 176, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.46578947368421053, + "grad_norm": 5.011978626251221, + "learning_rate": 1.961274749892428e-05, + "loss": 1.2974, + "step": 177 + }, + { + "epoch": 0.46578947368421053, + "step": 177, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 6.948405742645264, + "learning_rate": 1.9606851875768404e-05, + "loss": 1.5273, + "step": 178 + }, + { + "epoch": 0.46842105263157896, + "step": 178, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.4710526315789474, + "grad_norm": 14.704864501953125, + "learning_rate": 1.96009126125154e-05, + "loss": 1.7246, + "step": 179 + }, + { + "epoch": 0.4710526315789474, + "step": 179, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 4.491285800933838, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.4385, + "step": 180 + }, + { + "epoch": 0.47368421052631576, + "step": 180, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.4763157894736842, + "grad_norm": 6.6391401290893555, + "learning_rate": 1.9588903273834954e-05, + "loss": 1.4521, + "step": 181 + }, + { + "epoch": 0.4763157894736842, + "step": 181, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 8.603846549987793, + "learning_rate": 1.9582833252961143e-05, + "loss": 1.334, + "step": 182 + }, + { + "epoch": 0.4789473684210526, + "step": 182, + "train_accuracy": 0.625 + }, + { + "epoch": 0.48157894736842105, + "grad_norm": 5.131749153137207, + "learning_rate": 1.9576719701097238e-05, + "loss": 1.2568, + "step": 183 + }, + { + "epoch": 0.48157894736842105, + "step": 183, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 4.089670658111572, + "learning_rate": 1.957056264601466e-05, + "loss": 1.394, + "step": 184 + }, + { + "epoch": 0.4842105263157895, + "step": 184, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 2.8140037059783936, + "learning_rate": 1.956436211568246e-05, + "loss": 1.4531, + "step": 185 + }, + { + "epoch": 0.4868421052631579, + "step": 185, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 19.234317779541016, + "learning_rate": 1.9558118138267166e-05, + "loss": 1.3413, + "step": 186 + }, + { + "epoch": 0.48947368421052634, + "step": 186, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.4921052631578947, + "grad_norm": 3.8546056747436523, + "learning_rate": 1.9551830742132684e-05, + "loss": 1.4771, + "step": 187 + }, + { + "epoch": 0.4921052631578947, + "step": 187, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 6.853395462036133, + "learning_rate": 1.9545499955840145e-05, + "loss": 1.3462, + "step": 188 + }, + { + "epoch": 0.49473684210526314, + "step": 188, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.49736842105263157, + "grad_norm": 2.805311918258667, + "learning_rate": 1.953912580814779e-05, + "loss": 1.3135, + "step": 189 + }, + { + "epoch": 0.49736842105263157, + "step": 189, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.5, + "grad_norm": 7.851466178894043, + "learning_rate": 1.953270832801083e-05, + "loss": 1.2954, + "step": 190 + }, + { + "epoch": 0.5, + "step": 190, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5026315789473684, + "grad_norm": 19.671802520751953, + "learning_rate": 1.9526247544581315e-05, + "loss": 1.3569, + "step": 191 + }, + { + "epoch": 0.5026315789473684, + "step": 191, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 4.735298156738281, + "learning_rate": 1.9519743487208008e-05, + "loss": 1.4355, + "step": 192 + }, + { + "epoch": 0.5052631578947369, + "step": 192, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.5078947368421053, + "grad_norm": 6.873162269592285, + "learning_rate": 1.9513196185436248e-05, + "loss": 1.439, + "step": 193 + }, + { + "epoch": 0.5078947368421053, + "step": 193, + "train_accuracy": 0.75 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 13.055280685424805, + "learning_rate": 1.9506605669007815e-05, + "loss": 1.584, + "step": 194 + }, + { + "epoch": 0.5105263157894737, + "step": 194, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 6.923583030700684, + "learning_rate": 1.94999719678608e-05, + "loss": 1.4561, + "step": 195 + }, + { + "epoch": 0.5131578947368421, + "step": 195, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 4.102768421173096, + "learning_rate": 1.9493295112129464e-05, + "loss": 1.6006, + "step": 196 + }, + { + "epoch": 0.5157894736842106, + "step": 196, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.5184210526315789, + "grad_norm": 9.221837997436523, + "learning_rate": 1.9486575132144095e-05, + "loss": 1.4302, + "step": 197 + }, + { + "epoch": 0.5184210526315789, + "step": 197, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 3.7125375270843506, + "learning_rate": 1.9479812058430886e-05, + "loss": 1.3184, + "step": 198 + }, + { + "epoch": 0.5210526315789473, + "step": 198, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5236842105263158, + "grad_norm": 6.4270243644714355, + "learning_rate": 1.9473005921711778e-05, + "loss": 1.3823, + "step": 199 + }, + { + "epoch": 0.5236842105263158, + "step": 199, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.891445159912109, + "learning_rate": 1.9466156752904344e-05, + "loss": 1.4551, + "step": 200 + }, + { + "epoch": 0.5263157894736842, + "step": 200, + "train_accuracy": 0.859375 + }, + { + "epoch": 0.5289473684210526, + "grad_norm": 10.10925006866455, + "learning_rate": 1.945926458312162e-05, + "loss": 1.2012, + "step": 201 + }, + { + "epoch": 0.5289473684210526, + "step": 201, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 17.743282318115234, + "learning_rate": 1.945232944367199e-05, + "loss": 1.3965, + "step": 202 + }, + { + "epoch": 0.531578947368421, + "step": 202, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.5342105263157895, + "grad_norm": 5.660726547241211, + "learning_rate": 1.9445351366059025e-05, + "loss": 1.3765, + "step": 203 + }, + { + "epoch": 0.5342105263157895, + "step": 203, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 17.128097534179688, + "learning_rate": 1.9438330381981348e-05, + "loss": 1.291, + "step": 204 + }, + { + "epoch": 0.5368421052631579, + "step": 204, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 5.735154151916504, + "learning_rate": 1.9431266523332488e-05, + "loss": 1.7031, + "step": 205 + }, + { + "epoch": 0.5394736842105263, + "step": 205, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 8.40402603149414, + "learning_rate": 1.9424159822200744e-05, + "loss": 1.3916, + "step": 206 + }, + { + "epoch": 0.5421052631578948, + "step": 206, + "train_accuracy": 0.828125 + }, + { + "epoch": 0.5447368421052632, + "grad_norm": 8.66818618774414, + "learning_rate": 1.941701031086902e-05, + "loss": 1.3877, + "step": 207 + }, + { + "epoch": 0.5447368421052632, + "step": 207, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 16.262784957885742, + "learning_rate": 1.9409818021814698e-05, + "loss": 1.4619, + "step": 208 + }, + { + "epoch": 0.5473684210526316, + "step": 208, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.55, + "grad_norm": 2.998375177383423, + "learning_rate": 1.9402582987709477e-05, + "loss": 1.3452, + "step": 209 + }, + { + "epoch": 0.55, + "step": 209, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 2.688655138015747, + "learning_rate": 1.9395305241419234e-05, + "loss": 1.3125, + "step": 210 + }, + { + "epoch": 0.5526315789473685, + "step": 210, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5552631578947368, + "grad_norm": 9.62772274017334, + "learning_rate": 1.9387984816003868e-05, + "loss": 1.3271, + "step": 211 + }, + { + "epoch": 0.5552631578947368, + "step": 211, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 4.155248165130615, + "learning_rate": 1.9380621744717144e-05, + "loss": 1.3545, + "step": 212 + }, + { + "epoch": 0.5578947368421052, + "step": 212, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.5605263157894737, + "grad_norm": 12.761597633361816, + "learning_rate": 1.9373216061006576e-05, + "loss": 1.4141, + "step": 213 + }, + { + "epoch": 0.5605263157894737, + "step": 213, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 7.922176837921143, + "learning_rate": 1.9365767798513216e-05, + "loss": 1.2588, + "step": 214 + }, + { + "epoch": 0.5631578947368421, + "step": 214, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 6.710723876953125, + "learning_rate": 1.9358276991071556e-05, + "loss": 1.3638, + "step": 215 + }, + { + "epoch": 0.5657894736842105, + "step": 215, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 3.7787835597991943, + "learning_rate": 1.935074367270935e-05, + "loss": 1.2544, + "step": 216 + }, + { + "epoch": 0.5684210526315789, + "step": 216, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.5710526315789474, + "grad_norm": 7.387803554534912, + "learning_rate": 1.9343167877647457e-05, + "loss": 1.3369, + "step": 217 + }, + { + "epoch": 0.5710526315789474, + "step": 217, + "train_accuracy": 0.875 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 3.0733940601348877, + "learning_rate": 1.9335549640299688e-05, + "loss": 1.2944, + "step": 218 + }, + { + "epoch": 0.5736842105263158, + "step": 218, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.5763157894736842, + "grad_norm": 16.344900131225586, + "learning_rate": 1.9327888995272667e-05, + "loss": 1.1758, + "step": 219 + }, + { + "epoch": 0.5763157894736842, + "step": 219, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 6.135222434997559, + "learning_rate": 1.9320185977365643e-05, + "loss": 1.3555, + "step": 220 + }, + { + "epoch": 0.5789473684210527, + "step": 220, + "train_accuracy": 0.75 + }, + { + "epoch": 0.5815789473684211, + "grad_norm": 5.324588298797607, + "learning_rate": 1.9312440621570355e-05, + "loss": 1.4307, + "step": 221 + }, + { + "epoch": 0.5815789473684211, + "step": 221, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 4.523622512817383, + "learning_rate": 1.9304652963070868e-05, + "loss": 1.2983, + "step": 222 + }, + { + "epoch": 0.5842105263157895, + "step": 222, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.5868421052631579, + "grad_norm": 4.101083278656006, + "learning_rate": 1.9296823037243406e-05, + "loss": 1.3501, + "step": 223 + }, + { + "epoch": 0.5868421052631579, + "step": 223, + "train_accuracy": 0.75 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 8.83919620513916, + "learning_rate": 1.9288950879656205e-05, + "loss": 1.2852, + "step": 224 + }, + { + "epoch": 0.5894736842105263, + "step": 224, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 9.160242080688477, + "learning_rate": 1.9281036526069333e-05, + "loss": 1.3491, + "step": 225 + }, + { + "epoch": 0.5921052631578947, + "step": 225, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 10.763931274414062, + "learning_rate": 1.927308001243454e-05, + "loss": 1.3047, + "step": 226 + }, + { + "epoch": 0.5947368421052631, + "step": 226, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.5973684210526315, + "grad_norm": 9.965821266174316, + "learning_rate": 1.92650813748951e-05, + "loss": 1.4458, + "step": 227 + }, + { + "epoch": 0.5973684210526315, + "step": 227, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6, + "grad_norm": 3.940614938735962, + "learning_rate": 1.9257040649785633e-05, + "loss": 1.4888, + "step": 228 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.7053225040435791, + "eval_max_score": 5.4375, + "eval_min_score": -5.0, + "eval_runtime": 151.3974, + "eval_samples_per_second": 18.739, + "eval_steps_per_second": 0.297, + "step": 228 + }, + { + "epoch": 0.6, + "step": 228, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.6026315789473684, + "grad_norm": 12.34426498413086, + "learning_rate": 1.9248957873631947e-05, + "loss": 1.4258, + "step": 229 + }, + { + "epoch": 0.6026315789473684, + "step": 229, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 13.6069917678833, + "learning_rate": 1.9240833083150864e-05, + "loss": 1.5156, + "step": 230 + }, + { + "epoch": 0.6052631578947368, + "step": 230, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.6078947368421053, + "grad_norm": 4.457098484039307, + "learning_rate": 1.9232666315250078e-05, + "loss": 1.3008, + "step": 231 + }, + { + "epoch": 0.6078947368421053, + "step": 231, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 6.309927463531494, + "learning_rate": 1.922445760702795e-05, + "loss": 1.3384, + "step": 232 + }, + { + "epoch": 0.6105263157894737, + "step": 232, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.6131578947368421, + "grad_norm": 7.830813407897949, + "learning_rate": 1.9216206995773373e-05, + "loss": 1.2866, + "step": 233 + }, + { + "epoch": 0.6131578947368421, + "step": 233, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 10.779923439025879, + "learning_rate": 1.9207914518965585e-05, + "loss": 1.4932, + "step": 234 + }, + { + "epoch": 0.6157894736842106, + "step": 234, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 2.9620766639709473, + "learning_rate": 1.9199580214274e-05, + "loss": 1.3242, + "step": 235 + }, + { + "epoch": 0.618421052631579, + "step": 235, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 8.38760757446289, + "learning_rate": 1.9191204119558034e-05, + "loss": 1.3672, + "step": 236 + }, + { + "epoch": 0.6210526315789474, + "step": 236, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.6236842105263158, + "grad_norm": 5.167934417724609, + "learning_rate": 1.9182786272866955e-05, + "loss": 1.3555, + "step": 237 + }, + { + "epoch": 0.6236842105263158, + "step": 237, + "train_accuracy": 0.75 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 12.282572746276855, + "learning_rate": 1.9174326712439674e-05, + "loss": 1.6484, + "step": 238 + }, + { + "epoch": 0.6263157894736842, + "step": 238, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.6289473684210526, + "grad_norm": 14.087172508239746, + "learning_rate": 1.916582547670461e-05, + "loss": 1.4004, + "step": 239 + }, + { + "epoch": 0.6289473684210526, + "step": 239, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 3.8323986530303955, + "learning_rate": 1.9157282604279482e-05, + "loss": 1.3779, + "step": 240 + }, + { + "epoch": 0.631578947368421, + "step": 240, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.6342105263157894, + "grad_norm": 9.426977157592773, + "learning_rate": 1.9148698133971156e-05, + "loss": 1.4463, + "step": 241 + }, + { + "epoch": 0.6342105263157894, + "step": 241, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 6.784728527069092, + "learning_rate": 1.914007210477545e-05, + "loss": 1.3521, + "step": 242 + }, + { + "epoch": 0.6368421052631579, + "step": 242, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6394736842105263, + "grad_norm": 7.3046650886535645, + "learning_rate": 1.913140455587698e-05, + "loss": 1.4111, + "step": 243 + }, + { + "epoch": 0.6394736842105263, + "step": 243, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 18.1398983001709, + "learning_rate": 1.9122695526648968e-05, + "loss": 1.5938, + "step": 244 + }, + { + "epoch": 0.6421052631578947, + "step": 244, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 4.717808246612549, + "learning_rate": 1.911394505665306e-05, + "loss": 1.3379, + "step": 245 + }, + { + "epoch": 0.6447368421052632, + "step": 245, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 5.125016212463379, + "learning_rate": 1.9105153185639142e-05, + "loss": 1.2539, + "step": 246 + }, + { + "epoch": 0.6473684210526316, + "step": 246, + "train_accuracy": 0.828125 + }, + { + "epoch": 0.65, + "grad_norm": 5.333531856536865, + "learning_rate": 1.9096319953545186e-05, + "loss": 1.374, + "step": 247 + }, + { + "epoch": 0.65, + "step": 247, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 11.1005220413208, + "learning_rate": 1.908744540049704e-05, + "loss": 1.4688, + "step": 248 + }, + { + "epoch": 0.6526315789473685, + "step": 248, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6552631578947369, + "grad_norm": 18.011417388916016, + "learning_rate": 1.9078529566808265e-05, + "loss": 1.5732, + "step": 249 + }, + { + "epoch": 0.6552631578947369, + "step": 249, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.390484809875488, + "learning_rate": 1.9069572492979933e-05, + "loss": 1.1738, + "step": 250 + }, + { + "epoch": 0.6578947368421053, + "step": 250, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6605263157894737, + "grad_norm": 21.22768211364746, + "learning_rate": 1.906057421970046e-05, + "loss": 1.5244, + "step": 251 + }, + { + "epoch": 0.6605263157894737, + "step": 251, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 3.9320061206817627, + "learning_rate": 1.9051534787845414e-05, + "loss": 1.5396, + "step": 252 + }, + { + "epoch": 0.6631578947368421, + "step": 252, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.6657894736842105, + "grad_norm": 16.053037643432617, + "learning_rate": 1.9042454238477326e-05, + "loss": 1.4902, + "step": 253 + }, + { + "epoch": 0.6657894736842105, + "step": 253, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 3.7904529571533203, + "learning_rate": 1.9033332612845516e-05, + "loss": 1.3354, + "step": 254 + }, + { + "epoch": 0.6684210526315789, + "step": 254, + "train_accuracy": 0.625 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 3.9885382652282715, + "learning_rate": 1.9024169952385887e-05, + "loss": 1.5967, + "step": 255 + }, + { + "epoch": 0.6710526315789473, + "step": 255, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 5.4799885749816895, + "learning_rate": 1.9014966298720752e-05, + "loss": 1.5703, + "step": 256 + }, + { + "epoch": 0.6736842105263158, + "step": 256, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6763157894736842, + "grad_norm": 3.2502963542938232, + "learning_rate": 1.9005721693658642e-05, + "loss": 1.2104, + "step": 257 + }, + { + "epoch": 0.6763157894736842, + "step": 257, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 10.936656951904297, + "learning_rate": 1.899643617919411e-05, + "loss": 1.519, + "step": 258 + }, + { + "epoch": 0.6789473684210526, + "step": 258, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.6815789473684211, + "grad_norm": 3.5721821784973145, + "learning_rate": 1.898710979750755e-05, + "loss": 1.3594, + "step": 259 + }, + { + "epoch": 0.6815789473684211, + "step": 259, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 2.8655872344970703, + "learning_rate": 1.8977742590964985e-05, + "loss": 1.3838, + "step": 260 + }, + { + "epoch": 0.6842105263157895, + "step": 260, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.6868421052631579, + "grad_norm": 4.112249374389648, + "learning_rate": 1.8968334602117906e-05, + "loss": 1.168, + "step": 261 + }, + { + "epoch": 0.6868421052631579, + "step": 261, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 23.64423370361328, + "learning_rate": 1.8958885873703055e-05, + "loss": 1.5669, + "step": 262 + }, + { + "epoch": 0.6894736842105263, + "step": 262, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.6921052631578948, + "grad_norm": 9.901530265808105, + "learning_rate": 1.8949396448642233e-05, + "loss": 1.1182, + "step": 263 + }, + { + "epoch": 0.6921052631578948, + "step": 263, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 8.222607612609863, + "learning_rate": 1.8939866370042116e-05, + "loss": 1.4614, + "step": 264 + }, + { + "epoch": 0.6947368421052632, + "step": 264, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 3.2499775886535645, + "learning_rate": 1.8930295681194054e-05, + "loss": 1.2705, + "step": 265 + }, + { + "epoch": 0.6973684210526315, + "step": 265, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.7, + "grad_norm": 21.207712173461914, + "learning_rate": 1.8920684425573865e-05, + "loss": 1.4531, + "step": 266 + }, + { + "epoch": 0.7, + "step": 266, + "train_accuracy": 0.75 + }, + { + "epoch": 0.7026315789473684, + "grad_norm": 3.7196009159088135, + "learning_rate": 1.8911032646841657e-05, + "loss": 1.3164, + "step": 267 + }, + { + "epoch": 0.7026315789473684, + "step": 267, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 3.2149417400360107, + "learning_rate": 1.8901340388841602e-05, + "loss": 1.251, + "step": 268 + }, + { + "epoch": 0.7052631578947368, + "step": 268, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7078947368421052, + "grad_norm": 5.346362113952637, + "learning_rate": 1.889160769560177e-05, + "loss": 1.5195, + "step": 269 + }, + { + "epoch": 0.7078947368421052, + "step": 269, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 19.827333450317383, + "learning_rate": 1.8881834611333906e-05, + "loss": 1.3813, + "step": 270 + }, + { + "epoch": 0.7105263157894737, + "step": 270, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.7131578947368421, + "grad_norm": 3.6186683177948, + "learning_rate": 1.887202118043323e-05, + "loss": 1.3633, + "step": 271 + }, + { + "epoch": 0.7131578947368421, + "step": 271, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 7.749443531036377, + "learning_rate": 1.886216744747825e-05, + "loss": 1.3379, + "step": 272 + }, + { + "epoch": 0.7157894736842105, + "step": 272, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.718421052631579, + "grad_norm": 18.11931610107422, + "learning_rate": 1.885227345723054e-05, + "loss": 1.7515, + "step": 273 + }, + { + "epoch": 0.718421052631579, + "step": 273, + "train_accuracy": 0.75 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 4.238664627075195, + "learning_rate": 1.8842339254634558e-05, + "loss": 1.3262, + "step": 274 + }, + { + "epoch": 0.7210526315789474, + "step": 274, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 5.915544033050537, + "learning_rate": 1.8832364884817424e-05, + "loss": 1.3804, + "step": 275 + }, + { + "epoch": 0.7236842105263158, + "step": 275, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 4.206833839416504, + "learning_rate": 1.8822350393088717e-05, + "loss": 1.1641, + "step": 276 + }, + { + "epoch": 0.7263157894736842, + "step": 276, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7289473684210527, + "grad_norm": 15.429242134094238, + "learning_rate": 1.8812295824940284e-05, + "loss": 1.3164, + "step": 277 + }, + { + "epoch": 0.7289473684210527, + "step": 277, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 4.126450061798096, + "learning_rate": 1.8802201226046023e-05, + "loss": 1.208, + "step": 278 + }, + { + "epoch": 0.7315789473684211, + "step": 278, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.7342105263157894, + "grad_norm": 5.984684944152832, + "learning_rate": 1.879206664226166e-05, + "loss": 1.333, + "step": 279 + }, + { + "epoch": 0.7342105263157894, + "step": 279, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 3.0508310794830322, + "learning_rate": 1.8781892119624578e-05, + "loss": 1.499, + "step": 280 + }, + { + "epoch": 0.7368421052631579, + "step": 280, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.7394736842105263, + "grad_norm": 5.437434673309326, + "learning_rate": 1.877167770435357e-05, + "loss": 1.3906, + "step": 281 + }, + { + "epoch": 0.7394736842105263, + "step": 281, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 14.601706504821777, + "learning_rate": 1.8761423442848655e-05, + "loss": 1.4321, + "step": 282 + }, + { + "epoch": 0.7421052631578947, + "step": 282, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7447368421052631, + "grad_norm": 5.646981716156006, + "learning_rate": 1.875112938169085e-05, + "loss": 1.4238, + "step": 283 + }, + { + "epoch": 0.7447368421052631, + "step": 283, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 3.7609148025512695, + "learning_rate": 1.874079556764197e-05, + "loss": 1.1875, + "step": 284 + }, + { + "epoch": 0.7473684210526316, + "step": 284, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.75, + "grad_norm": 3.995249032974243, + "learning_rate": 1.8730422047644417e-05, + "loss": 1.5713, + "step": 285 + }, + { + "epoch": 0.75, + "step": 285, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 13.330533027648926, + "learning_rate": 1.8720008868820954e-05, + "loss": 1.4902, + "step": 286 + }, + { + "epoch": 0.7526315789473684, + "step": 286, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.7552631578947369, + "grad_norm": 9.846002578735352, + "learning_rate": 1.8709556078474497e-05, + "loss": 1.2695, + "step": 287 + }, + { + "epoch": 0.7552631578947369, + "step": 287, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 2.9425392150878906, + "learning_rate": 1.8699063724087905e-05, + "loss": 1.1797, + "step": 288 + }, + { + "epoch": 0.7578947368421053, + "step": 288, + "train_accuracy": 0.625 + }, + { + "epoch": 0.7605263157894737, + "grad_norm": 9.827260971069336, + "learning_rate": 1.868853185332376e-05, + "loss": 1.5898, + "step": 289 + }, + { + "epoch": 0.7605263157894737, + "step": 289, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 6.012570858001709, + "learning_rate": 1.867796051402415e-05, + "loss": 1.3945, + "step": 290 + }, + { + "epoch": 0.7631578947368421, + "step": 290, + "train_accuracy": 0.75 + }, + { + "epoch": 0.7657894736842106, + "grad_norm": 12.465781211853027, + "learning_rate": 1.8667349754210456e-05, + "loss": 1.1934, + "step": 291 + }, + { + "epoch": 0.7657894736842106, + "step": 291, + "train_accuracy": 0.75 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 11.289711952209473, + "learning_rate": 1.865669962208313e-05, + "loss": 1.2749, + "step": 292 + }, + { + "epoch": 0.7684210526315789, + "step": 292, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.7710526315789473, + "grad_norm": 9.718608856201172, + "learning_rate": 1.864601016602147e-05, + "loss": 1.3511, + "step": 293 + }, + { + "epoch": 0.7710526315789473, + "step": 293, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 14.851384162902832, + "learning_rate": 1.863528143458342e-05, + "loss": 1.3115, + "step": 294 + }, + { + "epoch": 0.7736842105263158, + "step": 294, + "train_accuracy": 0.578125 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 27.674755096435547, + "learning_rate": 1.8624513476505316e-05, + "loss": 1.9219, + "step": 295 + }, + { + "epoch": 0.7763157894736842, + "step": 295, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 3.874488115310669, + "learning_rate": 1.861370634070171e-05, + "loss": 1.5938, + "step": 296 + }, + { + "epoch": 0.7789473684210526, + "step": 296, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.781578947368421, + "grad_norm": 12.267851829528809, + "learning_rate": 1.8602860076265107e-05, + "loss": 1.2485, + "step": 297 + }, + { + "epoch": 0.781578947368421, + "step": 297, + "train_accuracy": 0.75 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 10.122313499450684, + "learning_rate": 1.859197473246576e-05, + "loss": 1.4756, + "step": 298 + }, + { + "epoch": 0.7842105263157895, + "step": 298, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.7868421052631579, + "grad_norm": 11.50515079498291, + "learning_rate": 1.8581050358751444e-05, + "loss": 1.2676, + "step": 299 + }, + { + "epoch": 0.7868421052631579, + "step": 299, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 12.21772289276123, + "learning_rate": 1.857008700474723e-05, + "loss": 1.1899, + "step": 300 + }, + { + "epoch": 0.7894736842105263, + "step": 300, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.7921052631578948, + "grad_norm": 8.307805061340332, + "learning_rate": 1.8559084720255276e-05, + "loss": 1.418, + "step": 301 + }, + { + "epoch": 0.7921052631578948, + "step": 301, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 4.6956610679626465, + "learning_rate": 1.8548043555254556e-05, + "loss": 1.4775, + "step": 302 + }, + { + "epoch": 0.7947368421052632, + "step": 302, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.7973684210526316, + "grad_norm": 11.48013687133789, + "learning_rate": 1.853696355990069e-05, + "loss": 1.665, + "step": 303 + }, + { + "epoch": 0.7973684210526316, + "step": 303, + "train_accuracy": 0.828125 + }, + { + "epoch": 0.8, + "grad_norm": 15.654217720031738, + "learning_rate": 1.852584478452568e-05, + "loss": 1.2144, + "step": 304 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.7014451622962952, + "eval_max_score": 4.8125, + "eval_min_score": -5.15625, + "eval_runtime": 151.0436, + "eval_samples_per_second": 18.783, + "eval_steps_per_second": 0.298, + "step": 304 + }, + { + "epoch": 0.8, + "step": 304, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 6.716573238372803, + "learning_rate": 1.8514687279637677e-05, + "loss": 1.1758, + "step": 305 + }, + { + "epoch": 0.8026315789473685, + "step": 305, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 12.827051162719727, + "learning_rate": 1.8503491095920788e-05, + "loss": 1.5117, + "step": 306 + }, + { + "epoch": 0.8052631578947368, + "step": 306, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.8078947368421052, + "grad_norm": 15.274651527404785, + "learning_rate": 1.849225628423481e-05, + "loss": 1.377, + "step": 307 + }, + { + "epoch": 0.8078947368421052, + "step": 307, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 12.7550048828125, + "learning_rate": 1.8480982895615005e-05, + "loss": 1.2598, + "step": 308 + }, + { + "epoch": 0.8105263157894737, + "step": 308, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.8131578947368421, + "grad_norm": 12.460929870605469, + "learning_rate": 1.846967098127189e-05, + "loss": 1.3872, + "step": 309 + }, + { + "epoch": 0.8131578947368421, + "step": 309, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 21.332063674926758, + "learning_rate": 1.8458320592590976e-05, + "loss": 1.4429, + "step": 310 + }, + { + "epoch": 0.8157894736842105, + "step": 310, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.8184210526315789, + "grad_norm": 27.87535285949707, + "learning_rate": 1.8446931781132553e-05, + "loss": 1.7881, + "step": 311 + }, + { + "epoch": 0.8184210526315789, + "step": 311, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 3.5377941131591797, + "learning_rate": 1.843550459863145e-05, + "loss": 1.417, + "step": 312 + }, + { + "epoch": 0.8210526315789474, + "step": 312, + "train_accuracy": 0.5625 + }, + { + "epoch": 0.8236842105263158, + "grad_norm": 21.97355842590332, + "learning_rate": 1.8424039096996804e-05, + "loss": 1.6807, + "step": 313 + }, + { + "epoch": 0.8236842105263158, + "step": 313, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 27.53736686706543, + "learning_rate": 1.8412535328311813e-05, + "loss": 1.5273, + "step": 314 + }, + { + "epoch": 0.8263157894736842, + "step": 314, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 15.879623413085938, + "learning_rate": 1.8400993344833513e-05, + "loss": 1.3857, + "step": 315 + }, + { + "epoch": 0.8289473684210527, + "step": 315, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 4.012557029724121, + "learning_rate": 1.8389413198992528e-05, + "loss": 1.4648, + "step": 316 + }, + { + "epoch": 0.8315789473684211, + "step": 316, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.8342105263157895, + "grad_norm": 11.08835220336914, + "learning_rate": 1.8377794943392848e-05, + "loss": 1.3896, + "step": 317 + }, + { + "epoch": 0.8342105263157895, + "step": 317, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 24.538671493530273, + "learning_rate": 1.8366138630811573e-05, + "loss": 1.4434, + "step": 318 + }, + { + "epoch": 0.8368421052631579, + "step": 318, + "train_accuracy": 0.625 + }, + { + "epoch": 0.8394736842105263, + "grad_norm": 29.274654388427734, + "learning_rate": 1.835444431419868e-05, + "loss": 1.7217, + "step": 319 + }, + { + "epoch": 0.8394736842105263, + "step": 319, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 12.992687225341797, + "learning_rate": 1.834271204667679e-05, + "loss": 1.459, + "step": 320 + }, + { + "epoch": 0.8421052631578947, + "step": 320, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.8447368421052631, + "grad_norm": 14.843409538269043, + "learning_rate": 1.8330941881540917e-05, + "loss": 1.4219, + "step": 321 + }, + { + "epoch": 0.8447368421052631, + "step": 321, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 35.8585319519043, + "learning_rate": 1.8319133872258224e-05, + "loss": 1.7939, + "step": 322 + }, + { + "epoch": 0.8473684210526315, + "step": 322, + "train_accuracy": 0.609375 + }, + { + "epoch": 0.85, + "grad_norm": 30.27836036682129, + "learning_rate": 1.830728807246779e-05, + "loss": 1.4526, + "step": 323 + }, + { + "epoch": 0.85, + "step": 323, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 13.607982635498047, + "learning_rate": 1.8295404535980357e-05, + "loss": 1.4277, + "step": 324 + }, + { + "epoch": 0.8526315789473684, + "step": 324, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 2.9818201065063477, + "learning_rate": 1.8283483316778097e-05, + "loss": 1.2144, + "step": 325 + }, + { + "epoch": 0.8552631578947368, + "step": 325, + "train_accuracy": 0.625 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 26.039756774902344, + "learning_rate": 1.827152446901435e-05, + "loss": 1.6626, + "step": 326 + }, + { + "epoch": 0.8578947368421053, + "step": 326, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.8605263157894737, + "grad_norm": 34.77094268798828, + "learning_rate": 1.82595280470134e-05, + "loss": 1.8359, + "step": 327 + }, + { + "epoch": 0.8605263157894737, + "step": 327, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 22.246620178222656, + "learning_rate": 1.8247494105270198e-05, + "loss": 1.5605, + "step": 328 + }, + { + "epoch": 0.8631578947368421, + "step": 328, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.8657894736842106, + "grad_norm": 3.848651885986328, + "learning_rate": 1.8235422698450153e-05, + "loss": 1.3169, + "step": 329 + }, + { + "epoch": 0.8657894736842106, + "step": 329, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 15.153877258300781, + "learning_rate": 1.8223313881388845e-05, + "loss": 1.5996, + "step": 330 + }, + { + "epoch": 0.868421052631579, + "step": 330, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.8710526315789474, + "grad_norm": 9.871405601501465, + "learning_rate": 1.8211167709091805e-05, + "loss": 1.2954, + "step": 331 + }, + { + "epoch": 0.8710526315789474, + "step": 331, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 30.780324935913086, + "learning_rate": 1.8198984236734246e-05, + "loss": 1.7251, + "step": 332 + }, + { + "epoch": 0.8736842105263158, + "step": 332, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.8763157894736842, + "grad_norm": 21.378108978271484, + "learning_rate": 1.818676351966083e-05, + "loss": 1.5273, + "step": 333 + }, + { + "epoch": 0.8763157894736842, + "step": 333, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 5.893415451049805, + "learning_rate": 1.81745056133854e-05, + "loss": 1.3877, + "step": 334 + }, + { + "epoch": 0.8789473684210526, + "step": 334, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 12.588119506835938, + "learning_rate": 1.8162210573590733e-05, + "loss": 1.4844, + "step": 335 + }, + { + "epoch": 0.881578947368421, + "step": 335, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 19.006074905395508, + "learning_rate": 1.8149878456128296e-05, + "loss": 1.4629, + "step": 336 + }, + { + "epoch": 0.8842105263157894, + "step": 336, + "train_accuracy": 0.75 + }, + { + "epoch": 0.8868421052631579, + "grad_norm": 15.575494766235352, + "learning_rate": 1.8137509317017976e-05, + "loss": 1.5322, + "step": 337 + }, + { + "epoch": 0.8868421052631579, + "step": 337, + "train_accuracy": 0.53125 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 8.873583793640137, + "learning_rate": 1.8125103212447842e-05, + "loss": 1.6191, + "step": 338 + }, + { + "epoch": 0.8894736842105263, + "step": 338, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.8921052631578947, + "grad_norm": 3.71246337890625, + "learning_rate": 1.8112660198773883e-05, + "loss": 1.4995, + "step": 339 + }, + { + "epoch": 0.8921052631578947, + "step": 339, + "train_accuracy": 0.75 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 10.060272216796875, + "learning_rate": 1.8100180332519746e-05, + "loss": 1.2642, + "step": 340 + }, + { + "epoch": 0.8947368421052632, + "step": 340, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.8973684210526316, + "grad_norm": 15.318142890930176, + "learning_rate": 1.8087663670376483e-05, + "loss": 1.5645, + "step": 341 + }, + { + "epoch": 0.8973684210526316, + "step": 341, + "train_accuracy": 0.640625 + }, + { + "epoch": 0.9, + "grad_norm": 18.948949813842773, + "learning_rate": 1.80751102692023e-05, + "loss": 1.6025, + "step": 342 + }, + { + "epoch": 0.9, + "step": 342, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.9026315789473685, + "grad_norm": 5.428231716156006, + "learning_rate": 1.80625201860223e-05, + "loss": 1.4634, + "step": 343 + }, + { + "epoch": 0.9026315789473685, + "step": 343, + "train_accuracy": 0.78125 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 4.9712934494018555, + "learning_rate": 1.8049893478028203e-05, + "loss": 1.5166, + "step": 344 + }, + { + "epoch": 0.9052631578947369, + "step": 344, + "train_accuracy": 0.828125 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 15.7850341796875, + "learning_rate": 1.803723020257811e-05, + "loss": 1.4326, + "step": 345 + }, + { + "epoch": 0.9078947368421053, + "step": 345, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 14.186681747436523, + "learning_rate": 1.8024530417196228e-05, + "loss": 1.5503, + "step": 346 + }, + { + "epoch": 0.9105263157894737, + "step": 346, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.9131578947368421, + "grad_norm": 2.5712296962738037, + "learning_rate": 1.8011794179572628e-05, + "loss": 1.2954, + "step": 347 + }, + { + "epoch": 0.9131578947368421, + "step": 347, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 3.0300934314727783, + "learning_rate": 1.7999021547562943e-05, + "loss": 1.4614, + "step": 348 + }, + { + "epoch": 0.9157894736842105, + "step": 348, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9184210526315789, + "grad_norm": 11.72201156616211, + "learning_rate": 1.7986212579188163e-05, + "loss": 1.312, + "step": 349 + }, + { + "epoch": 0.9184210526315789, + "step": 349, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 2.3370659351348877, + "learning_rate": 1.7973367332634314e-05, + "loss": 1.3076, + "step": 350 + }, + { + "epoch": 0.9210526315789473, + "step": 350, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.9236842105263158, + "grad_norm": 7.042386531829834, + "learning_rate": 1.796048586625223e-05, + "loss": 1.2827, + "step": 351 + }, + { + "epoch": 0.9236842105263158, + "step": 351, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 3.6882100105285645, + "learning_rate": 1.7947568238557282e-05, + "loss": 1.6367, + "step": 352 + }, + { + "epoch": 0.9263157894736842, + "step": 352, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.9289473684210526, + "grad_norm": 2.304743766784668, + "learning_rate": 1.793461450822909e-05, + "loss": 1.3926, + "step": 353 + }, + { + "epoch": 0.9289473684210526, + "step": 353, + "train_accuracy": 0.75 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 8.389480590820312, + "learning_rate": 1.7921624734111292e-05, + "loss": 1.3057, + "step": 354 + }, + { + "epoch": 0.9315789473684211, + "step": 354, + "train_accuracy": 0.8125 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 7.985996246337891, + "learning_rate": 1.7908598975211256e-05, + "loss": 1.3267, + "step": 355 + }, + { + "epoch": 0.9342105263157895, + "step": 355, + "train_accuracy": 0.59375 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 9.184244155883789, + "learning_rate": 1.7895537290699806e-05, + "loss": 1.4209, + "step": 356 + }, + { + "epoch": 0.9368421052631579, + "step": 356, + "train_accuracy": 0.796875 + }, + { + "epoch": 0.9394736842105263, + "grad_norm": 13.210733413696289, + "learning_rate": 1.7882439739910964e-05, + "loss": 1.3384, + "step": 357 + }, + { + "epoch": 0.9394736842105263, + "step": 357, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 7.19890832901001, + "learning_rate": 1.7869306382341682e-05, + "loss": 1.4727, + "step": 358 + }, + { + "epoch": 0.9421052631578948, + "step": 358, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.9447368421052632, + "grad_norm": 4.545177936553955, + "learning_rate": 1.7856137277651567e-05, + "loss": 1.395, + "step": 359 + }, + { + "epoch": 0.9447368421052632, + "step": 359, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 14.59473991394043, + "learning_rate": 1.784293248566261e-05, + "loss": 1.3652, + "step": 360 + }, + { + "epoch": 0.9473684210526315, + "step": 360, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.95, + "grad_norm": 3.9617745876312256, + "learning_rate": 1.7829692066358914e-05, + "loss": 1.2046, + "step": 361 + }, + { + "epoch": 0.95, + "step": 361, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 12.78911018371582, + "learning_rate": 1.7816416079886427e-05, + "loss": 1.4561, + "step": 362 + }, + { + "epoch": 0.9526315789473684, + "step": 362, + "train_accuracy": 0.75 + }, + { + "epoch": 0.9552631578947368, + "grad_norm": 2.799234628677368, + "learning_rate": 1.780310458655266e-05, + "loss": 1.2793, + "step": 363 + }, + { + "epoch": 0.9552631578947368, + "step": 363, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 4.567415714263916, + "learning_rate": 1.7789757646826416e-05, + "loss": 1.5762, + "step": 364 + }, + { + "epoch": 0.9578947368421052, + "step": 364, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 2.880171775817871, + "learning_rate": 1.7776375321337523e-05, + "loss": 1.4688, + "step": 365 + }, + { + "epoch": 0.9605263157894737, + "step": 365, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 4.474959373474121, + "learning_rate": 1.7762957670876547e-05, + "loss": 1.25, + "step": 366 + }, + { + "epoch": 0.9631578947368421, + "step": 366, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.9657894736842105, + "grad_norm": 4.31421422958374, + "learning_rate": 1.7749504756394528e-05, + "loss": 1.5176, + "step": 367 + }, + { + "epoch": 0.9657894736842105, + "step": 367, + "train_accuracy": 0.625 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 3.9677927494049072, + "learning_rate": 1.7736016639002683e-05, + "loss": 1.3076, + "step": 368 + }, + { + "epoch": 0.968421052631579, + "step": 368, + "train_accuracy": 0.671875 + }, + { + "epoch": 0.9710526315789474, + "grad_norm": 2.26666522026062, + "learning_rate": 1.7722493379972163e-05, + "loss": 1.4307, + "step": 369 + }, + { + "epoch": 0.9710526315789474, + "step": 369, + "train_accuracy": 0.703125 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 6.098262310028076, + "learning_rate": 1.770893504073373e-05, + "loss": 1.4111, + "step": 370 + }, + { + "epoch": 0.9736842105263158, + "step": 370, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.9763157894736842, + "grad_norm": 8.281463623046875, + "learning_rate": 1.769534168287752e-05, + "loss": 1.4824, + "step": 371 + }, + { + "epoch": 0.9763157894736842, + "step": 371, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 6.954835891723633, + "learning_rate": 1.7681713368152733e-05, + "loss": 1.4072, + "step": 372 + }, + { + "epoch": 0.9789473684210527, + "step": 372, + "train_accuracy": 0.6875 + }, + { + "epoch": 0.9815789473684211, + "grad_norm": 5.460451126098633, + "learning_rate": 1.7668050158467367e-05, + "loss": 1.5469, + "step": 373 + }, + { + "epoch": 0.9815789473684211, + "step": 373, + "train_accuracy": 0.765625 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 2.4143896102905273, + "learning_rate": 1.765435211588794e-05, + "loss": 1.2954, + "step": 374 + }, + { + "epoch": 0.9842105263157894, + "step": 374, + "train_accuracy": 0.71875 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 3.317214012145996, + "learning_rate": 1.7640619302639194e-05, + "loss": 1.2959, + "step": 375 + }, + { + "epoch": 0.9868421052631579, + "step": 375, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 10.918193817138672, + "learning_rate": 1.762685178110382e-05, + "loss": 1.5464, + "step": 376 + }, + { + "epoch": 0.9894736842105263, + "step": 376, + "train_accuracy": 0.734375 + }, + { + "epoch": 0.9921052631578947, + "grad_norm": 5.174256801605225, + "learning_rate": 1.7613049613822188e-05, + "loss": 1.1704, + "step": 377 + }, + { + "epoch": 0.9921052631578947, + "step": 377, + "train_accuracy": 0.65625 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 4.161032199859619, + "learning_rate": 1.759921286349203e-05, + "loss": 1.2808, + "step": 378 + }, + { + "epoch": 0.9947368421052631, + "step": 378, + "train_accuracy": 0.625 + }, + { + "epoch": 0.9973684210526316, + "grad_norm": 8.024152755737305, + "learning_rate": 1.7585341592968188e-05, + "loss": 1.1694, + "step": 379 + }, + { + "epoch": 0.9973684210526316, + "step": 379, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.0, + "grad_norm": 6.739095687866211, + "learning_rate": 1.7571435865262314e-05, + "loss": 1.3779, + "step": 380 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.7010927200317383, + "eval_max_score": 5.375, + "eval_min_score": -5.71875, + "eval_runtime": 151.0289, + "eval_samples_per_second": 18.784, + "eval_steps_per_second": 0.298, + "step": 380 + }, + { + "epoch": 1.0, + "step": 380, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.0026315789473683, + "grad_norm": 4.438934326171875, + "learning_rate": 1.7557495743542586e-05, + "loss": 1.0659, + "step": 381 + }, + { + "epoch": 1.0026315789473683, + "step": 381, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.0052631578947369, + "grad_norm": 8.325281143188477, + "learning_rate": 1.7543521291133413e-05, + "loss": 1.2104, + "step": 382 + }, + { + "epoch": 1.0052631578947369, + "step": 382, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.0078947368421052, + "grad_norm": 4.250606536865234, + "learning_rate": 1.752951257151516e-05, + "loss": 1.2192, + "step": 383 + }, + { + "epoch": 1.0078947368421052, + "step": 383, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 8.005126953125, + "learning_rate": 1.751546964832386e-05, + "loss": 1.2773, + "step": 384 + }, + { + "epoch": 1.0105263157894737, + "step": 384, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 3.093712329864502, + "learning_rate": 1.7501392585350903e-05, + "loss": 1.1494, + "step": 385 + }, + { + "epoch": 1.013157894736842, + "step": 385, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.0157894736842106, + "grad_norm": 4.209375381469727, + "learning_rate": 1.7487281446542782e-05, + "loss": 1.1475, + "step": 386 + }, + { + "epoch": 1.0157894736842106, + "step": 386, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.018421052631579, + "grad_norm": 11.179862022399902, + "learning_rate": 1.747313629600077e-05, + "loss": 1.4062, + "step": 387 + }, + { + "epoch": 1.018421052631579, + "step": 387, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 2.9365897178649902, + "learning_rate": 1.745895719798065e-05, + "loss": 1.2676, + "step": 388 + }, + { + "epoch": 1.0210526315789474, + "step": 388, + "train_accuracy": 0.859375 + }, + { + "epoch": 1.0236842105263158, + "grad_norm": 6.062369346618652, + "learning_rate": 1.74447442168924e-05, + "loss": 1.1021, + "step": 389 + }, + { + "epoch": 1.0236842105263158, + "step": 389, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 4.148881435394287, + "learning_rate": 1.743049741729993e-05, + "loss": 1.2554, + "step": 390 + }, + { + "epoch": 1.0263157894736843, + "step": 390, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.0289473684210526, + "grad_norm": 4.890829563140869, + "learning_rate": 1.741621686392077e-05, + "loss": 1.1567, + "step": 391 + }, + { + "epoch": 1.0289473684210526, + "step": 391, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 5.793519973754883, + "learning_rate": 1.740190262162578e-05, + "loss": 1.2739, + "step": 392 + }, + { + "epoch": 1.0315789473684212, + "step": 392, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.0342105263157895, + "grad_norm": 9.222723960876465, + "learning_rate": 1.7387554755438857e-05, + "loss": 1.3457, + "step": 393 + }, + { + "epoch": 1.0342105263157895, + "step": 393, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.0368421052631578, + "grad_norm": 6.217665195465088, + "learning_rate": 1.7373173330536628e-05, + "loss": 1.4707, + "step": 394 + }, + { + "epoch": 1.0368421052631578, + "step": 394, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 3.183716058731079, + "learning_rate": 1.7358758412248176e-05, + "loss": 1.1362, + "step": 395 + }, + { + "epoch": 1.0394736842105263, + "step": 395, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 8.913907051086426, + "learning_rate": 1.734431006605473e-05, + "loss": 1.3467, + "step": 396 + }, + { + "epoch": 1.0421052631578946, + "step": 396, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.0447368421052632, + "grad_norm": 14.859869003295898, + "learning_rate": 1.7329828357589356e-05, + "loss": 1.3984, + "step": 397 + }, + { + "epoch": 1.0447368421052632, + "step": 397, + "train_accuracy": 0.75 + }, + { + "epoch": 1.0473684210526315, + "grad_norm": 6.55880880355835, + "learning_rate": 1.731531335263669e-05, + "loss": 1.27, + "step": 398 + }, + { + "epoch": 1.0473684210526315, + "step": 398, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.05, + "grad_norm": 3.913179874420166, + "learning_rate": 1.7300765117132608e-05, + "loss": 1.2803, + "step": 399 + }, + { + "epoch": 1.05, + "step": 399, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 15.469132423400879, + "learning_rate": 1.7286183717163942e-05, + "loss": 1.2852, + "step": 400 + }, + { + "epoch": 1.0526315789473684, + "step": 400, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.055263157894737, + "grad_norm": 17.792240142822266, + "learning_rate": 1.7271569218968175e-05, + "loss": 1.4697, + "step": 401 + }, + { + "epoch": 1.055263157894737, + "step": 401, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.0578947368421052, + "grad_norm": 8.638894081115723, + "learning_rate": 1.7256921688933145e-05, + "loss": 1.5859, + "step": 402 + }, + { + "epoch": 1.0578947368421052, + "step": 402, + "train_accuracy": 0.75 + }, + { + "epoch": 1.0605263157894738, + "grad_norm": 4.139883995056152, + "learning_rate": 1.7242241193596747e-05, + "loss": 1.2485, + "step": 403 + }, + { + "epoch": 1.0605263157894738, + "step": 403, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 5.416935443878174, + "learning_rate": 1.722752779964661e-05, + "loss": 1.2969, + "step": 404 + }, + { + "epoch": 1.063157894736842, + "step": 404, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 18.150854110717773, + "learning_rate": 1.7212781573919818e-05, + "loss": 1.2886, + "step": 405 + }, + { + "epoch": 1.0657894736842106, + "step": 405, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.068421052631579, + "grad_norm": 17.88170623779297, + "learning_rate": 1.7198002583402588e-05, + "loss": 1.4951, + "step": 406 + }, + { + "epoch": 1.068421052631579, + "step": 406, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.0710526315789473, + "grad_norm": 7.877701759338379, + "learning_rate": 1.718319089522999e-05, + "loss": 1.2217, + "step": 407 + }, + { + "epoch": 1.0710526315789473, + "step": 407, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 3.5272531509399414, + "learning_rate": 1.7168346576685616e-05, + "loss": 1.1587, + "step": 408 + }, + { + "epoch": 1.0736842105263158, + "step": 408, + "train_accuracy": 0.75 + }, + { + "epoch": 1.0763157894736841, + "grad_norm": 4.579117298126221, + "learning_rate": 1.7153469695201278e-05, + "loss": 1.2354, + "step": 409 + }, + { + "epoch": 1.0763157894736841, + "step": 409, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 14.453102111816406, + "learning_rate": 1.713856031835672e-05, + "loss": 1.374, + "step": 410 + }, + { + "epoch": 1.0789473684210527, + "step": 410, + "train_accuracy": 0.875 + }, + { + "epoch": 1.081578947368421, + "grad_norm": 7.408053874969482, + "learning_rate": 1.7123618513879296e-05, + "loss": 1.0933, + "step": 411 + }, + { + "epoch": 1.081578947368421, + "step": 411, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 5.875626564025879, + "learning_rate": 1.710864434964367e-05, + "loss": 1.127, + "step": 412 + }, + { + "epoch": 1.0842105263157895, + "step": 412, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.0868421052631578, + "grad_norm": 3.377965211868286, + "learning_rate": 1.709363789367149e-05, + "loss": 1.2173, + "step": 413 + }, + { + "epoch": 1.0868421052631578, + "step": 413, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.0894736842105264, + "grad_norm": 3.2645585536956787, + "learning_rate": 1.7078599214131105e-05, + "loss": 1.3379, + "step": 414 + }, + { + "epoch": 1.0894736842105264, + "step": 414, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 5.091524124145508, + "learning_rate": 1.7063528379337238e-05, + "loss": 1.272, + "step": 415 + }, + { + "epoch": 1.0921052631578947, + "step": 415, + "train_accuracy": 0.75 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 5.960602283477783, + "learning_rate": 1.7048425457750685e-05, + "loss": 1.272, + "step": 416 + }, + { + "epoch": 1.0947368421052632, + "step": 416, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.0973684210526315, + "grad_norm": 4.821347713470459, + "learning_rate": 1.7033290517977996e-05, + "loss": 1.1245, + "step": 417 + }, + { + "epoch": 1.0973684210526315, + "step": 417, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1, + "grad_norm": 3.2067134380340576, + "learning_rate": 1.7018123628771166e-05, + "loss": 1.2695, + "step": 418 + }, + { + "epoch": 1.1, + "step": 418, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.1026315789473684, + "grad_norm": 3.3393802642822266, + "learning_rate": 1.7002924859027322e-05, + "loss": 1.0884, + "step": 419 + }, + { + "epoch": 1.1026315789473684, + "step": 419, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 4.24766731262207, + "learning_rate": 1.698769427778842e-05, + "loss": 0.9609, + "step": 420 + }, + { + "epoch": 1.1052631578947367, + "step": 420, + "train_accuracy": 0.75 + }, + { + "epoch": 1.1078947368421053, + "grad_norm": 13.302772521972656, + "learning_rate": 1.6972431954240906e-05, + "loss": 1.1318, + "step": 421 + }, + { + "epoch": 1.1078947368421053, + "step": 421, + "train_accuracy": 0.75 + }, + { + "epoch": 1.1105263157894736, + "grad_norm": 3.8378937244415283, + "learning_rate": 1.6957137957715442e-05, + "loss": 1.2275, + "step": 422 + }, + { + "epoch": 1.1105263157894736, + "step": 422, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1131578947368421, + "grad_norm": 5.301788806915283, + "learning_rate": 1.6941812357686547e-05, + "loss": 1.3276, + "step": 423 + }, + { + "epoch": 1.1131578947368421, + "step": 423, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 3.5003018379211426, + "learning_rate": 1.6926455223772317e-05, + "loss": 1.2417, + "step": 424 + }, + { + "epoch": 1.1157894736842104, + "step": 424, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 7.428293704986572, + "learning_rate": 1.6911066625734082e-05, + "loss": 1.1333, + "step": 425 + }, + { + "epoch": 1.118421052631579, + "step": 425, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1210526315789473, + "grad_norm": 5.929080009460449, + "learning_rate": 1.689564663347611e-05, + "loss": 1.251, + "step": 426 + }, + { + "epoch": 1.1210526315789473, + "step": 426, + "train_accuracy": 0.625 + }, + { + "epoch": 1.1236842105263158, + "grad_norm": 4.198963165283203, + "learning_rate": 1.6880195317045274e-05, + "loss": 1.4849, + "step": 427 + }, + { + "epoch": 1.1236842105263158, + "step": 427, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 5.26378059387207, + "learning_rate": 1.6864712746630745e-05, + "loss": 1.4326, + "step": 428 + }, + { + "epoch": 1.1263157894736842, + "step": 428, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.1289473684210527, + "grad_norm": 9.662724494934082, + "learning_rate": 1.6849198992563666e-05, + "loss": 1.3887, + "step": 429 + }, + { + "epoch": 1.1289473684210527, + "step": 429, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.6311473846435547, + "learning_rate": 1.6833654125316832e-05, + "loss": 1.2671, + "step": 430 + }, + { + "epoch": 1.131578947368421, + "step": 430, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.1342105263157896, + "grad_norm": 4.167621612548828, + "learning_rate": 1.681807821550438e-05, + "loss": 1.4048, + "step": 431 + }, + { + "epoch": 1.1342105263157896, + "step": 431, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 2.69230318069458, + "learning_rate": 1.6802471333881456e-05, + "loss": 1.2554, + "step": 432 + }, + { + "epoch": 1.1368421052631579, + "step": 432, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.1394736842105262, + "grad_norm": 3.8613479137420654, + "learning_rate": 1.6786833551343896e-05, + "loss": 1.1846, + "step": 433 + }, + { + "epoch": 1.1394736842105262, + "step": 433, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.1421052631578947, + "grad_norm": 3.424915075302124, + "learning_rate": 1.677116493892792e-05, + "loss": 1.3999, + "step": 434 + }, + { + "epoch": 1.1421052631578947, + "step": 434, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 6.499844551086426, + "learning_rate": 1.6755465567809776e-05, + "loss": 1.2183, + "step": 435 + }, + { + "epoch": 1.1447368421052633, + "step": 435, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 8.367511749267578, + "learning_rate": 1.6739735509305452e-05, + "loss": 1.1621, + "step": 436 + }, + { + "epoch": 1.1473684210526316, + "step": 436, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.15, + "grad_norm": 3.175813913345337, + "learning_rate": 1.6723974834870327e-05, + "loss": 1.2539, + "step": 437 + }, + { + "epoch": 1.15, + "step": 437, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1526315789473685, + "grad_norm": 2.990525245666504, + "learning_rate": 1.6708183616098864e-05, + "loss": 1.1562, + "step": 438 + }, + { + "epoch": 1.1526315789473685, + "step": 438, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.1552631578947368, + "grad_norm": 2.673862934112549, + "learning_rate": 1.669236192472427e-05, + "loss": 1.1377, + "step": 439 + }, + { + "epoch": 1.1552631578947368, + "step": 439, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 3.241729497909546, + "learning_rate": 1.667650983261818e-05, + "loss": 1.2139, + "step": 440 + }, + { + "epoch": 1.1578947368421053, + "step": 440, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1605263157894736, + "grad_norm": 3.701186418533325, + "learning_rate": 1.6660627411790327e-05, + "loss": 1.3027, + "step": 441 + }, + { + "epoch": 1.1605263157894736, + "step": 441, + "train_accuracy": 0.75 + }, + { + "epoch": 1.1631578947368422, + "grad_norm": 3.050072431564331, + "learning_rate": 1.664471473438822e-05, + "loss": 1.1738, + "step": 442 + }, + { + "epoch": 1.1631578947368422, + "step": 442, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.1657894736842105, + "grad_norm": 8.023958206176758, + "learning_rate": 1.66287718726968e-05, + "loss": 1.2134, + "step": 443 + }, + { + "epoch": 1.1657894736842105, + "step": 443, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.168421052631579, + "grad_norm": 7.616509914398193, + "learning_rate": 1.6612798899138134e-05, + "loss": 1.145, + "step": 444 + }, + { + "epoch": 1.168421052631579, + "step": 444, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 8.092353820800781, + "learning_rate": 1.6596795886271067e-05, + "loss": 1.0767, + "step": 445 + }, + { + "epoch": 1.1710526315789473, + "step": 445, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1736842105263159, + "grad_norm": 9.653562545776367, + "learning_rate": 1.6580762906790913e-05, + "loss": 1.1504, + "step": 446 + }, + { + "epoch": 1.1736842105263159, + "step": 446, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.1763157894736842, + "grad_norm": 4.4737067222595215, + "learning_rate": 1.65647000335291e-05, + "loss": 1.2695, + "step": 447 + }, + { + "epoch": 1.1763157894736842, + "step": 447, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 5.050171375274658, + "learning_rate": 1.6548607339452853e-05, + "loss": 1.3516, + "step": 448 + }, + { + "epoch": 1.1789473684210527, + "step": 448, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.181578947368421, + "grad_norm": 3.861908197402954, + "learning_rate": 1.6532484897664868e-05, + "loss": 1.0137, + "step": 449 + }, + { + "epoch": 1.181578947368421, + "step": 449, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 3.3620777130126953, + "learning_rate": 1.6516332781402965e-05, + "loss": 1.1274, + "step": 450 + }, + { + "epoch": 1.1842105263157894, + "step": 450, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.186842105263158, + "grad_norm": 3.629772186279297, + "learning_rate": 1.6500151064039768e-05, + "loss": 1.1787, + "step": 451 + }, + { + "epoch": 1.186842105263158, + "step": 451, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.1894736842105262, + "grad_norm": 6.909435749053955, + "learning_rate": 1.6483939819082368e-05, + "loss": 1.1567, + "step": 452 + }, + { + "epoch": 1.1894736842105262, + "step": 452, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.1921052631578948, + "grad_norm": 8.454055786132812, + "learning_rate": 1.646769912017199e-05, + "loss": 1.3823, + "step": 453 + }, + { + "epoch": 1.1921052631578948, + "step": 453, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.194736842105263, + "grad_norm": 4.320656776428223, + "learning_rate": 1.645142904108364e-05, + "loss": 1.1353, + "step": 454 + }, + { + "epoch": 1.194736842105263, + "step": 454, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 12.18797779083252, + "learning_rate": 1.6435129655725813e-05, + "loss": 1.4482, + "step": 455 + }, + { + "epoch": 1.1973684210526316, + "step": 455, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.2, + "grad_norm": 3.5705020427703857, + "learning_rate": 1.6418801038140114e-05, + "loss": 1.1245, + "step": 456 + }, + { + "epoch": 1.2, + "eval_accuracy": 0.7049700617790222, + "eval_max_score": 5.6875, + "eval_min_score": -7.0, + "eval_runtime": 151.3491, + "eval_samples_per_second": 18.745, + "eval_steps_per_second": 0.297, + "step": 456 + }, + { + "epoch": 1.2, + "step": 456, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.2026315789473685, + "grad_norm": 9.271757125854492, + "learning_rate": 1.6402443262500936e-05, + "loss": 1.3506, + "step": 457 + }, + { + "epoch": 1.2026315789473685, + "step": 457, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.2052631578947368, + "grad_norm": 6.957113265991211, + "learning_rate": 1.6386056403115135e-05, + "loss": 1.3022, + "step": 458 + }, + { + "epoch": 1.2052631578947368, + "step": 458, + "train_accuracy": 0.75 + }, + { + "epoch": 1.2078947368421054, + "grad_norm": 3.5725152492523193, + "learning_rate": 1.6369640534421675e-05, + "loss": 1.1992, + "step": 459 + }, + { + "epoch": 1.2078947368421054, + "step": 459, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 11.582955360412598, + "learning_rate": 1.6353195730991308e-05, + "loss": 1.2861, + "step": 460 + }, + { + "epoch": 1.2105263157894737, + "step": 460, + "train_accuracy": 0.75 + }, + { + "epoch": 1.2131578947368422, + "grad_norm": 6.4823527336120605, + "learning_rate": 1.633672206752621e-05, + "loss": 1.4482, + "step": 461 + }, + { + "epoch": 1.2131578947368422, + "step": 461, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.2157894736842105, + "grad_norm": 15.534878730773926, + "learning_rate": 1.6320219618859668e-05, + "loss": 1.2925, + "step": 462 + }, + { + "epoch": 1.2157894736842105, + "step": 462, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.2184210526315788, + "grad_norm": 4.061946868896484, + "learning_rate": 1.6303688459955728e-05, + "loss": 1.2837, + "step": 463 + }, + { + "epoch": 1.2184210526315788, + "step": 463, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 2.7848575115203857, + "learning_rate": 1.628712866590885e-05, + "loss": 1.188, + "step": 464 + }, + { + "epoch": 1.2210526315789474, + "step": 464, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 9.240684509277344, + "learning_rate": 1.627054031194358e-05, + "loss": 1.2993, + "step": 465 + }, + { + "epoch": 1.2236842105263157, + "step": 465, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.2263157894736842, + "grad_norm": 3.1299266815185547, + "learning_rate": 1.6253923473414185e-05, + "loss": 1.3467, + "step": 466 + }, + { + "epoch": 1.2263157894736842, + "step": 466, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.2289473684210526, + "grad_norm": 4.89497184753418, + "learning_rate": 1.623727822580434e-05, + "loss": 1.1733, + "step": 467 + }, + { + "epoch": 1.2289473684210526, + "step": 467, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.231578947368421, + "grad_norm": 3.6426620483398438, + "learning_rate": 1.6220604644726778e-05, + "loss": 1.0483, + "step": 468 + }, + { + "epoch": 1.231578947368421, + "step": 468, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.2342105263157894, + "grad_norm": 4.336991786956787, + "learning_rate": 1.620390280592291e-05, + "loss": 1.3164, + "step": 469 + }, + { + "epoch": 1.2342105263157894, + "step": 469, + "train_accuracy": 0.75 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 9.696093559265137, + "learning_rate": 1.6187172785262544e-05, + "loss": 1.2285, + "step": 470 + }, + { + "epoch": 1.236842105263158, + "step": 470, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.2394736842105263, + "grad_norm": 10.798023223876953, + "learning_rate": 1.6170414658743488e-05, + "loss": 1.2935, + "step": 471 + }, + { + "epoch": 1.2394736842105263, + "step": 471, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 6.651049613952637, + "learning_rate": 1.6153628502491228e-05, + "loss": 1.0566, + "step": 472 + }, + { + "epoch": 1.2421052631578948, + "step": 472, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.2447368421052631, + "grad_norm": 15.750227928161621, + "learning_rate": 1.613681439275858e-05, + "loss": 1.4531, + "step": 473 + }, + { + "epoch": 1.2447368421052631, + "step": 473, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.2473684210526317, + "grad_norm": 12.205035209655762, + "learning_rate": 1.6119972405925332e-05, + "loss": 1.3672, + "step": 474 + }, + { + "epoch": 1.2473684210526317, + "step": 474, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.25, + "grad_norm": 4.43392276763916, + "learning_rate": 1.6103102618497922e-05, + "loss": 1.3491, + "step": 475 + }, + { + "epoch": 1.25, + "step": 475, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.2526315789473683, + "grad_norm": 4.819791793823242, + "learning_rate": 1.6086205107109067e-05, + "loss": 1.165, + "step": 476 + }, + { + "epoch": 1.2526315789473683, + "step": 476, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.2552631578947369, + "grad_norm": 4.512659549713135, + "learning_rate": 1.6069279948517416e-05, + "loss": 1.146, + "step": 477 + }, + { + "epoch": 1.2552631578947369, + "step": 477, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.2578947368421054, + "grad_norm": 11.076248168945312, + "learning_rate": 1.6052327219607223e-05, + "loss": 1.207, + "step": 478 + }, + { + "epoch": 1.2578947368421054, + "step": 478, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.2605263157894737, + "grad_norm": 13.904632568359375, + "learning_rate": 1.603534699738797e-05, + "loss": 1.3438, + "step": 479 + }, + { + "epoch": 1.2605263157894737, + "step": 479, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 4.511821746826172, + "learning_rate": 1.601833935899404e-05, + "loss": 1.1997, + "step": 480 + }, + { + "epoch": 1.263157894736842, + "step": 480, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.2657894736842106, + "grad_norm": 5.979182243347168, + "learning_rate": 1.600130438168435e-05, + "loss": 1.2031, + "step": 481 + }, + { + "epoch": 1.2657894736842106, + "step": 481, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.268421052631579, + "grad_norm": 10.593663215637207, + "learning_rate": 1.5984242142842003e-05, + "loss": 1.293, + "step": 482 + }, + { + "epoch": 1.268421052631579, + "step": 482, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.2710526315789474, + "grad_norm": 16.006187438964844, + "learning_rate": 1.5967152719973954e-05, + "loss": 1.4736, + "step": 483 + }, + { + "epoch": 1.2710526315789474, + "step": 483, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.2736842105263158, + "grad_norm": 7.199328899383545, + "learning_rate": 1.5950036190710637e-05, + "loss": 1.3931, + "step": 484 + }, + { + "epoch": 1.2736842105263158, + "step": 484, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 12.612074851989746, + "learning_rate": 1.593289263280561e-05, + "loss": 1.2217, + "step": 485 + }, + { + "epoch": 1.2763157894736843, + "step": 485, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.2789473684210526, + "grad_norm": 13.020115852355957, + "learning_rate": 1.5915722124135227e-05, + "loss": 1.4404, + "step": 486 + }, + { + "epoch": 1.2789473684210526, + "step": 486, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.2815789473684212, + "grad_norm": 15.411005020141602, + "learning_rate": 1.5898524742698257e-05, + "loss": 1.627, + "step": 487 + }, + { + "epoch": 1.2815789473684212, + "step": 487, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 2.5929243564605713, + "learning_rate": 1.5881300566615555e-05, + "loss": 1.2729, + "step": 488 + }, + { + "epoch": 1.2842105263157895, + "step": 488, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.2868421052631578, + "grad_norm": 19.66521644592285, + "learning_rate": 1.5864049674129677e-05, + "loss": 1.6582, + "step": 489 + }, + { + "epoch": 1.2868421052631578, + "step": 489, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 7.616007328033447, + "learning_rate": 1.584677214360455e-05, + "loss": 1.2085, + "step": 490 + }, + { + "epoch": 1.2894736842105263, + "step": 490, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.2921052631578949, + "grad_norm": 3.8917739391326904, + "learning_rate": 1.5829468053525104e-05, + "loss": 1.3672, + "step": 491 + }, + { + "epoch": 1.2921052631578949, + "step": 491, + "train_accuracy": 0.75 + }, + { + "epoch": 1.2947368421052632, + "grad_norm": 9.001242637634277, + "learning_rate": 1.5812137482496925e-05, + "loss": 1.4292, + "step": 492 + }, + { + "epoch": 1.2947368421052632, + "step": 492, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.2973684210526315, + "grad_norm": 3.5048999786376953, + "learning_rate": 1.5794780509245876e-05, + "loss": 1.2676, + "step": 493 + }, + { + "epoch": 1.2973684210526315, + "step": 493, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.3, + "grad_norm": 17.744800567626953, + "learning_rate": 1.5777397212617776e-05, + "loss": 1.4194, + "step": 494 + }, + { + "epoch": 1.3, + "step": 494, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 15.553146362304688, + "learning_rate": 1.5759987671578007e-05, + "loss": 1.2896, + "step": 495 + }, + { + "epoch": 1.3026315789473684, + "step": 495, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 8.133227348327637, + "learning_rate": 1.5742551965211167e-05, + "loss": 1.1895, + "step": 496 + }, + { + "epoch": 1.305263157894737, + "step": 496, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.3078947368421052, + "grad_norm": 3.2711598873138428, + "learning_rate": 1.572509017272072e-05, + "loss": 1.4629, + "step": 497 + }, + { + "epoch": 1.3078947368421052, + "step": 497, + "train_accuracy": 0.75 + }, + { + "epoch": 1.3105263157894738, + "grad_norm": 8.634398460388184, + "learning_rate": 1.5707602373428628e-05, + "loss": 1.3159, + "step": 498 + }, + { + "epoch": 1.3105263157894738, + "step": 498, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.313157894736842, + "grad_norm": 7.313762664794922, + "learning_rate": 1.5690088646774983e-05, + "loss": 1.3647, + "step": 499 + }, + { + "epoch": 1.313157894736842, + "step": 499, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 6.731086730957031, + "learning_rate": 1.5672549072317667e-05, + "loss": 1.1221, + "step": 500 + }, + { + "epoch": 1.3157894736842106, + "step": 500, + "train_accuracy": 0.609375 + }, + { + "epoch": 1.318421052631579, + "grad_norm": 6.832492351531982, + "learning_rate": 1.5654983729731978e-05, + "loss": 1.2764, + "step": 501 + }, + { + "epoch": 1.318421052631579, + "step": 501, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.3210526315789473, + "grad_norm": 3.859423875808716, + "learning_rate": 1.5637392698810247e-05, + "loss": 1.2837, + "step": 502 + }, + { + "epoch": 1.3210526315789473, + "step": 502, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.3236842105263158, + "grad_norm": 7.8143534660339355, + "learning_rate": 1.5619776059461523e-05, + "loss": 1.1621, + "step": 503 + }, + { + "epoch": 1.3236842105263158, + "step": 503, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 4.160390853881836, + "learning_rate": 1.5602133891711175e-05, + "loss": 1.4302, + "step": 504 + }, + { + "epoch": 1.3263157894736843, + "step": 504, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 2.8057210445404053, + "learning_rate": 1.558446627570053e-05, + "loss": 1.1064, + "step": 505 + }, + { + "epoch": 1.3289473684210527, + "step": 505, + "train_accuracy": 0.59375 + }, + { + "epoch": 1.331578947368421, + "grad_norm": 5.990983009338379, + "learning_rate": 1.5566773291686527e-05, + "loss": 1.4414, + "step": 506 + }, + { + "epoch": 1.331578947368421, + "step": 506, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.3342105263157895, + "grad_norm": 8.961088180541992, + "learning_rate": 1.554905502004133e-05, + "loss": 1.3325, + "step": 507 + }, + { + "epoch": 1.3342105263157895, + "step": 507, + "train_accuracy": 0.75 + }, + { + "epoch": 1.3368421052631578, + "grad_norm": 5.393622875213623, + "learning_rate": 1.5531311541251995e-05, + "loss": 1.248, + "step": 508 + }, + { + "epoch": 1.3368421052631578, + "step": 508, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.3394736842105264, + "grad_norm": 6.529601097106934, + "learning_rate": 1.5513542935920058e-05, + "loss": 1.3818, + "step": 509 + }, + { + "epoch": 1.3394736842105264, + "step": 509, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 13.529115676879883, + "learning_rate": 1.5495749284761213e-05, + "loss": 1.5435, + "step": 510 + }, + { + "epoch": 1.3421052631578947, + "step": 510, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3447368421052632, + "grad_norm": 5.483336925506592, + "learning_rate": 1.5477930668604917e-05, + "loss": 1.2622, + "step": 511 + }, + { + "epoch": 1.3447368421052632, + "step": 511, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 3.2029190063476562, + "learning_rate": 1.5460087168394042e-05, + "loss": 1.1821, + "step": 512 + }, + { + "epoch": 1.3473684210526315, + "step": 512, + "train_accuracy": 0.890625 + }, + { + "epoch": 1.35, + "grad_norm": 15.539850234985352, + "learning_rate": 1.5442218865184493e-05, + "loss": 1.0308, + "step": 513 + }, + { + "epoch": 1.35, + "step": 513, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.3526315789473684, + "grad_norm": 5.350340366363525, + "learning_rate": 1.5424325840144847e-05, + "loss": 1.2207, + "step": 514 + }, + { + "epoch": 1.3526315789473684, + "step": 514, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 8.735867500305176, + "learning_rate": 1.5406408174555978e-05, + "loss": 1.29, + "step": 515 + }, + { + "epoch": 1.3552631578947367, + "step": 515, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.3578947368421053, + "grad_norm": 4.313580513000488, + "learning_rate": 1.53884659498107e-05, + "loss": 1.0488, + "step": 516 + }, + { + "epoch": 1.3578947368421053, + "step": 516, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.3605263157894738, + "grad_norm": 5.2027387619018555, + "learning_rate": 1.537049924741338e-05, + "loss": 1.1738, + "step": 517 + }, + { + "epoch": 1.3605263157894738, + "step": 517, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3631578947368421, + "grad_norm": 7.884912967681885, + "learning_rate": 1.5352508148979585e-05, + "loss": 1.5806, + "step": 518 + }, + { + "epoch": 1.3631578947368421, + "step": 518, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3657894736842104, + "grad_norm": 3.1361117362976074, + "learning_rate": 1.5334492736235703e-05, + "loss": 1.144, + "step": 519 + }, + { + "epoch": 1.3657894736842104, + "step": 519, + "train_accuracy": 0.75 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 8.120806694030762, + "learning_rate": 1.5316453091018572e-05, + "loss": 1.0537, + "step": 520 + }, + { + "epoch": 1.368421052631579, + "step": 520, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3710526315789473, + "grad_norm": 3.6984429359436035, + "learning_rate": 1.5298389295275098e-05, + "loss": 1.0972, + "step": 521 + }, + { + "epoch": 1.3710526315789473, + "step": 521, + "train_accuracy": 0.75 + }, + { + "epoch": 1.3736842105263158, + "grad_norm": 4.505008220672607, + "learning_rate": 1.5280301431061907e-05, + "loss": 1.4058, + "step": 522 + }, + { + "epoch": 1.3736842105263158, + "step": 522, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.3763157894736842, + "grad_norm": 3.8629140853881836, + "learning_rate": 1.5262189580544955e-05, + "loss": 1.2212, + "step": 523 + }, + { + "epoch": 1.3763157894736842, + "step": 523, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3789473684210527, + "grad_norm": 6.1243109703063965, + "learning_rate": 1.5244053825999152e-05, + "loss": 1.3076, + "step": 524 + }, + { + "epoch": 1.3789473684210527, + "step": 524, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 13.835949897766113, + "learning_rate": 1.5225894249808005e-05, + "loss": 1.2031, + "step": 525 + }, + { + "epoch": 1.381578947368421, + "step": 525, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.3842105263157896, + "grad_norm": 5.034552097320557, + "learning_rate": 1.5207710934463218e-05, + "loss": 1.3672, + "step": 526 + }, + { + "epoch": 1.3842105263157896, + "step": 526, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.3868421052631579, + "grad_norm": 10.014500617980957, + "learning_rate": 1.5189503962564347e-05, + "loss": 1.1841, + "step": 527 + }, + { + "epoch": 1.3868421052631579, + "step": 527, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 14.438556671142578, + "learning_rate": 1.5171273416818406e-05, + "loss": 1.3545, + "step": 528 + }, + { + "epoch": 1.3894736842105262, + "step": 528, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.3921052631578947, + "grad_norm": 4.237582206726074, + "learning_rate": 1.5153019380039493e-05, + "loss": 1.3281, + "step": 529 + }, + { + "epoch": 1.3921052631578947, + "step": 529, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 13.250812530517578, + "learning_rate": 1.513474193514842e-05, + "loss": 1.4199, + "step": 530 + }, + { + "epoch": 1.3947368421052633, + "step": 530, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.3973684210526316, + "grad_norm": 9.62955379486084, + "learning_rate": 1.5116441165172328e-05, + "loss": 1.4917, + "step": 531 + }, + { + "epoch": 1.3973684210526316, + "step": 531, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4, + "grad_norm": 3.8189797401428223, + "learning_rate": 1.5098117153244317e-05, + "loss": 1.293, + "step": 532 + }, + { + "epoch": 1.4, + "eval_accuracy": 0.7084949016571045, + "eval_max_score": 5.59375, + "eval_min_score": -5.59375, + "eval_runtime": 151.3104, + "eval_samples_per_second": 18.75, + "eval_steps_per_second": 0.297, + "step": 532 + }, + { + "epoch": 1.4, + "step": 532, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4026315789473685, + "grad_norm": 7.4259843826293945, + "learning_rate": 1.5079769982603067e-05, + "loss": 1.3726, + "step": 533 + }, + { + "epoch": 1.4026315789473685, + "step": 533, + "train_accuracy": 0.75 + }, + { + "epoch": 1.4052631578947368, + "grad_norm": 3.3982386589050293, + "learning_rate": 1.5061399736592457e-05, + "loss": 1.2163, + "step": 534 + }, + { + "epoch": 1.4052631578947368, + "step": 534, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 17.446184158325195, + "learning_rate": 1.504300649866119e-05, + "loss": 1.1636, + "step": 535 + }, + { + "epoch": 1.4078947368421053, + "step": 535, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 9.651148796081543, + "learning_rate": 1.5024590352362412e-05, + "loss": 1.5947, + "step": 536 + }, + { + "epoch": 1.4105263157894736, + "step": 536, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.4131578947368422, + "grad_norm": 4.520640850067139, + "learning_rate": 1.5006151381353328e-05, + "loss": 1.353, + "step": 537 + }, + { + "epoch": 1.4131578947368422, + "step": 537, + "train_accuracy": 0.75 + }, + { + "epoch": 1.4157894736842105, + "grad_norm": 4.592413902282715, + "learning_rate": 1.4987689669394836e-05, + "loss": 1.0796, + "step": 538 + }, + { + "epoch": 1.4157894736842105, + "step": 538, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.418421052631579, + "grad_norm": 8.244244575500488, + "learning_rate": 1.4969205300351128e-05, + "loss": 1.2051, + "step": 539 + }, + { + "epoch": 1.418421052631579, + "step": 539, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 22.695865631103516, + "learning_rate": 1.4950698358189322e-05, + "loss": 1.6543, + "step": 540 + }, + { + "epoch": 1.4210526315789473, + "step": 540, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.4236842105263157, + "grad_norm": 14.040301322937012, + "learning_rate": 1.4932168926979074e-05, + "loss": 1.354, + "step": 541 + }, + { + "epoch": 1.4236842105263157, + "step": 541, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.4263157894736842, + "grad_norm": 12.0126314163208, + "learning_rate": 1.4913617090892206e-05, + "loss": 1.3936, + "step": 542 + }, + { + "epoch": 1.4263157894736842, + "step": 542, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.4289473684210527, + "grad_norm": 9.563346862792969, + "learning_rate": 1.4895042934202306e-05, + "loss": 1.1978, + "step": 543 + }, + { + "epoch": 1.4289473684210527, + "step": 543, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 15.859170913696289, + "learning_rate": 1.4876446541284365e-05, + "loss": 1.3774, + "step": 544 + }, + { + "epoch": 1.431578947368421, + "step": 544, + "train_accuracy": 0.859375 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 17.58700942993164, + "learning_rate": 1.485782799661438e-05, + "loss": 1.2646, + "step": 545 + }, + { + "epoch": 1.4342105263157894, + "step": 545, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.436842105263158, + "grad_norm": 8.851125717163086, + "learning_rate": 1.4839187384768971e-05, + "loss": 1.2417, + "step": 546 + }, + { + "epoch": 1.436842105263158, + "step": 546, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.4394736842105262, + "grad_norm": 3.8548920154571533, + "learning_rate": 1.4820524790425007e-05, + "loss": 1.2793, + "step": 547 + }, + { + "epoch": 1.4394736842105262, + "step": 547, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4421052631578948, + "grad_norm": 9.304672241210938, + "learning_rate": 1.4801840298359216e-05, + "loss": 1.314, + "step": 548 + }, + { + "epoch": 1.4421052631578948, + "step": 548, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.444736842105263, + "grad_norm": 10.30390453338623, + "learning_rate": 1.4783133993447789e-05, + "loss": 1.3345, + "step": 549 + }, + { + "epoch": 1.444736842105263, + "step": 549, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 10.108946800231934, + "learning_rate": 1.4764405960666011e-05, + "loss": 1.374, + "step": 550 + }, + { + "epoch": 1.4473684210526316, + "step": 550, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.45, + "grad_norm": 4.669325351715088, + "learning_rate": 1.4745656285087866e-05, + "loss": 1.2256, + "step": 551 + }, + { + "epoch": 1.45, + "step": 551, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 6.70346736907959, + "learning_rate": 1.4726885051885654e-05, + "loss": 1.3486, + "step": 552 + }, + { + "epoch": 1.4526315789473685, + "step": 552, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.4552631578947368, + "grad_norm": 3.9752542972564697, + "learning_rate": 1.4708092346329604e-05, + "loss": 1.147, + "step": 553 + }, + { + "epoch": 1.4552631578947368, + "step": 553, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.4578947368421051, + "grad_norm": 8.792821884155273, + "learning_rate": 1.468927825378748e-05, + "loss": 1.3594, + "step": 554 + }, + { + "epoch": 1.4578947368421051, + "step": 554, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 5.539193153381348, + "learning_rate": 1.4670442859724204e-05, + "loss": 1.0762, + "step": 555 + }, + { + "epoch": 1.4605263157894737, + "step": 555, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.4631578947368422, + "grad_norm": 9.367209434509277, + "learning_rate": 1.4651586249701458e-05, + "loss": 1.2529, + "step": 556 + }, + { + "epoch": 1.4631578947368422, + "step": 556, + "train_accuracy": 0.75 + }, + { + "epoch": 1.4657894736842105, + "grad_norm": 13.890340805053711, + "learning_rate": 1.4632708509377305e-05, + "loss": 1.3589, + "step": 557 + }, + { + "epoch": 1.4657894736842105, + "step": 557, + "train_accuracy": 0.75 + }, + { + "epoch": 1.4684210526315788, + "grad_norm": 3.0693435668945312, + "learning_rate": 1.461380972450579e-05, + "loss": 1.1621, + "step": 558 + }, + { + "epoch": 1.4684210526315788, + "step": 558, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.4710526315789474, + "grad_norm": 9.083732604980469, + "learning_rate": 1.4594889980936554e-05, + "loss": 1.3843, + "step": 559 + }, + { + "epoch": 1.4710526315789474, + "step": 559, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 8.28622055053711, + "learning_rate": 1.4575949364614453e-05, + "loss": 1.2578, + "step": 560 + }, + { + "epoch": 1.4736842105263157, + "step": 560, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.4763157894736842, + "grad_norm": 10.74136734008789, + "learning_rate": 1.4556987961579148e-05, + "loss": 1.5986, + "step": 561 + }, + { + "epoch": 1.4763157894736842, + "step": 561, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.4789473684210526, + "grad_norm": 5.316714763641357, + "learning_rate": 1.4538005857964735e-05, + "loss": 1.5508, + "step": 562 + }, + { + "epoch": 1.4789473684210526, + "step": 562, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.481578947368421, + "grad_norm": 3.3459677696228027, + "learning_rate": 1.451900313999934e-05, + "loss": 1.332, + "step": 563 + }, + { + "epoch": 1.481578947368421, + "step": 563, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.4842105263157894, + "grad_norm": 8.240880012512207, + "learning_rate": 1.4499979894004733e-05, + "loss": 1.001, + "step": 564 + }, + { + "epoch": 1.4842105263157894, + "step": 564, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 8.606131553649902, + "learning_rate": 1.4480936206395936e-05, + "loss": 1.436, + "step": 565 + }, + { + "epoch": 1.486842105263158, + "step": 565, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.4894736842105263, + "grad_norm": 3.6511523723602295, + "learning_rate": 1.4461872163680826e-05, + "loss": 1.1348, + "step": 566 + }, + { + "epoch": 1.4894736842105263, + "step": 566, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.4921052631578946, + "grad_norm": 5.405193328857422, + "learning_rate": 1.4442787852459748e-05, + "loss": 1.1343, + "step": 567 + }, + { + "epoch": 1.4921052631578946, + "step": 567, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 5.214520454406738, + "learning_rate": 1.4423683359425118e-05, + "loss": 1.4219, + "step": 568 + }, + { + "epoch": 1.4947368421052631, + "step": 568, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.4973684210526317, + "grad_norm": 2.837728500366211, + "learning_rate": 1.4404558771361027e-05, + "loss": 1.3423, + "step": 569 + }, + { + "epoch": 1.4973684210526317, + "step": 569, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.5, + "grad_norm": 3.266947031021118, + "learning_rate": 1.4385414175142855e-05, + "loss": 1.1948, + "step": 570 + }, + { + "epoch": 1.5, + "step": 570, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.5026315789473683, + "grad_norm": 12.204270362854004, + "learning_rate": 1.4366249657736866e-05, + "loss": 1.3867, + "step": 571 + }, + { + "epoch": 1.5026315789473683, + "step": 571, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.5052631578947369, + "grad_norm": 6.56190299987793, + "learning_rate": 1.4347065306199823e-05, + "loss": 1.2837, + "step": 572 + }, + { + "epoch": 1.5052631578947369, + "step": 572, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.5078947368421054, + "grad_norm": 5.147476673126221, + "learning_rate": 1.4327861207678581e-05, + "loss": 1.2783, + "step": 573 + }, + { + "epoch": 1.5078947368421054, + "step": 573, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.5105263157894737, + "grad_norm": 3.911860704421997, + "learning_rate": 1.4308637449409705e-05, + "loss": 1.3105, + "step": 574 + }, + { + "epoch": 1.5105263157894737, + "step": 574, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 4.936273097991943, + "learning_rate": 1.4289394118719061e-05, + "loss": 1.4072, + "step": 575 + }, + { + "epoch": 1.513157894736842, + "step": 575, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 4.839917182922363, + "learning_rate": 1.4270131303021431e-05, + "loss": 1.4307, + "step": 576 + }, + { + "epoch": 1.5157894736842106, + "step": 576, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.518421052631579, + "grad_norm": 9.490140914916992, + "learning_rate": 1.4250849089820095e-05, + "loss": 1.3818, + "step": 577 + }, + { + "epoch": 1.518421052631579, + "step": 577, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5210526315789474, + "grad_norm": 3.4175305366516113, + "learning_rate": 1.423154756670647e-05, + "loss": 1.0977, + "step": 578 + }, + { + "epoch": 1.5210526315789474, + "step": 578, + "train_accuracy": 0.75 + }, + { + "epoch": 1.5236842105263158, + "grad_norm": 7.429515838623047, + "learning_rate": 1.4212226821359672e-05, + "loss": 1.1333, + "step": 579 + }, + { + "epoch": 1.5236842105263158, + "step": 579, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 4.309225559234619, + "learning_rate": 1.4192886941546141e-05, + "loss": 0.9233, + "step": 580 + }, + { + "epoch": 1.526315789473684, + "step": 580, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.5289473684210526, + "grad_norm": 15.609049797058105, + "learning_rate": 1.4173528015119247e-05, + "loss": 1.4473, + "step": 581 + }, + { + "epoch": 1.5289473684210526, + "step": 581, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.5315789473684212, + "grad_norm": 8.803685188293457, + "learning_rate": 1.4154150130018867e-05, + "loss": 1.293, + "step": 582 + }, + { + "epoch": 1.5315789473684212, + "step": 582, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.5342105263157895, + "grad_norm": 3.669485092163086, + "learning_rate": 1.4134753374271003e-05, + "loss": 1.0322, + "step": 583 + }, + { + "epoch": 1.5342105263157895, + "step": 583, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 6.106133460998535, + "learning_rate": 1.4115337835987388e-05, + "loss": 1.4111, + "step": 584 + }, + { + "epoch": 1.5368421052631578, + "step": 584, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 5.285925388336182, + "learning_rate": 1.4095903603365067e-05, + "loss": 1.21, + "step": 585 + }, + { + "epoch": 1.5394736842105263, + "step": 585, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5421052631578949, + "grad_norm": 7.868580341339111, + "learning_rate": 1.4076450764686005e-05, + "loss": 1.2734, + "step": 586 + }, + { + "epoch": 1.5421052631578949, + "step": 586, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.5447368421052632, + "grad_norm": 9.833069801330566, + "learning_rate": 1.40569794083167e-05, + "loss": 1.1172, + "step": 587 + }, + { + "epoch": 1.5447368421052632, + "step": 587, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.5473684210526315, + "grad_norm": 12.843183517456055, + "learning_rate": 1.4037489622707749e-05, + "loss": 1.4414, + "step": 588 + }, + { + "epoch": 1.5473684210526315, + "step": 588, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.55, + "grad_norm": 4.2173662185668945, + "learning_rate": 1.4017981496393484e-05, + "loss": 1.186, + "step": 589 + }, + { + "epoch": 1.55, + "step": 589, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 9.550527572631836, + "learning_rate": 1.3998455117991542e-05, + "loss": 1.1289, + "step": 590 + }, + { + "epoch": 1.5526315789473686, + "step": 590, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.555263157894737, + "grad_norm": 17.769779205322266, + "learning_rate": 1.3978910576202472e-05, + "loss": 1.9341, + "step": 591 + }, + { + "epoch": 1.555263157894737, + "step": 591, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 11.969633102416992, + "learning_rate": 1.395934795980933e-05, + "loss": 1.4902, + "step": 592 + }, + { + "epoch": 1.5578947368421052, + "step": 592, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.5605263157894735, + "grad_norm": 2.4993820190429688, + "learning_rate": 1.3939767357677287e-05, + "loss": 1.1899, + "step": 593 + }, + { + "epoch": 1.5605263157894735, + "step": 593, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.563157894736842, + "grad_norm": 6.35914421081543, + "learning_rate": 1.3920168858753208e-05, + "loss": 1.2163, + "step": 594 + }, + { + "epoch": 1.563157894736842, + "step": 594, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 9.9996337890625, + "learning_rate": 1.3900552552065259e-05, + "loss": 1.3228, + "step": 595 + }, + { + "epoch": 1.5657894736842106, + "step": 595, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.568421052631579, + "grad_norm": 8.151016235351562, + "learning_rate": 1.3880918526722497e-05, + "loss": 1.5107, + "step": 596 + }, + { + "epoch": 1.568421052631579, + "step": 596, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.5710526315789473, + "grad_norm": 5.698625564575195, + "learning_rate": 1.3861266871914473e-05, + "loss": 1.4619, + "step": 597 + }, + { + "epoch": 1.5710526315789473, + "step": 597, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5736842105263158, + "grad_norm": 10.527148246765137, + "learning_rate": 1.3841597676910816e-05, + "loss": 1.3984, + "step": 598 + }, + { + "epoch": 1.5736842105263158, + "step": 598, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.5763157894736843, + "grad_norm": 4.670735836029053, + "learning_rate": 1.3821911031060834e-05, + "loss": 1.3779, + "step": 599 + }, + { + "epoch": 1.5763157894736843, + "step": 599, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 5.227207183837891, + "learning_rate": 1.3802207023793112e-05, + "loss": 1.3438, + "step": 600 + }, + { + "epoch": 1.5789473684210527, + "step": 600, + "train_accuracy": 0.75 + }, + { + "epoch": 1.581578947368421, + "grad_norm": 7.337916851043701, + "learning_rate": 1.3782485744615098e-05, + "loss": 1.3682, + "step": 601 + }, + { + "epoch": 1.581578947368421, + "step": 601, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.5842105263157895, + "grad_norm": 9.321327209472656, + "learning_rate": 1.3762747283112692e-05, + "loss": 1.3521, + "step": 602 + }, + { + "epoch": 1.5842105263157895, + "step": 602, + "train_accuracy": 0.625 + }, + { + "epoch": 1.586842105263158, + "grad_norm": 2.6610355377197266, + "learning_rate": 1.3742991728949862e-05, + "loss": 1.252, + "step": 603 + }, + { + "epoch": 1.586842105263158, + "step": 603, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.5894736842105264, + "grad_norm": 4.487646102905273, + "learning_rate": 1.3723219171868207e-05, + "loss": 1.1855, + "step": 604 + }, + { + "epoch": 1.5894736842105264, + "step": 604, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 2.9246814250946045, + "learning_rate": 1.370342970168657e-05, + "loss": 1.3052, + "step": 605 + }, + { + "epoch": 1.5921052631578947, + "step": 605, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.594736842105263, + "grad_norm": 11.696772575378418, + "learning_rate": 1.3683623408300626e-05, + "loss": 1.2334, + "step": 606 + }, + { + "epoch": 1.594736842105263, + "step": 606, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.5973684210526315, + "grad_norm": 10.42992115020752, + "learning_rate": 1.3663800381682465e-05, + "loss": 1.1255, + "step": 607 + }, + { + "epoch": 1.5973684210526315, + "step": 607, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.6, + "grad_norm": 8.260746955871582, + "learning_rate": 1.3643960711880191e-05, + "loss": 1.1284, + "step": 608 + }, + { + "epoch": 1.6, + "eval_accuracy": 0.7060275077819824, + "eval_max_score": 5.46875, + "eval_min_score": -6.65625, + "eval_runtime": 151.2596, + "eval_samples_per_second": 18.756, + "eval_steps_per_second": 0.298, + "step": 608 + }, + { + "epoch": 1.6, + "step": 608, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.6026315789473684, + "grad_norm": 3.6682422161102295, + "learning_rate": 1.3624104489017513e-05, + "loss": 1.2915, + "step": 609 + }, + { + "epoch": 1.6026315789473684, + "step": 609, + "train_accuracy": 0.859375 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 7.5316596031188965, + "learning_rate": 1.3604231803293336e-05, + "loss": 1.1611, + "step": 610 + }, + { + "epoch": 1.6052631578947367, + "step": 610, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.6078947368421053, + "grad_norm": 10.11808967590332, + "learning_rate": 1.3584342744981343e-05, + "loss": 1.2471, + "step": 611 + }, + { + "epoch": 1.6078947368421053, + "step": 611, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.6105263157894738, + "grad_norm": 5.6028242111206055, + "learning_rate": 1.3564437404429595e-05, + "loss": 1.5654, + "step": 612 + }, + { + "epoch": 1.6105263157894738, + "step": 612, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.6131578947368421, + "grad_norm": 4.102721691131592, + "learning_rate": 1.3544515872060118e-05, + "loss": 1.1572, + "step": 613 + }, + { + "epoch": 1.6131578947368421, + "step": 613, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.6157894736842104, + "grad_norm": 3.3373892307281494, + "learning_rate": 1.3524578238368489e-05, + "loss": 1.3301, + "step": 614 + }, + { + "epoch": 1.6157894736842104, + "step": 614, + "train_accuracy": 0.75 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 19.588586807250977, + "learning_rate": 1.350462459392343e-05, + "loss": 1.3608, + "step": 615 + }, + { + "epoch": 1.618421052631579, + "step": 615, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 4.01005220413208, + "learning_rate": 1.3484655029366387e-05, + "loss": 1.4072, + "step": 616 + }, + { + "epoch": 1.6210526315789475, + "step": 616, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.6236842105263158, + "grad_norm": 3.243781566619873, + "learning_rate": 1.3464669635411127e-05, + "loss": 1.3027, + "step": 617 + }, + { + "epoch": 1.6236842105263158, + "step": 617, + "train_accuracy": 0.75 + }, + { + "epoch": 1.6263157894736842, + "grad_norm": 6.2855753898620605, + "learning_rate": 1.344466850284333e-05, + "loss": 1.209, + "step": 618 + }, + { + "epoch": 1.6263157894736842, + "step": 618, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.6289473684210525, + "grad_norm": 8.140256881713867, + "learning_rate": 1.3424651722520164e-05, + "loss": 1.543, + "step": 619 + }, + { + "epoch": 1.6289473684210525, + "step": 619, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 4.431771278381348, + "learning_rate": 1.3404619385369876e-05, + "loss": 1.3579, + "step": 620 + }, + { + "epoch": 1.631578947368421, + "step": 620, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.6342105263157896, + "grad_norm": 11.76955509185791, + "learning_rate": 1.3384571582391392e-05, + "loss": 1.1875, + "step": 621 + }, + { + "epoch": 1.6342105263157896, + "step": 621, + "train_accuracy": 0.625 + }, + { + "epoch": 1.6368421052631579, + "grad_norm": 6.960719585418701, + "learning_rate": 1.3364508404653879e-05, + "loss": 1.5352, + "step": 622 + }, + { + "epoch": 1.6368421052631579, + "step": 622, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.6394736842105262, + "grad_norm": 4.3750762939453125, + "learning_rate": 1.3344429943296358e-05, + "loss": 1.3037, + "step": 623 + }, + { + "epoch": 1.6394736842105262, + "step": 623, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 11.1054105758667, + "learning_rate": 1.3324336289527272e-05, + "loss": 1.4268, + "step": 624 + }, + { + "epoch": 1.6421052631578947, + "step": 624, + "train_accuracy": 0.609375 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 14.030521392822266, + "learning_rate": 1.3304227534624072e-05, + "loss": 1.4775, + "step": 625 + }, + { + "epoch": 1.6447368421052633, + "step": 625, + "train_accuracy": 0.75 + }, + { + "epoch": 1.6473684210526316, + "grad_norm": 12.589299201965332, + "learning_rate": 1.328410376993282e-05, + "loss": 1.2598, + "step": 626 + }, + { + "epoch": 1.6473684210526316, + "step": 626, + "train_accuracy": 0.625 + }, + { + "epoch": 1.65, + "grad_norm": 4.208714962005615, + "learning_rate": 1.3263965086867752e-05, + "loss": 1.4551, + "step": 627 + }, + { + "epoch": 1.65, + "step": 627, + "train_accuracy": 0.875 + }, + { + "epoch": 1.6526315789473685, + "grad_norm": 8.422722816467285, + "learning_rate": 1.3243811576910873e-05, + "loss": 1.1475, + "step": 628 + }, + { + "epoch": 1.6526315789473685, + "step": 628, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.655263157894737, + "grad_norm": 11.778885841369629, + "learning_rate": 1.3223643331611538e-05, + "loss": 1.2666, + "step": 629 + }, + { + "epoch": 1.655263157894737, + "step": 629, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 3.499476432800293, + "learning_rate": 1.3203460442586052e-05, + "loss": 1.1436, + "step": 630 + }, + { + "epoch": 1.6578947368421053, + "step": 630, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.6605263157894736, + "grad_norm": 9.276093482971191, + "learning_rate": 1.3183263001517224e-05, + "loss": 1.2021, + "step": 631 + }, + { + "epoch": 1.6605263157894736, + "step": 631, + "train_accuracy": 0.875 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 5.010464668273926, + "learning_rate": 1.3163051100153979e-05, + "loss": 0.9712, + "step": 632 + }, + { + "epoch": 1.663157894736842, + "step": 632, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.6657894736842105, + "grad_norm": 4.382296085357666, + "learning_rate": 1.314282483031092e-05, + "loss": 1.3379, + "step": 633 + }, + { + "epoch": 1.6657894736842105, + "step": 633, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.668421052631579, + "grad_norm": 8.575241088867188, + "learning_rate": 1.3122584283867932e-05, + "loss": 1.3179, + "step": 634 + }, + { + "epoch": 1.668421052631579, + "step": 634, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 9.079877853393555, + "learning_rate": 1.3102329552769742e-05, + "loss": 1.1982, + "step": 635 + }, + { + "epoch": 1.6710526315789473, + "step": 635, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.6736842105263157, + "grad_norm": 4.818666458129883, + "learning_rate": 1.3082060729025515e-05, + "loss": 1.3408, + "step": 636 + }, + { + "epoch": 1.6736842105263157, + "step": 636, + "train_accuracy": 0.75 + }, + { + "epoch": 1.6763157894736842, + "grad_norm": 4.9298601150512695, + "learning_rate": 1.3061777904708437e-05, + "loss": 1.3608, + "step": 637 + }, + { + "epoch": 1.6763157894736842, + "step": 637, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.6789473684210527, + "grad_norm": 3.88163685798645, + "learning_rate": 1.3041481171955293e-05, + "loss": 1.2104, + "step": 638 + }, + { + "epoch": 1.6789473684210527, + "step": 638, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.681578947368421, + "grad_norm": 3.570984125137329, + "learning_rate": 1.3021170622966039e-05, + "loss": 1.3003, + "step": 639 + }, + { + "epoch": 1.681578947368421, + "step": 639, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 3.3834519386291504, + "learning_rate": 1.300084635000341e-05, + "loss": 1.293, + "step": 640 + }, + { + "epoch": 1.6842105263157894, + "step": 640, + "train_accuracy": 0.75 + }, + { + "epoch": 1.686842105263158, + "grad_norm": 6.387214660644531, + "learning_rate": 1.298050844539246e-05, + "loss": 1.3428, + "step": 641 + }, + { + "epoch": 1.686842105263158, + "step": 641, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.6894736842105265, + "grad_norm": 5.291227340698242, + "learning_rate": 1.2960157001520193e-05, + "loss": 1.0625, + "step": 642 + }, + { + "epoch": 1.6894736842105265, + "step": 642, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.6921052631578948, + "grad_norm": 8.750785827636719, + "learning_rate": 1.2939792110835094e-05, + "loss": 1.2295, + "step": 643 + }, + { + "epoch": 1.6921052631578948, + "step": 643, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.694736842105263, + "grad_norm": 5.106113910675049, + "learning_rate": 1.2919413865846744e-05, + "loss": 1.4746, + "step": 644 + }, + { + "epoch": 1.694736842105263, + "step": 644, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 3.0670228004455566, + "learning_rate": 1.2899022359125381e-05, + "loss": 1.3691, + "step": 645 + }, + { + "epoch": 1.6973684210526314, + "step": 645, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.7, + "grad_norm": 11.231282234191895, + "learning_rate": 1.2878617683301493e-05, + "loss": 1.3296, + "step": 646 + }, + { + "epoch": 1.7, + "step": 646, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.7026315789473685, + "grad_norm": 4.981436252593994, + "learning_rate": 1.2858199931065382e-05, + "loss": 1.2363, + "step": 647 + }, + { + "epoch": 1.7026315789473685, + "step": 647, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.7052631578947368, + "grad_norm": 5.563715934753418, + "learning_rate": 1.2837769195166757e-05, + "loss": 1.2451, + "step": 648 + }, + { + "epoch": 1.7052631578947368, + "step": 648, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.7078947368421051, + "grad_norm": 4.0916972160339355, + "learning_rate": 1.2817325568414299e-05, + "loss": 1.4819, + "step": 649 + }, + { + "epoch": 1.7078947368421051, + "step": 649, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 2.385584831237793, + "learning_rate": 1.2796869143675254e-05, + "loss": 1.3257, + "step": 650 + }, + { + "epoch": 1.7105263157894737, + "step": 650, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.7131578947368422, + "grad_norm": 6.606967926025391, + "learning_rate": 1.2776400013875006e-05, + "loss": 1.145, + "step": 651 + }, + { + "epoch": 1.7131578947368422, + "step": 651, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.7157894736842105, + "grad_norm": 6.199295520782471, + "learning_rate": 1.2755918271996645e-05, + "loss": 1.3477, + "step": 652 + }, + { + "epoch": 1.7157894736842105, + "step": 652, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.7184210526315788, + "grad_norm": 3.354233980178833, + "learning_rate": 1.2735424011080562e-05, + "loss": 1.2456, + "step": 653 + }, + { + "epoch": 1.7184210526315788, + "step": 653, + "train_accuracy": 0.75 + }, + { + "epoch": 1.7210526315789474, + "grad_norm": 2.5530078411102295, + "learning_rate": 1.2714917324224003e-05, + "loss": 1.3369, + "step": 654 + }, + { + "epoch": 1.7210526315789474, + "step": 654, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 2.3354504108428955, + "learning_rate": 1.2694398304580677e-05, + "loss": 1.2271, + "step": 655 + }, + { + "epoch": 1.723684210526316, + "step": 655, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.7263157894736842, + "grad_norm": 4.240665435791016, + "learning_rate": 1.2673867045360304e-05, + "loss": 1.3633, + "step": 656 + }, + { + "epoch": 1.7263157894736842, + "step": 656, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.7289473684210526, + "grad_norm": 2.5598838329315186, + "learning_rate": 1.2653323639828208e-05, + "loss": 1.3408, + "step": 657 + }, + { + "epoch": 1.7289473684210526, + "step": 657, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.731578947368421, + "grad_norm": 4.531867027282715, + "learning_rate": 1.2632768181304888e-05, + "loss": 1.3018, + "step": 658 + }, + { + "epoch": 1.731578947368421, + "step": 658, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.7342105263157894, + "grad_norm": 2.8843955993652344, + "learning_rate": 1.2612200763165597e-05, + "loss": 1.3086, + "step": 659 + }, + { + "epoch": 1.7342105263157894, + "step": 659, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 2.640134572982788, + "learning_rate": 1.2591621478839911e-05, + "loss": 1.231, + "step": 660 + }, + { + "epoch": 1.736842105263158, + "step": 660, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.7394736842105263, + "grad_norm": 2.6228537559509277, + "learning_rate": 1.2571030421811314e-05, + "loss": 1.3301, + "step": 661 + }, + { + "epoch": 1.7394736842105263, + "step": 661, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.7421052631578946, + "grad_norm": 4.74251127243042, + "learning_rate": 1.2550427685616767e-05, + "loss": 1.3125, + "step": 662 + }, + { + "epoch": 1.7421052631578946, + "step": 662, + "train_accuracy": 0.75 + }, + { + "epoch": 1.7447368421052631, + "grad_norm": 8.167641639709473, + "learning_rate": 1.2529813363846284e-05, + "loss": 1.3848, + "step": 663 + }, + { + "epoch": 1.7447368421052631, + "step": 663, + "train_accuracy": 0.625 + }, + { + "epoch": 1.7473684210526317, + "grad_norm": 3.8640027046203613, + "learning_rate": 1.2509187550142507e-05, + "loss": 1.2764, + "step": 664 + }, + { + "epoch": 1.7473684210526317, + "step": 664, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.75, + "grad_norm": 3.9919190406799316, + "learning_rate": 1.2488550338200285e-05, + "loss": 1.3896, + "step": 665 + }, + { + "epoch": 1.75, + "step": 665, + "train_accuracy": 0.75 + }, + { + "epoch": 1.7526315789473683, + "grad_norm": 6.61153507232666, + "learning_rate": 1.2467901821766241e-05, + "loss": 1.0601, + "step": 666 + }, + { + "epoch": 1.7526315789473683, + "step": 666, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.7552631578947369, + "grad_norm": 3.031205892562866, + "learning_rate": 1.2447242094638349e-05, + "loss": 1.3569, + "step": 667 + }, + { + "epoch": 1.7552631578947369, + "step": 667, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.7578947368421054, + "grad_norm": 5.902942657470703, + "learning_rate": 1.2426571250665517e-05, + "loss": 1.1152, + "step": 668 + }, + { + "epoch": 1.7578947368421054, + "step": 668, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.7605263157894737, + "grad_norm": 2.655332326889038, + "learning_rate": 1.2405889383747144e-05, + "loss": 1.3159, + "step": 669 + }, + { + "epoch": 1.7605263157894737, + "step": 669, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 5.942189693450928, + "learning_rate": 1.2385196587832702e-05, + "loss": 1.2368, + "step": 670 + }, + { + "epoch": 1.763157894736842, + "step": 670, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.7657894736842106, + "grad_norm": 5.477725028991699, + "learning_rate": 1.236449295692131e-05, + "loss": 1.2134, + "step": 671 + }, + { + "epoch": 1.7657894736842106, + "step": 671, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.768421052631579, + "grad_norm": 3.354971408843994, + "learning_rate": 1.234377858506131e-05, + "loss": 1.3511, + "step": 672 + }, + { + "epoch": 1.768421052631579, + "step": 672, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.7710526315789474, + "grad_norm": 3.9351298809051514, + "learning_rate": 1.2323053566349834e-05, + "loss": 1.2578, + "step": 673 + }, + { + "epoch": 1.7710526315789474, + "step": 673, + "train_accuracy": 0.75 + }, + { + "epoch": 1.7736842105263158, + "grad_norm": 4.374091148376465, + "learning_rate": 1.2302317994932373e-05, + "loss": 1.3262, + "step": 674 + }, + { + "epoch": 1.7736842105263158, + "step": 674, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 7.173335552215576, + "learning_rate": 1.2281571965002363e-05, + "loss": 1.0754, + "step": 675 + }, + { + "epoch": 1.776315789473684, + "step": 675, + "train_accuracy": 0.75 + }, + { + "epoch": 1.7789473684210526, + "grad_norm": 6.442569732666016, + "learning_rate": 1.2260815570800743e-05, + "loss": 1.3569, + "step": 676 + }, + { + "epoch": 1.7789473684210526, + "step": 676, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.7815789473684212, + "grad_norm": 3.3768608570098877, + "learning_rate": 1.2240048906615536e-05, + "loss": 1.373, + "step": 677 + }, + { + "epoch": 1.7815789473684212, + "step": 677, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.7842105263157895, + "grad_norm": 8.247380256652832, + "learning_rate": 1.2219272066781416e-05, + "loss": 1.3501, + "step": 678 + }, + { + "epoch": 1.7842105263157895, + "step": 678, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.7868421052631578, + "grad_norm": 3.1177291870117188, + "learning_rate": 1.219848514567928e-05, + "loss": 1.167, + "step": 679 + }, + { + "epoch": 1.7868421052631578, + "step": 679, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 3.9514224529266357, + "learning_rate": 1.2177688237735823e-05, + "loss": 1.4126, + "step": 680 + }, + { + "epoch": 1.7894736842105263, + "step": 680, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.7921052631578949, + "grad_norm": 5.616189002990723, + "learning_rate": 1.2156881437423103e-05, + "loss": 1.3857, + "step": 681 + }, + { + "epoch": 1.7921052631578949, + "step": 681, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.7947368421052632, + "grad_norm": 12.491791725158691, + "learning_rate": 1.2136064839258119e-05, + "loss": 1.3823, + "step": 682 + }, + { + "epoch": 1.7947368421052632, + "step": 682, + "train_accuracy": 0.875 + }, + { + "epoch": 1.7973684210526315, + "grad_norm": 5.0712504386901855, + "learning_rate": 1.2115238537802371e-05, + "loss": 1.3481, + "step": 683 + }, + { + "epoch": 1.7973684210526315, + "step": 683, + "train_accuracy": 0.875 + }, + { + "epoch": 1.8, + "grad_norm": 2.5290355682373047, + "learning_rate": 1.2094402627661447e-05, + "loss": 1.0576, + "step": 684 + }, + { + "epoch": 1.8, + "eval_accuracy": 0.7134296894073486, + "eval_max_score": 5.09375, + "eval_min_score": -5.46875, + "eval_runtime": 151.5395, + "eval_samples_per_second": 18.721, + "eval_steps_per_second": 0.297, + "step": 684 + }, + { + "epoch": 1.8, + "step": 684, + "train_accuracy": 0.875 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 4.08900260925293, + "learning_rate": 1.2073557203484571e-05, + "loss": 1.0898, + "step": 685 + }, + { + "epoch": 1.8026315789473686, + "step": 685, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.805263157894737, + "grad_norm": 3.661402940750122, + "learning_rate": 1.2052702359964201e-05, + "loss": 1.27, + "step": 686 + }, + { + "epoch": 1.805263157894737, + "step": 686, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.8078947368421052, + "grad_norm": 3.4118309020996094, + "learning_rate": 1.2031838191835569e-05, + "loss": 1.416, + "step": 687 + }, + { + "epoch": 1.8078947368421052, + "step": 687, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.8105263157894735, + "grad_norm": 7.626739978790283, + "learning_rate": 1.2010964793876274e-05, + "loss": 1.2495, + "step": 688 + }, + { + "epoch": 1.8105263157894735, + "step": 688, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.813157894736842, + "grad_norm": 3.9986813068389893, + "learning_rate": 1.1990082260905836e-05, + "loss": 1.4053, + "step": 689 + }, + { + "epoch": 1.813157894736842, + "step": 689, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 5.343590259552002, + "learning_rate": 1.1969190687785278e-05, + "loss": 1.1636, + "step": 690 + }, + { + "epoch": 1.8157894736842106, + "step": 690, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.818421052631579, + "grad_norm": 3.1849772930145264, + "learning_rate": 1.1948290169416682e-05, + "loss": 1.3369, + "step": 691 + }, + { + "epoch": 1.818421052631579, + "step": 691, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.8210526315789473, + "grad_norm": 8.547561645507812, + "learning_rate": 1.1927380800742772e-05, + "loss": 1.2246, + "step": 692 + }, + { + "epoch": 1.8210526315789473, + "step": 692, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.8236842105263158, + "grad_norm": 4.1838202476501465, + "learning_rate": 1.1906462676746471e-05, + "loss": 1.1265, + "step": 693 + }, + { + "epoch": 1.8236842105263158, + "step": 693, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.8263157894736843, + "grad_norm": 3.1758480072021484, + "learning_rate": 1.1885535892450473e-05, + "loss": 1.1968, + "step": 694 + }, + { + "epoch": 1.8263157894736843, + "step": 694, + "train_accuracy": 0.75 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 3.9789164066314697, + "learning_rate": 1.1864600542916813e-05, + "loss": 1.4497, + "step": 695 + }, + { + "epoch": 1.8289473684210527, + "step": 695, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.831578947368421, + "grad_norm": 3.6292474269866943, + "learning_rate": 1.1843656723246442e-05, + "loss": 1.4697, + "step": 696 + }, + { + "epoch": 1.831578947368421, + "step": 696, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.8342105263157895, + "grad_norm": 4.709868907928467, + "learning_rate": 1.1822704528578771e-05, + "loss": 1.2129, + "step": 697 + }, + { + "epoch": 1.8342105263157895, + "step": 697, + "train_accuracy": 0.75 + }, + { + "epoch": 1.836842105263158, + "grad_norm": 2.8560163974761963, + "learning_rate": 1.1801744054091275e-05, + "loss": 1.2881, + "step": 698 + }, + { + "epoch": 1.836842105263158, + "step": 698, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.8394736842105264, + "grad_norm": 3.929037094116211, + "learning_rate": 1.1780775394999026e-05, + "loss": 1.2056, + "step": 699 + }, + { + "epoch": 1.8394736842105264, + "step": 699, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 4.683406352996826, + "learning_rate": 1.1759798646554284e-05, + "loss": 1.5181, + "step": 700 + }, + { + "epoch": 1.8421052631578947, + "step": 700, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.844736842105263, + "grad_norm": 8.26242733001709, + "learning_rate": 1.1738813904046044e-05, + "loss": 1.3506, + "step": 701 + }, + { + "epoch": 1.844736842105263, + "step": 701, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.8473684210526315, + "grad_norm": 4.075243949890137, + "learning_rate": 1.1717821262799633e-05, + "loss": 1.2554, + "step": 702 + }, + { + "epoch": 1.8473684210526315, + "step": 702, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.85, + "grad_norm": 3.1833837032318115, + "learning_rate": 1.1696820818176242e-05, + "loss": 1.2778, + "step": 703 + }, + { + "epoch": 1.85, + "step": 703, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.8526315789473684, + "grad_norm": 7.329124450683594, + "learning_rate": 1.1675812665572522e-05, + "loss": 1.1284, + "step": 704 + }, + { + "epoch": 1.8526315789473684, + "step": 704, + "train_accuracy": 0.765625 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 13.962849617004395, + "learning_rate": 1.165479690042013e-05, + "loss": 1.3296, + "step": 705 + }, + { + "epoch": 1.8552631578947367, + "step": 705, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.8578947368421053, + "grad_norm": 5.744749069213867, + "learning_rate": 1.1633773618185302e-05, + "loss": 1.2402, + "step": 706 + }, + { + "epoch": 1.8578947368421053, + "step": 706, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.8605263157894738, + "grad_norm": 3.8666727542877197, + "learning_rate": 1.1612742914368436e-05, + "loss": 1.2944, + "step": 707 + }, + { + "epoch": 1.8605263157894738, + "step": 707, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.8631578947368421, + "grad_norm": 4.552417278289795, + "learning_rate": 1.1591704884503625e-05, + "loss": 1.4844, + "step": 708 + }, + { + "epoch": 1.8631578947368421, + "step": 708, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.8657894736842104, + "grad_norm": 6.385856628417969, + "learning_rate": 1.1570659624158252e-05, + "loss": 1.1055, + "step": 709 + }, + { + "epoch": 1.8657894736842104, + "step": 709, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 9.559865951538086, + "learning_rate": 1.154960722893254e-05, + "loss": 1.3003, + "step": 710 + }, + { + "epoch": 1.868421052631579, + "step": 710, + "train_accuracy": 0.75 + }, + { + "epoch": 1.8710526315789475, + "grad_norm": 7.753382205963135, + "learning_rate": 1.1528547794459128e-05, + "loss": 1.314, + "step": 711 + }, + { + "epoch": 1.8710526315789475, + "step": 711, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.8736842105263158, + "grad_norm": 6.830855369567871, + "learning_rate": 1.1507481416402631e-05, + "loss": 1.2871, + "step": 712 + }, + { + "epoch": 1.8736842105263158, + "step": 712, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.8763157894736842, + "grad_norm": 9.252273559570312, + "learning_rate": 1.14864081904592e-05, + "loss": 1.4844, + "step": 713 + }, + { + "epoch": 1.8763157894736842, + "step": 713, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.8789473684210525, + "grad_norm": 3.4712741374969482, + "learning_rate": 1.1465328212356096e-05, + "loss": 1.2153, + "step": 714 + }, + { + "epoch": 1.8789473684210525, + "step": 714, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 10.48102855682373, + "learning_rate": 1.1444241577851259e-05, + "loss": 1.3228, + "step": 715 + }, + { + "epoch": 1.881578947368421, + "step": 715, + "train_accuracy": 0.75 + }, + { + "epoch": 1.8842105263157896, + "grad_norm": 13.115138053894043, + "learning_rate": 1.1423148382732854e-05, + "loss": 1.3706, + "step": 716 + }, + { + "epoch": 1.8842105263157896, + "step": 716, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.8868421052631579, + "grad_norm": 12.306279182434082, + "learning_rate": 1.1402048722818862e-05, + "loss": 1.3506, + "step": 717 + }, + { + "epoch": 1.8868421052631579, + "step": 717, + "train_accuracy": 0.75 + }, + { + "epoch": 1.8894736842105262, + "grad_norm": 5.738865375518799, + "learning_rate": 1.1380942693956616e-05, + "loss": 1.3467, + "step": 718 + }, + { + "epoch": 1.8894736842105262, + "step": 718, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.8921052631578947, + "grad_norm": 11.618247032165527, + "learning_rate": 1.1359830392022397e-05, + "loss": 1.3203, + "step": 719 + }, + { + "epoch": 1.8921052631578947, + "step": 719, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 10.782801628112793, + "learning_rate": 1.1338711912920966e-05, + "loss": 1.2524, + "step": 720 + }, + { + "epoch": 1.8947368421052633, + "step": 720, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.8973684210526316, + "grad_norm": 10.74484920501709, + "learning_rate": 1.1317587352585158e-05, + "loss": 1.1309, + "step": 721 + }, + { + "epoch": 1.8973684210526316, + "step": 721, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.9, + "grad_norm": 10.578213691711426, + "learning_rate": 1.1296456806975425e-05, + "loss": 1.2578, + "step": 722 + }, + { + "epoch": 1.9, + "step": 722, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.9026315789473685, + "grad_norm": 6.203153610229492, + "learning_rate": 1.1275320372079409e-05, + "loss": 1.0459, + "step": 723 + }, + { + "epoch": 1.9026315789473685, + "step": 723, + "train_accuracy": 0.75 + }, + { + "epoch": 1.905263157894737, + "grad_norm": 3.5795280933380127, + "learning_rate": 1.1254178143911505e-05, + "loss": 1.2227, + "step": 724 + }, + { + "epoch": 1.905263157894737, + "step": 724, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 3.7006609439849854, + "learning_rate": 1.1233030218512424e-05, + "loss": 1.1079, + "step": 725 + }, + { + "epoch": 1.9078947368421053, + "step": 725, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9105263157894736, + "grad_norm": 7.2685418128967285, + "learning_rate": 1.121187669194876e-05, + "loss": 1.1953, + "step": 726 + }, + { + "epoch": 1.9105263157894736, + "step": 726, + "train_accuracy": 0.734375 + }, + { + "epoch": 1.913157894736842, + "grad_norm": 10.036980628967285, + "learning_rate": 1.1190717660312546e-05, + "loss": 1.4414, + "step": 727 + }, + { + "epoch": 1.913157894736842, + "step": 727, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.9157894736842105, + "grad_norm": 18.515457153320312, + "learning_rate": 1.1169553219720828e-05, + "loss": 1.5098, + "step": 728 + }, + { + "epoch": 1.9157894736842105, + "step": 728, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.918421052631579, + "grad_norm": 10.847208023071289, + "learning_rate": 1.1148383466315215e-05, + "loss": 1.2827, + "step": 729 + }, + { + "epoch": 1.918421052631579, + "step": 729, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 2.8338921070098877, + "learning_rate": 1.112720849626146e-05, + "loss": 1.2183, + "step": 730 + }, + { + "epoch": 1.9210526315789473, + "step": 730, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9236842105263157, + "grad_norm": 2.897954225540161, + "learning_rate": 1.1106028405749005e-05, + "loss": 1.228, + "step": 731 + }, + { + "epoch": 1.9236842105263157, + "step": 731, + "train_accuracy": 0.65625 + }, + { + "epoch": 1.9263157894736842, + "grad_norm": 4.838603973388672, + "learning_rate": 1.108484329099056e-05, + "loss": 1.2529, + "step": 732 + }, + { + "epoch": 1.9263157894736842, + "step": 732, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.9289473684210527, + "grad_norm": 15.432750701904297, + "learning_rate": 1.1063653248221647e-05, + "loss": 1.1182, + "step": 733 + }, + { + "epoch": 1.9289473684210527, + "step": 733, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.931578947368421, + "grad_norm": 24.764320373535156, + "learning_rate": 1.1042458373700182e-05, + "loss": 1.8174, + "step": 734 + }, + { + "epoch": 1.931578947368421, + "step": 734, + "train_accuracy": 0.75 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 7.018698692321777, + "learning_rate": 1.102125876370603e-05, + "loss": 1.25, + "step": 735 + }, + { + "epoch": 1.9342105263157894, + "step": 735, + "train_accuracy": 0.875 + }, + { + "epoch": 1.936842105263158, + "grad_norm": 6.331981658935547, + "learning_rate": 1.1000054514540563e-05, + "loss": 1.3584, + "step": 736 + }, + { + "epoch": 1.936842105263158, + "step": 736, + "train_accuracy": 0.703125 + }, + { + "epoch": 1.9394736842105265, + "grad_norm": 4.260552406311035, + "learning_rate": 1.0978845722526233e-05, + "loss": 0.9893, + "step": 737 + }, + { + "epoch": 1.9394736842105265, + "step": 737, + "train_accuracy": 0.828125 + }, + { + "epoch": 1.9421052631578948, + "grad_norm": 7.735119342803955, + "learning_rate": 1.095763248400612e-05, + "loss": 1.2583, + "step": 738 + }, + { + "epoch": 1.9421052631578948, + "step": 738, + "train_accuracy": 0.75 + }, + { + "epoch": 1.944736842105263, + "grad_norm": 8.540193557739258, + "learning_rate": 1.093641489534351e-05, + "loss": 1.2437, + "step": 739 + }, + { + "epoch": 1.944736842105263, + "step": 739, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 22.239505767822266, + "learning_rate": 1.0915193052921444e-05, + "loss": 1.6079, + "step": 740 + }, + { + "epoch": 1.9473684210526314, + "step": 740, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.95, + "grad_norm": 2.21054744720459, + "learning_rate": 1.0893967053142296e-05, + "loss": 1.0818, + "step": 741 + }, + { + "epoch": 1.95, + "step": 741, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.9526315789473685, + "grad_norm": 7.5065131187438965, + "learning_rate": 1.0872736992427313e-05, + "loss": 1.3613, + "step": 742 + }, + { + "epoch": 1.9526315789473685, + "step": 742, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.9552631578947368, + "grad_norm": 3.3990285396575928, + "learning_rate": 1.0851502967216199e-05, + "loss": 1.2324, + "step": 743 + }, + { + "epoch": 1.9552631578947368, + "step": 743, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.9578947368421051, + "grad_norm": 5.191620826721191, + "learning_rate": 1.0830265073966659e-05, + "loss": 1.4365, + "step": 744 + }, + { + "epoch": 1.9578947368421051, + "step": 744, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 12.306520462036133, + "learning_rate": 1.0809023409153975e-05, + "loss": 1.3271, + "step": 745 + }, + { + "epoch": 1.9605263157894737, + "step": 745, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9631578947368422, + "grad_norm": 18.141544342041016, + "learning_rate": 1.078777806927056e-05, + "loss": 1.5117, + "step": 746 + }, + { + "epoch": 1.9631578947368422, + "step": 746, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9657894736842105, + "grad_norm": 17.377368927001953, + "learning_rate": 1.076652915082552e-05, + "loss": 1.5439, + "step": 747 + }, + { + "epoch": 1.9657894736842105, + "step": 747, + "train_accuracy": 0.84375 + }, + { + "epoch": 1.9684210526315788, + "grad_norm": 7.152597427368164, + "learning_rate": 1.0745276750344217e-05, + "loss": 1.1499, + "step": 748 + }, + { + "epoch": 1.9684210526315788, + "step": 748, + "train_accuracy": 0.640625 + }, + { + "epoch": 1.9710526315789474, + "grad_norm": 9.160234451293945, + "learning_rate": 1.0724020964367836e-05, + "loss": 1.4727, + "step": 749 + }, + { + "epoch": 1.9710526315789474, + "step": 749, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 12.51110553741455, + "learning_rate": 1.070276188945293e-05, + "loss": 1.5566, + "step": 750 + }, + { + "epoch": 1.973684210526316, + "step": 750, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.9763157894736842, + "grad_norm": 14.964673042297363, + "learning_rate": 1.0681499622171006e-05, + "loss": 1.2637, + "step": 751 + }, + { + "epoch": 1.9763157894736842, + "step": 751, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.9789473684210526, + "grad_norm": 7.775798797607422, + "learning_rate": 1.0660234259108058e-05, + "loss": 1.0046, + "step": 752 + }, + { + "epoch": 1.9789473684210526, + "step": 752, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.981578947368421, + "grad_norm": 6.908376216888428, + "learning_rate": 1.0638965896864155e-05, + "loss": 1.1677, + "step": 753 + }, + { + "epoch": 1.981578947368421, + "step": 753, + "train_accuracy": 0.8125 + }, + { + "epoch": 1.9842105263157894, + "grad_norm": 4.5304975509643555, + "learning_rate": 1.0617694632052985e-05, + "loss": 1.3921, + "step": 754 + }, + { + "epoch": 1.9842105263157894, + "step": 754, + "train_accuracy": 0.78125 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 3.6965484619140625, + "learning_rate": 1.0596420561301421e-05, + "loss": 1.2329, + "step": 755 + }, + { + "epoch": 1.986842105263158, + "step": 755, + "train_accuracy": 0.6875 + }, + { + "epoch": 1.9894736842105263, + "grad_norm": 10.095308303833008, + "learning_rate": 1.0575143781249085e-05, + "loss": 1.3174, + "step": 756 + }, + { + "epoch": 1.9894736842105263, + "step": 756, + "train_accuracy": 0.71875 + }, + { + "epoch": 1.9921052631578946, + "grad_norm": 18.230331420898438, + "learning_rate": 1.0553864388547898e-05, + "loss": 1.417, + "step": 757 + }, + { + "epoch": 1.9921052631578946, + "step": 757, + "train_accuracy": 0.796875 + }, + { + "epoch": 1.9947368421052631, + "grad_norm": 15.718420028686523, + "learning_rate": 1.0532582479861661e-05, + "loss": 1.2627, + "step": 758 + }, + { + "epoch": 1.9947368421052631, + "step": 758, + "train_accuracy": 0.671875 + }, + { + "epoch": 1.9973684210526317, + "grad_norm": 5.491973400115967, + "learning_rate": 1.05112981518656e-05, + "loss": 1.3433, + "step": 759 + }, + { + "epoch": 1.9973684210526317, + "step": 759, + "train_accuracy": 0.875 + }, + { + "epoch": 2.0, + "grad_norm": 3.911191701889038, + "learning_rate": 1.0490011501245922e-05, + "loss": 0.9978, + "step": 760 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.7095523476600647, + "eval_max_score": 5.4375, + "eval_min_score": -7.53125, + "eval_runtime": 151.3191, + "eval_samples_per_second": 18.748, + "eval_steps_per_second": 0.297, + "step": 760 + }, + { + "epoch": 2.0, + "step": 760, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.0026315789473683, + "grad_norm": 11.913331985473633, + "learning_rate": 1.0468722624699401e-05, + "loss": 1.1938, + "step": 761 + }, + { + "epoch": 2.0026315789473683, + "step": 761, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.0052631578947366, + "grad_norm": 3.7143356800079346, + "learning_rate": 1.0447431618932908e-05, + "loss": 0.9375, + "step": 762 + }, + { + "epoch": 2.0052631578947366, + "step": 762, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.0078947368421054, + "grad_norm": 11.733597755432129, + "learning_rate": 1.0426138580662994e-05, + "loss": 1.0718, + "step": 763 + }, + { + "epoch": 2.0078947368421054, + "step": 763, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.0105263157894737, + "grad_norm": 5.847121715545654, + "learning_rate": 1.040484360661544e-05, + "loss": 0.9795, + "step": 764 + }, + { + "epoch": 2.0105263157894737, + "step": 764, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 10.243609428405762, + "learning_rate": 1.0383546793524821e-05, + "loss": 1.1621, + "step": 765 + }, + { + "epoch": 2.013157894736842, + "step": 765, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.0157894736842104, + "grad_norm": 4.13088846206665, + "learning_rate": 1.0362248238134069e-05, + "loss": 1.2065, + "step": 766 + }, + { + "epoch": 2.0157894736842104, + "step": 766, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.018421052631579, + "grad_norm": 13.116296768188477, + "learning_rate": 1.0340948037194022e-05, + "loss": 1.1763, + "step": 767 + }, + { + "epoch": 2.018421052631579, + "step": 767, + "train_accuracy": 0.875 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 9.769474983215332, + "learning_rate": 1.0319646287463007e-05, + "loss": 0.9854, + "step": 768 + }, + { + "epoch": 2.0210526315789474, + "step": 768, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.0236842105263158, + "grad_norm": 5.3271870613098145, + "learning_rate": 1.0298343085706373e-05, + "loss": 1.1587, + "step": 769 + }, + { + "epoch": 2.0236842105263158, + "step": 769, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 5.8491339683532715, + "learning_rate": 1.0277038528696069e-05, + "loss": 1.1216, + "step": 770 + }, + { + "epoch": 2.026315789473684, + "step": 770, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.028947368421053, + "grad_norm": 4.986546516418457, + "learning_rate": 1.0255732713210207e-05, + "loss": 0.8696, + "step": 771 + }, + { + "epoch": 2.028947368421053, + "step": 771, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 6.160093784332275, + "learning_rate": 1.0234425736032607e-05, + "loss": 0.8853, + "step": 772 + }, + { + "epoch": 2.031578947368421, + "step": 772, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.0342105263157895, + "grad_norm": 5.357452869415283, + "learning_rate": 1.021311769395237e-05, + "loss": 1.061, + "step": 773 + }, + { + "epoch": 2.0342105263157895, + "step": 773, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.036842105263158, + "grad_norm": 4.175728797912598, + "learning_rate": 1.0191808683763435e-05, + "loss": 1.1689, + "step": 774 + }, + { + "epoch": 2.036842105263158, + "step": 774, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 12.665717124938965, + "learning_rate": 1.0170498802264137e-05, + "loss": 1.2319, + "step": 775 + }, + { + "epoch": 2.039473684210526, + "step": 775, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.042105263157895, + "grad_norm": 3.677302360534668, + "learning_rate": 1.0149188146256772e-05, + "loss": 1.1147, + "step": 776 + }, + { + "epoch": 2.042105263157895, + "step": 776, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.044736842105263, + "grad_norm": 9.071887016296387, + "learning_rate": 1.012787681254715e-05, + "loss": 1.4141, + "step": 777 + }, + { + "epoch": 2.044736842105263, + "step": 777, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.0473684210526315, + "grad_norm": 4.74293327331543, + "learning_rate": 1.0106564897944161e-05, + "loss": 1.2603, + "step": 778 + }, + { + "epoch": 2.0473684210526315, + "step": 778, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.05, + "grad_norm": 6.72886323928833, + "learning_rate": 1.0085252499259339e-05, + "loss": 1.2246, + "step": 779 + }, + { + "epoch": 2.05, + "step": 779, + "train_accuracy": 0.875 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 7.605119705200195, + "learning_rate": 1.0063939713306408e-05, + "loss": 1.0317, + "step": 780 + }, + { + "epoch": 2.0526315789473686, + "step": 780, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.055263157894737, + "grad_norm": 3.3540549278259277, + "learning_rate": 1.0042626636900857e-05, + "loss": 0.8687, + "step": 781 + }, + { + "epoch": 2.055263157894737, + "step": 781, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.057894736842105, + "grad_norm": 3.413945436477661, + "learning_rate": 1.0021313366859492e-05, + "loss": 0.9585, + "step": 782 + }, + { + "epoch": 2.057894736842105, + "step": 782, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.0605263157894735, + "grad_norm": 8.70964527130127, + "learning_rate": 1e-05, + "loss": 0.9995, + "step": 783 + }, + { + "epoch": 2.0605263157894735, + "step": 783, + "train_accuracy": 0.75 + }, + { + "epoch": 2.0631578947368423, + "grad_norm": 3.214557409286499, + "learning_rate": 9.97868663314051e-06, + "loss": 1.1606, + "step": 784 + }, + { + "epoch": 2.0631578947368423, + "step": 784, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 3.9605302810668945, + "learning_rate": 9.957373363099145e-06, + "loss": 1.1567, + "step": 785 + }, + { + "epoch": 2.0657894736842106, + "step": 785, + "train_accuracy": 0.875 + }, + { + "epoch": 2.068421052631579, + "grad_norm": 4.05962610244751, + "learning_rate": 9.936060286693592e-06, + "loss": 1.2017, + "step": 786 + }, + { + "epoch": 2.068421052631579, + "step": 786, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.0710526315789473, + "grad_norm": 5.312036991119385, + "learning_rate": 9.914747500740664e-06, + "loss": 1.1226, + "step": 787 + }, + { + "epoch": 2.0710526315789473, + "step": 787, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 3.9408695697784424, + "learning_rate": 9.893435102055837e-06, + "loss": 0.9087, + "step": 788 + }, + { + "epoch": 2.0736842105263156, + "step": 788, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.0763157894736843, + "grad_norm": 7.020488739013672, + "learning_rate": 9.872123187452853e-06, + "loss": 0.832, + "step": 789 + }, + { + "epoch": 2.0763157894736843, + "step": 789, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 11.928730964660645, + "learning_rate": 9.850811853743228e-06, + "loss": 0.9893, + "step": 790 + }, + { + "epoch": 2.0789473684210527, + "step": 790, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.081578947368421, + "grad_norm": 3.8629775047302246, + "learning_rate": 9.829501197735866e-06, + "loss": 1.0938, + "step": 791 + }, + { + "epoch": 2.081578947368421, + "step": 791, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.0842105263157893, + "grad_norm": 5.992745876312256, + "learning_rate": 9.808191316236567e-06, + "loss": 1.0854, + "step": 792 + }, + { + "epoch": 2.0842105263157893, + "step": 792, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.086842105263158, + "grad_norm": 13.866453170776367, + "learning_rate": 9.786882306047634e-06, + "loss": 1.1436, + "step": 793 + }, + { + "epoch": 2.086842105263158, + "step": 793, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.0894736842105264, + "grad_norm": 6.951359748840332, + "learning_rate": 9.765574263967397e-06, + "loss": 0.9795, + "step": 794 + }, + { + "epoch": 2.0894736842105264, + "step": 794, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 3.920224666595459, + "learning_rate": 9.7442672867898e-06, + "loss": 0.9082, + "step": 795 + }, + { + "epoch": 2.0921052631578947, + "step": 795, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.094736842105263, + "grad_norm": 9.016509056091309, + "learning_rate": 9.722961471303933e-06, + "loss": 1.0391, + "step": 796 + }, + { + "epoch": 2.094736842105263, + "step": 796, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.0973684210526318, + "grad_norm": 4.021119594573975, + "learning_rate": 9.701656914293633e-06, + "loss": 1.0806, + "step": 797 + }, + { + "epoch": 2.0973684210526318, + "step": 797, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.1, + "grad_norm": 10.586660385131836, + "learning_rate": 9.680353712536996e-06, + "loss": 1.2656, + "step": 798 + }, + { + "epoch": 2.1, + "step": 798, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.1026315789473684, + "grad_norm": 8.6112699508667, + "learning_rate": 9.659051962805981e-06, + "loss": 1.4502, + "step": 799 + }, + { + "epoch": 2.1026315789473684, + "step": 799, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 11.76105785369873, + "learning_rate": 9.637751761865935e-06, + "loss": 1.4658, + "step": 800 + }, + { + "epoch": 2.1052631578947367, + "step": 800, + "train_accuracy": 0.90625 + }, + { + "epoch": 2.1078947368421055, + "grad_norm": 3.611421823501587, + "learning_rate": 9.616453206475179e-06, + "loss": 0.8423, + "step": 801 + }, + { + "epoch": 2.1078947368421055, + "step": 801, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.110526315789474, + "grad_norm": 7.084579944610596, + "learning_rate": 9.595156393384563e-06, + "loss": 1.0503, + "step": 802 + }, + { + "epoch": 2.110526315789474, + "step": 802, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.113157894736842, + "grad_norm": 7.71038293838501, + "learning_rate": 9.573861419337006e-06, + "loss": 1.0415, + "step": 803 + }, + { + "epoch": 2.113157894736842, + "step": 803, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.1157894736842104, + "grad_norm": 3.812354564666748, + "learning_rate": 9.552568381067094e-06, + "loss": 1.0215, + "step": 804 + }, + { + "epoch": 2.1157894736842104, + "step": 804, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 5.703183650970459, + "learning_rate": 9.531277375300599e-06, + "loss": 0.9658, + "step": 805 + }, + { + "epoch": 2.1184210526315788, + "step": 805, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.1210526315789475, + "grad_norm": 4.764461517333984, + "learning_rate": 9.50998849875408e-06, + "loss": 1.0361, + "step": 806 + }, + { + "epoch": 2.1210526315789475, + "step": 806, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.123684210526316, + "grad_norm": 3.6121408939361572, + "learning_rate": 9.488701848134402e-06, + "loss": 1.1016, + "step": 807 + }, + { + "epoch": 2.123684210526316, + "step": 807, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.126315789473684, + "grad_norm": 5.072896480560303, + "learning_rate": 9.467417520138342e-06, + "loss": 1.1719, + "step": 808 + }, + { + "epoch": 2.126315789473684, + "step": 808, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.1289473684210525, + "grad_norm": 7.073615550994873, + "learning_rate": 9.446135611452104e-06, + "loss": 1.0371, + "step": 809 + }, + { + "epoch": 2.1289473684210525, + "step": 809, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 4.111389636993408, + "learning_rate": 9.42485621875092e-06, + "loss": 1.1528, + "step": 810 + }, + { + "epoch": 2.1315789473684212, + "step": 810, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.1342105263157896, + "grad_norm": 5.092117786407471, + "learning_rate": 9.40357943869858e-06, + "loss": 1.2002, + "step": 811 + }, + { + "epoch": 2.1342105263157896, + "step": 811, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.136842105263158, + "grad_norm": 6.998875617980957, + "learning_rate": 9.382305367947018e-06, + "loss": 1.1992, + "step": 812 + }, + { + "epoch": 2.136842105263158, + "step": 812, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.139473684210526, + "grad_norm": 4.185628890991211, + "learning_rate": 9.361034103135847e-06, + "loss": 0.8735, + "step": 813 + }, + { + "epoch": 2.139473684210526, + "step": 813, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.1421052631578945, + "grad_norm": 5.1156325340271, + "learning_rate": 9.339765740891946e-06, + "loss": 1.0894, + "step": 814 + }, + { + "epoch": 2.1421052631578945, + "step": 814, + "train_accuracy": 0.875 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 7.7576189041137695, + "learning_rate": 9.318500377828998e-06, + "loss": 0.9214, + "step": 815 + }, + { + "epoch": 2.1447368421052633, + "step": 815, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.1473684210526316, + "grad_norm": 4.9661126136779785, + "learning_rate": 9.297238110547075e-06, + "loss": 1.2227, + "step": 816 + }, + { + "epoch": 2.1473684210526316, + "step": 816, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.15, + "grad_norm": 3.897723913192749, + "learning_rate": 9.275979035632167e-06, + "loss": 0.9333, + "step": 817 + }, + { + "epoch": 2.15, + "step": 817, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.1526315789473682, + "grad_norm": 13.26174545288086, + "learning_rate": 9.254723249655784e-06, + "loss": 1.1143, + "step": 818 + }, + { + "epoch": 2.1526315789473682, + "step": 818, + "train_accuracy": 0.75 + }, + { + "epoch": 2.155263157894737, + "grad_norm": 7.426464557647705, + "learning_rate": 9.233470849174484e-06, + "loss": 1.2188, + "step": 819 + }, + { + "epoch": 2.155263157894737, + "step": 819, + "train_accuracy": 0.921875 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 5.145357608795166, + "learning_rate": 9.212221930729442e-06, + "loss": 1.075, + "step": 820 + }, + { + "epoch": 2.1578947368421053, + "step": 820, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.1605263157894736, + "grad_norm": 13.33061695098877, + "learning_rate": 9.190976590846028e-06, + "loss": 1.3589, + "step": 821 + }, + { + "epoch": 2.1605263157894736, + "step": 821, + "train_accuracy": 0.75 + }, + { + "epoch": 2.163157894736842, + "grad_norm": 15.381673812866211, + "learning_rate": 9.169734926033343e-06, + "loss": 1.6045, + "step": 822 + }, + { + "epoch": 2.163157894736842, + "step": 822, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.1657894736842107, + "grad_norm": 5.126620769500732, + "learning_rate": 9.148497032783804e-06, + "loss": 1.2603, + "step": 823 + }, + { + "epoch": 2.1657894736842107, + "step": 823, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.168421052631579, + "grad_norm": 6.827272891998291, + "learning_rate": 9.127263007572688e-06, + "loss": 1.0527, + "step": 824 + }, + { + "epoch": 2.168421052631579, + "step": 824, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 11.240569114685059, + "learning_rate": 9.106032946857708e-06, + "loss": 1.0391, + "step": 825 + }, + { + "epoch": 2.1710526315789473, + "step": 825, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.1736842105263157, + "grad_norm": 5.50578498840332, + "learning_rate": 9.084806947078558e-06, + "loss": 0.9722, + "step": 826 + }, + { + "epoch": 2.1736842105263157, + "step": 826, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.1763157894736844, + "grad_norm": 6.608829021453857, + "learning_rate": 9.063585104656494e-06, + "loss": 1.0566, + "step": 827 + }, + { + "epoch": 2.1763157894736844, + "step": 827, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.1789473684210527, + "grad_norm": 7.2398176193237305, + "learning_rate": 9.042367515993884e-06, + "loss": 1.0808, + "step": 828 + }, + { + "epoch": 2.1789473684210527, + "step": 828, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.181578947368421, + "grad_norm": 5.127984046936035, + "learning_rate": 9.021154277473772e-06, + "loss": 1.063, + "step": 829 + }, + { + "epoch": 2.181578947368421, + "step": 829, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 5.692725658416748, + "learning_rate": 8.999945485459439e-06, + "loss": 1.0215, + "step": 830 + }, + { + "epoch": 2.1842105263157894, + "step": 830, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.1868421052631577, + "grad_norm": 9.340150833129883, + "learning_rate": 8.978741236293972e-06, + "loss": 1.0005, + "step": 831 + }, + { + "epoch": 2.1868421052631577, + "step": 831, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.1894736842105265, + "grad_norm": 16.168376922607422, + "learning_rate": 8.957541626299821e-06, + "loss": 1.3848, + "step": 832 + }, + { + "epoch": 2.1894736842105265, + "step": 832, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.192105263157895, + "grad_norm": 12.331050872802734, + "learning_rate": 8.936346751778358e-06, + "loss": 1.1753, + "step": 833 + }, + { + "epoch": 2.192105263157895, + "step": 833, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.194736842105263, + "grad_norm": 5.23305082321167, + "learning_rate": 8.915156709009445e-06, + "loss": 1.0239, + "step": 834 + }, + { + "epoch": 2.194736842105263, + "step": 834, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 4.159104347229004, + "learning_rate": 8.893971594250998e-06, + "loss": 1.105, + "step": 835 + }, + { + "epoch": 2.1973684210526314, + "step": 835, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.2, + "grad_norm": 9.729438781738281, + "learning_rate": 8.872791503738543e-06, + "loss": 1.5513, + "step": 836 + }, + { + "epoch": 2.2, + "eval_accuracy": 0.7173070311546326, + "eval_max_score": 8.125, + "eval_min_score": -8.625, + "eval_runtime": 151.7446, + "eval_samples_per_second": 18.696, + "eval_steps_per_second": 0.297, + "step": 836 + }, + { + "epoch": 2.2, + "step": 836, + "train_accuracy": 0.90625 + }, + { + "epoch": 2.2026315789473685, + "grad_norm": 6.211835861206055, + "learning_rate": 8.851616533684788e-06, + "loss": 0.9358, + "step": 837 + }, + { + "epoch": 2.2026315789473685, + "step": 837, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.205263157894737, + "grad_norm": 4.620204448699951, + "learning_rate": 8.830446780279175e-06, + "loss": 1.1157, + "step": 838 + }, + { + "epoch": 2.205263157894737, + "step": 838, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.207894736842105, + "grad_norm": 11.254515647888184, + "learning_rate": 8.809282339687457e-06, + "loss": 1.0879, + "step": 839 + }, + { + "epoch": 2.207894736842105, + "step": 839, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 4.592057704925537, + "learning_rate": 8.788123308051244e-06, + "loss": 1.0962, + "step": 840 + }, + { + "epoch": 2.2105263157894735, + "step": 840, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.213157894736842, + "grad_norm": 7.248822212219238, + "learning_rate": 8.766969781487579e-06, + "loss": 1.0967, + "step": 841 + }, + { + "epoch": 2.213157894736842, + "step": 841, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.2157894736842105, + "grad_norm": 4.9665985107421875, + "learning_rate": 8.7458218560885e-06, + "loss": 1.0249, + "step": 842 + }, + { + "epoch": 2.2157894736842105, + "step": 842, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.218421052631579, + "grad_norm": 7.975915431976318, + "learning_rate": 8.724679627920595e-06, + "loss": 1.2192, + "step": 843 + }, + { + "epoch": 2.218421052631579, + "step": 843, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.221052631578947, + "grad_norm": 3.6533076763153076, + "learning_rate": 8.703543193024578e-06, + "loss": 1.0068, + "step": 844 + }, + { + "epoch": 2.221052631578947, + "step": 844, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 4.886638164520264, + "learning_rate": 8.682412647414845e-06, + "loss": 0.9976, + "step": 845 + }, + { + "epoch": 2.223684210526316, + "step": 845, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.2263157894736842, + "grad_norm": 8.30583381652832, + "learning_rate": 8.661288087079038e-06, + "loss": 1.0015, + "step": 846 + }, + { + "epoch": 2.2263157894736842, + "step": 846, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.2289473684210526, + "grad_norm": 16.96817970275879, + "learning_rate": 8.640169607977606e-06, + "loss": 1.2681, + "step": 847 + }, + { + "epoch": 2.2289473684210526, + "step": 847, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.231578947368421, + "grad_norm": 8.309298515319824, + "learning_rate": 8.619057306043388e-06, + "loss": 1.1211, + "step": 848 + }, + { + "epoch": 2.231578947368421, + "step": 848, + "train_accuracy": 0.75 + }, + { + "epoch": 2.2342105263157896, + "grad_norm": 4.597687244415283, + "learning_rate": 8.597951277181143e-06, + "loss": 0.9634, + "step": 849 + }, + { + "epoch": 2.2342105263157896, + "step": 849, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 6.457274436950684, + "learning_rate": 8.576851617267151e-06, + "loss": 1.0137, + "step": 850 + }, + { + "epoch": 2.236842105263158, + "step": 850, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.2394736842105263, + "grad_norm": 9.658507347106934, + "learning_rate": 8.555758422148746e-06, + "loss": 1.0164, + "step": 851 + }, + { + "epoch": 2.2394736842105263, + "step": 851, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.2421052631578946, + "grad_norm": 12.8440580368042, + "learning_rate": 8.534671787643909e-06, + "loss": 1.1245, + "step": 852 + }, + { + "epoch": 2.2421052631578946, + "step": 852, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.2447368421052634, + "grad_norm": 9.525819778442383, + "learning_rate": 8.513591809540804e-06, + "loss": 1.1011, + "step": 853 + }, + { + "epoch": 2.2447368421052634, + "step": 853, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.2473684210526317, + "grad_norm": 10.477482795715332, + "learning_rate": 8.492518583597374e-06, + "loss": 1.0659, + "step": 854 + }, + { + "epoch": 2.2473684210526317, + "step": 854, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.25, + "grad_norm": 5.5181708335876465, + "learning_rate": 8.471452205540873e-06, + "loss": 1.1631, + "step": 855 + }, + { + "epoch": 2.25, + "step": 855, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.2526315789473683, + "grad_norm": 4.574328422546387, + "learning_rate": 8.450392771067463e-06, + "loss": 0.8623, + "step": 856 + }, + { + "epoch": 2.2526315789473683, + "step": 856, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.2552631578947366, + "grad_norm": 16.64838981628418, + "learning_rate": 8.429340375841753e-06, + "loss": 1.3433, + "step": 857 + }, + { + "epoch": 2.2552631578947366, + "step": 857, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.2578947368421054, + "grad_norm": 7.746898174285889, + "learning_rate": 8.408295115496376e-06, + "loss": 1.147, + "step": 858 + }, + { + "epoch": 2.2578947368421054, + "step": 858, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.2605263157894737, + "grad_norm": 4.429878234863281, + "learning_rate": 8.387257085631567e-06, + "loss": 1.0591, + "step": 859 + }, + { + "epoch": 2.2605263157894737, + "step": 859, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 8.77043342590332, + "learning_rate": 8.366226381814698e-06, + "loss": 1.0776, + "step": 860 + }, + { + "epoch": 2.263157894736842, + "step": 860, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.2657894736842104, + "grad_norm": 11.112512588500977, + "learning_rate": 8.345203099579874e-06, + "loss": 1.1587, + "step": 861 + }, + { + "epoch": 2.2657894736842104, + "step": 861, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.268421052631579, + "grad_norm": 3.823777914047241, + "learning_rate": 8.32418733442748e-06, + "loss": 0.9067, + "step": 862 + }, + { + "epoch": 2.268421052631579, + "step": 862, + "train_accuracy": 0.75 + }, + { + "epoch": 2.2710526315789474, + "grad_norm": 5.121189117431641, + "learning_rate": 8.30317918182376e-06, + "loss": 1.3232, + "step": 863 + }, + { + "epoch": 2.2710526315789474, + "step": 863, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.2736842105263158, + "grad_norm": 13.663050651550293, + "learning_rate": 8.282178737200369e-06, + "loss": 1.1323, + "step": 864 + }, + { + "epoch": 2.2736842105263158, + "step": 864, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 11.7109956741333, + "learning_rate": 8.261186095953959e-06, + "loss": 1.2798, + "step": 865 + }, + { + "epoch": 2.276315789473684, + "step": 865, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.2789473684210524, + "grad_norm": 3.7948272228240967, + "learning_rate": 8.240201353445721e-06, + "loss": 1.0508, + "step": 866 + }, + { + "epoch": 2.2789473684210524, + "step": 866, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.281578947368421, + "grad_norm": 4.340938568115234, + "learning_rate": 8.219224605000979e-06, + "loss": 1.1313, + "step": 867 + }, + { + "epoch": 2.281578947368421, + "step": 867, + "train_accuracy": 0.75 + }, + { + "epoch": 2.2842105263157895, + "grad_norm": 9.902191162109375, + "learning_rate": 8.198255945908727e-06, + "loss": 1.1665, + "step": 868 + }, + { + "epoch": 2.2842105263157895, + "step": 868, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.286842105263158, + "grad_norm": 3.9820899963378906, + "learning_rate": 8.177295471421232e-06, + "loss": 1.2061, + "step": 869 + }, + { + "epoch": 2.286842105263158, + "step": 869, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 11.108955383300781, + "learning_rate": 8.156343276753563e-06, + "loss": 1.2837, + "step": 870 + }, + { + "epoch": 2.2894736842105265, + "step": 870, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.292105263157895, + "grad_norm": 10.159497261047363, + "learning_rate": 8.13539945708319e-06, + "loss": 1.0798, + "step": 871 + }, + { + "epoch": 2.292105263157895, + "step": 871, + "train_accuracy": 0.90625 + }, + { + "epoch": 2.294736842105263, + "grad_norm": 5.310483932495117, + "learning_rate": 8.114464107549532e-06, + "loss": 0.9238, + "step": 872 + }, + { + "epoch": 2.294736842105263, + "step": 872, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.2973684210526315, + "grad_norm": 3.798041582107544, + "learning_rate": 8.09353732325353e-06, + "loss": 1.0635, + "step": 873 + }, + { + "epoch": 2.2973684210526315, + "step": 873, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.3, + "grad_norm": 7.455261707305908, + "learning_rate": 8.072619199257232e-06, + "loss": 1.1836, + "step": 874 + }, + { + "epoch": 2.3, + "step": 874, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 5.471553325653076, + "learning_rate": 8.05170983058332e-06, + "loss": 1.064, + "step": 875 + }, + { + "epoch": 2.3026315789473686, + "step": 875, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.305263157894737, + "grad_norm": 4.1305694580078125, + "learning_rate": 8.030809312214726e-06, + "loss": 1.1958, + "step": 876 + }, + { + "epoch": 2.305263157894737, + "step": 876, + "train_accuracy": 0.75 + }, + { + "epoch": 2.307894736842105, + "grad_norm": 5.602186679840088, + "learning_rate": 8.009917739094164e-06, + "loss": 1.1851, + "step": 877 + }, + { + "epoch": 2.307894736842105, + "step": 877, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.3105263157894735, + "grad_norm": 6.608561038970947, + "learning_rate": 7.98903520612373e-06, + "loss": 1.2207, + "step": 878 + }, + { + "epoch": 2.3105263157894735, + "step": 878, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.3131578947368423, + "grad_norm": 7.152756214141846, + "learning_rate": 7.968161808164431e-06, + "loss": 1.1006, + "step": 879 + }, + { + "epoch": 2.3131578947368423, + "step": 879, + "train_accuracy": 0.75 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 4.266940593719482, + "learning_rate": 7.9472976400358e-06, + "loss": 1.2261, + "step": 880 + }, + { + "epoch": 2.3157894736842106, + "step": 880, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.318421052631579, + "grad_norm": 11.65020751953125, + "learning_rate": 7.926442796515429e-06, + "loss": 1.501, + "step": 881 + }, + { + "epoch": 2.318421052631579, + "step": 881, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.3210526315789473, + "grad_norm": 4.304981708526611, + "learning_rate": 7.905597372338558e-06, + "loss": 1.0825, + "step": 882 + }, + { + "epoch": 2.3210526315789473, + "step": 882, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3236842105263156, + "grad_norm": 4.352926731109619, + "learning_rate": 7.88476146219763e-06, + "loss": 1.0073, + "step": 883 + }, + { + "epoch": 2.3236842105263156, + "step": 883, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.3263157894736843, + "grad_norm": 5.145933628082275, + "learning_rate": 7.863935160741886e-06, + "loss": 1.2544, + "step": 884 + }, + { + "epoch": 2.3263157894736843, + "step": 884, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 4.757798194885254, + "learning_rate": 7.843118562576899e-06, + "loss": 0.916, + "step": 885 + }, + { + "epoch": 2.3289473684210527, + "step": 885, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.331578947368421, + "grad_norm": 5.980506896972656, + "learning_rate": 7.822311762264182e-06, + "loss": 1.3086, + "step": 886 + }, + { + "epoch": 2.331578947368421, + "step": 886, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3342105263157893, + "grad_norm": 4.120883941650391, + "learning_rate": 7.801514854320724e-06, + "loss": 1.1172, + "step": 887 + }, + { + "epoch": 2.3342105263157893, + "step": 887, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.336842105263158, + "grad_norm": 3.8416996002197266, + "learning_rate": 7.780727933218589e-06, + "loss": 1.0269, + "step": 888 + }, + { + "epoch": 2.336842105263158, + "step": 888, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3394736842105264, + "grad_norm": 4.1616597175598145, + "learning_rate": 7.759951093384467e-06, + "loss": 1.3921, + "step": 889 + }, + { + "epoch": 2.3394736842105264, + "step": 889, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 7.139702796936035, + "learning_rate": 7.739184429199262e-06, + "loss": 1.0664, + "step": 890 + }, + { + "epoch": 2.3421052631578947, + "step": 890, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.344736842105263, + "grad_norm": 3.5956788063049316, + "learning_rate": 7.71842803499764e-06, + "loss": 1.0742, + "step": 891 + }, + { + "epoch": 2.344736842105263, + "step": 891, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.3473684210526318, + "grad_norm": 5.877930164337158, + "learning_rate": 7.697682005067627e-06, + "loss": 1.0059, + "step": 892 + }, + { + "epoch": 2.3473684210526318, + "step": 892, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.35, + "grad_norm": 8.738607406616211, + "learning_rate": 7.67694643365017e-06, + "loss": 0.9917, + "step": 893 + }, + { + "epoch": 2.35, + "step": 893, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.3526315789473684, + "grad_norm": 5.848970413208008, + "learning_rate": 7.65622141493869e-06, + "loss": 1.3345, + "step": 894 + }, + { + "epoch": 2.3526315789473684, + "step": 894, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 8.351785659790039, + "learning_rate": 7.635507043078692e-06, + "loss": 1.1919, + "step": 895 + }, + { + "epoch": 2.3552631578947367, + "step": 895, + "train_accuracy": 0.75 + }, + { + "epoch": 2.3578947368421055, + "grad_norm": 7.921352386474609, + "learning_rate": 7.614803412167299e-06, + "loss": 1.0029, + "step": 896 + }, + { + "epoch": 2.3578947368421055, + "step": 896, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.360526315789474, + "grad_norm": 4.196382522583008, + "learning_rate": 7.594110616252859e-06, + "loss": 0.9062, + "step": 897 + }, + { + "epoch": 2.360526315789474, + "step": 897, + "train_accuracy": 0.75 + }, + { + "epoch": 2.363157894736842, + "grad_norm": 5.209059238433838, + "learning_rate": 7.573428749334482e-06, + "loss": 1.1943, + "step": 898 + }, + { + "epoch": 2.363157894736842, + "step": 898, + "train_accuracy": 0.875 + }, + { + "epoch": 2.3657894736842104, + "grad_norm": 3.660780191421509, + "learning_rate": 7.552757905361652e-06, + "loss": 0.8054, + "step": 899 + }, + { + "epoch": 2.3657894736842104, + "step": 899, + "train_accuracy": 0.75 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 6.490506172180176, + "learning_rate": 7.532098178233761e-06, + "loss": 1.1636, + "step": 900 + }, + { + "epoch": 2.3684210526315788, + "step": 900, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.3710526315789475, + "grad_norm": 4.385256767272949, + "learning_rate": 7.5114496617997205e-06, + "loss": 1.1226, + "step": 901 + }, + { + "epoch": 2.3710526315789475, + "step": 901, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.373684210526316, + "grad_norm": 5.040467262268066, + "learning_rate": 7.4908124498574964e-06, + "loss": 0.855, + "step": 902 + }, + { + "epoch": 2.373684210526316, + "step": 902, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.376315789473684, + "grad_norm": 8.077067375183105, + "learning_rate": 7.470186636153722e-06, + "loss": 1.1592, + "step": 903 + }, + { + "epoch": 2.376315789473684, + "step": 903, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.3789473684210525, + "grad_norm": 4.04118537902832, + "learning_rate": 7.449572314383237e-06, + "loss": 1.1375, + "step": 904 + }, + { + "epoch": 2.3789473684210525, + "step": 904, + "train_accuracy": 0.671875 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 6.724638938903809, + "learning_rate": 7.428969578188692e-06, + "loss": 1.1167, + "step": 905 + }, + { + "epoch": 2.3815789473684212, + "step": 905, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.3842105263157896, + "grad_norm": 8.281147003173828, + "learning_rate": 7.408378521160091e-06, + "loss": 1.064, + "step": 906 + }, + { + "epoch": 2.3842105263157896, + "step": 906, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.386842105263158, + "grad_norm": 5.431487083435059, + "learning_rate": 7.387799236834408e-06, + "loss": 0.96, + "step": 907 + }, + { + "epoch": 2.386842105263158, + "step": 907, + "train_accuracy": 0.75 + }, + { + "epoch": 2.389473684210526, + "grad_norm": 5.093214511871338, + "learning_rate": 7.367231818695113e-06, + "loss": 1.0693, + "step": 908 + }, + { + "epoch": 2.389473684210526, + "step": 908, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.3921052631578945, + "grad_norm": 7.53461217880249, + "learning_rate": 7.346676360171792e-06, + "loss": 1.4072, + "step": 909 + }, + { + "epoch": 2.3921052631578945, + "step": 909, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 7.927123069763184, + "learning_rate": 7.326132954639699e-06, + "loss": 0.9409, + "step": 910 + }, + { + "epoch": 2.3947368421052633, + "step": 910, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.3973684210526316, + "grad_norm": 4.111649513244629, + "learning_rate": 7.3056016954193235e-06, + "loss": 1.1978, + "step": 911 + }, + { + "epoch": 2.3973684210526316, + "step": 911, + "train_accuracy": 0.875 + }, + { + "epoch": 2.4, + "grad_norm": 8.101927757263184, + "learning_rate": 7.285082675775998e-06, + "loss": 0.9675, + "step": 912 + }, + { + "epoch": 2.4, + "eval_accuracy": 0.709199845790863, + "eval_max_score": 7.4375, + "eval_min_score": -9.75, + "eval_runtime": 151.321, + "eval_samples_per_second": 18.748, + "eval_steps_per_second": 0.297, + "step": 912 + }, + { + "epoch": 2.4, + "step": 912, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.4026315789473682, + "grad_norm": 6.553269386291504, + "learning_rate": 7.26457598891944e-06, + "loss": 1.0269, + "step": 913 + }, + { + "epoch": 2.4026315789473682, + "step": 913, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.405263157894737, + "grad_norm": 11.197577476501465, + "learning_rate": 7.2440817280033555e-06, + "loss": 1.3179, + "step": 914 + }, + { + "epoch": 2.405263157894737, + "step": 914, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 12.725153923034668, + "learning_rate": 7.223599986124994e-06, + "loss": 1.3213, + "step": 915 + }, + { + "epoch": 2.4078947368421053, + "step": 915, + "train_accuracy": 0.875 + }, + { + "epoch": 2.4105263157894736, + "grad_norm": 3.8353846073150635, + "learning_rate": 7.20313085632475e-06, + "loss": 0.9346, + "step": 916 + }, + { + "epoch": 2.4105263157894736, + "step": 916, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.413157894736842, + "grad_norm": 4.217707633972168, + "learning_rate": 7.182674431585703e-06, + "loss": 1.0293, + "step": 917 + }, + { + "epoch": 2.413157894736842, + "step": 917, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.4157894736842107, + "grad_norm": 3.886138916015625, + "learning_rate": 7.162230804833249e-06, + "loss": 1.249, + "step": 918 + }, + { + "epoch": 2.4157894736842107, + "step": 918, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.418421052631579, + "grad_norm": 4.176194190979004, + "learning_rate": 7.14180006893462e-06, + "loss": 1.1792, + "step": 919 + }, + { + "epoch": 2.418421052631579, + "step": 919, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 4.922031402587891, + "learning_rate": 7.121382316698511e-06, + "loss": 1.1738, + "step": 920 + }, + { + "epoch": 2.4210526315789473, + "step": 920, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.4236842105263157, + "grad_norm": 10.639328002929688, + "learning_rate": 7.1009776408746205e-06, + "loss": 1.0972, + "step": 921 + }, + { + "epoch": 2.4236842105263157, + "step": 921, + "train_accuracy": 0.875 + }, + { + "epoch": 2.4263157894736844, + "grad_norm": 4.55307149887085, + "learning_rate": 7.08058613415326e-06, + "loss": 1.1309, + "step": 922 + }, + { + "epoch": 2.4263157894736844, + "step": 922, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.4289473684210527, + "grad_norm": 3.423119306564331, + "learning_rate": 7.060207889164909e-06, + "loss": 1.1821, + "step": 923 + }, + { + "epoch": 2.4289473684210527, + "step": 923, + "train_accuracy": 0.75 + }, + { + "epoch": 2.431578947368421, + "grad_norm": 3.477151870727539, + "learning_rate": 7.03984299847981e-06, + "loss": 1.1514, + "step": 924 + }, + { + "epoch": 2.431578947368421, + "step": 924, + "train_accuracy": 0.875 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 3.4678921699523926, + "learning_rate": 7.01949155460754e-06, + "loss": 0.8901, + "step": 925 + }, + { + "epoch": 2.4342105263157894, + "step": 925, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.4368421052631577, + "grad_norm": 3.4634525775909424, + "learning_rate": 6.999153649996595e-06, + "loss": 1.0435, + "step": 926 + }, + { + "epoch": 2.4368421052631577, + "step": 926, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.4394736842105265, + "grad_norm": 8.086597442626953, + "learning_rate": 6.978829377033962e-06, + "loss": 1.0676, + "step": 927 + }, + { + "epoch": 2.4394736842105265, + "step": 927, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.442105263157895, + "grad_norm": 6.657654762268066, + "learning_rate": 6.9585188280447094e-06, + "loss": 1.2402, + "step": 928 + }, + { + "epoch": 2.442105263157895, + "step": 928, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.444736842105263, + "grad_norm": 7.919809818267822, + "learning_rate": 6.938222095291565e-06, + "loss": 1.1038, + "step": 929 + }, + { + "epoch": 2.444736842105263, + "step": 929, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 5.626051425933838, + "learning_rate": 6.917939270974485e-06, + "loss": 1.0713, + "step": 930 + }, + { + "epoch": 2.4473684210526314, + "step": 930, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.45, + "grad_norm": 3.9721519947052, + "learning_rate": 6.897670447230263e-06, + "loss": 1.041, + "step": 931 + }, + { + "epoch": 2.45, + "step": 931, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.4526315789473685, + "grad_norm": 5.174097061157227, + "learning_rate": 6.87741571613207e-06, + "loss": 1.2671, + "step": 932 + }, + { + "epoch": 2.4526315789473685, + "step": 932, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.455263157894737, + "grad_norm": 8.099953651428223, + "learning_rate": 6.8571751696890835e-06, + "loss": 1.0227, + "step": 933 + }, + { + "epoch": 2.455263157894737, + "step": 933, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.457894736842105, + "grad_norm": 11.488319396972656, + "learning_rate": 6.836948899846024e-06, + "loss": 1.146, + "step": 934 + }, + { + "epoch": 2.457894736842105, + "step": 934, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 8.036605834960938, + "learning_rate": 6.816736998482778e-06, + "loss": 1.0474, + "step": 935 + }, + { + "epoch": 2.4605263157894735, + "step": 935, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.463157894736842, + "grad_norm": 5.297451019287109, + "learning_rate": 6.796539557413951e-06, + "loss": 1.1514, + "step": 936 + }, + { + "epoch": 2.463157894736842, + "step": 936, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.4657894736842105, + "grad_norm": 5.096290111541748, + "learning_rate": 6.776356668388464e-06, + "loss": 1.0708, + "step": 937 + }, + { + "epoch": 2.4657894736842105, + "step": 937, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.468421052631579, + "grad_norm": 4.746647834777832, + "learning_rate": 6.756188423089131e-06, + "loss": 1.1162, + "step": 938 + }, + { + "epoch": 2.468421052631579, + "step": 938, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.4710526315789476, + "grad_norm": 4.6044158935546875, + "learning_rate": 6.736034913132253e-06, + "loss": 0.9243, + "step": 939 + }, + { + "epoch": 2.4710526315789476, + "step": 939, + "train_accuracy": 0.875 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 5.2404866218566895, + "learning_rate": 6.715896230067183e-06, + "loss": 0.9922, + "step": 940 + }, + { + "epoch": 2.473684210526316, + "step": 940, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.4763157894736842, + "grad_norm": 4.998322010040283, + "learning_rate": 6.695772465375929e-06, + "loss": 1.0859, + "step": 941 + }, + { + "epoch": 2.4763157894736842, + "step": 941, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.4789473684210526, + "grad_norm": 6.812887668609619, + "learning_rate": 6.675663710472733e-06, + "loss": 0.8792, + "step": 942 + }, + { + "epoch": 2.4789473684210526, + "step": 942, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.481578947368421, + "grad_norm": 10.387646675109863, + "learning_rate": 6.655570056703646e-06, + "loss": 1.3091, + "step": 943 + }, + { + "epoch": 2.481578947368421, + "step": 943, + "train_accuracy": 0.75 + }, + { + "epoch": 2.4842105263157896, + "grad_norm": 5.717840194702148, + "learning_rate": 6.635491595346122e-06, + "loss": 1.2109, + "step": 944 + }, + { + "epoch": 2.4842105263157896, + "step": 944, + "train_accuracy": 0.75 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 8.672900199890137, + "learning_rate": 6.615428417608611e-06, + "loss": 1.2695, + "step": 945 + }, + { + "epoch": 2.486842105263158, + "step": 945, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.4894736842105263, + "grad_norm": 4.839511871337891, + "learning_rate": 6.5953806146301245e-06, + "loss": 1.0244, + "step": 946 + }, + { + "epoch": 2.4894736842105263, + "step": 946, + "train_accuracy": 0.75 + }, + { + "epoch": 2.4921052631578946, + "grad_norm": 4.422204971313477, + "learning_rate": 6.575348277479838e-06, + "loss": 1.0557, + "step": 947 + }, + { + "epoch": 2.4921052631578946, + "step": 947, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.4947368421052634, + "grad_norm": 11.49475383758545, + "learning_rate": 6.555331497156671e-06, + "loss": 1.1226, + "step": 948 + }, + { + "epoch": 2.4947368421052634, + "step": 948, + "train_accuracy": 0.875 + }, + { + "epoch": 2.4973684210526317, + "grad_norm": 6.345265865325928, + "learning_rate": 6.535330364588875e-06, + "loss": 1.0649, + "step": 949 + }, + { + "epoch": 2.4973684210526317, + "step": 949, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.5, + "grad_norm": 7.858110427856445, + "learning_rate": 6.515344970633617e-06, + "loss": 1.3809, + "step": 950 + }, + { + "epoch": 2.5, + "step": 950, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.5026315789473683, + "grad_norm": 9.39013957977295, + "learning_rate": 6.495375406076574e-06, + "loss": 0.9399, + "step": 951 + }, + { + "epoch": 2.5026315789473683, + "step": 951, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.5052631578947366, + "grad_norm": 4.707727432250977, + "learning_rate": 6.4754217616315125e-06, + "loss": 1.1304, + "step": 952 + }, + { + "epoch": 2.5052631578947366, + "step": 952, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.5078947368421054, + "grad_norm": 3.284626007080078, + "learning_rate": 6.455484127939885e-06, + "loss": 0.9111, + "step": 953 + }, + { + "epoch": 2.5078947368421054, + "step": 953, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.5105263157894737, + "grad_norm": 11.840646743774414, + "learning_rate": 6.4355625955704096e-06, + "loss": 1.1851, + "step": 954 + }, + { + "epoch": 2.5105263157894737, + "step": 954, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 6.086373329162598, + "learning_rate": 6.415657255018662e-06, + "loss": 1.3726, + "step": 955 + }, + { + "epoch": 2.513157894736842, + "step": 955, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.515789473684211, + "grad_norm": 10.698202133178711, + "learning_rate": 6.3957681967066695e-06, + "loss": 1.5376, + "step": 956 + }, + { + "epoch": 2.515789473684211, + "step": 956, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.518421052631579, + "grad_norm": 3.7446768283843994, + "learning_rate": 6.375895510982491e-06, + "loss": 0.9751, + "step": 957 + }, + { + "epoch": 2.518421052631579, + "step": 957, + "train_accuracy": 0.921875 + }, + { + "epoch": 2.5210526315789474, + "grad_norm": 4.004354953765869, + "learning_rate": 6.356039288119815e-06, + "loss": 0.9243, + "step": 958 + }, + { + "epoch": 2.5210526315789474, + "step": 958, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.5236842105263158, + "grad_norm": 4.783187389373779, + "learning_rate": 6.336199618317538e-06, + "loss": 1.0728, + "step": 959 + }, + { + "epoch": 2.5236842105263158, + "step": 959, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 4.596762180328369, + "learning_rate": 6.316376591699378e-06, + "loss": 0.9634, + "step": 960 + }, + { + "epoch": 2.526315789473684, + "step": 960, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.5289473684210524, + "grad_norm": 5.435537815093994, + "learning_rate": 6.2965702983134314e-06, + "loss": 1.0088, + "step": 961 + }, + { + "epoch": 2.5289473684210524, + "step": 961, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.531578947368421, + "grad_norm": 6.328050136566162, + "learning_rate": 6.276780828131798e-06, + "loss": 1.0698, + "step": 962 + }, + { + "epoch": 2.531578947368421, + "step": 962, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.5342105263157895, + "grad_norm": 5.2449798583984375, + "learning_rate": 6.257008271050141e-06, + "loss": 0.9053, + "step": 963 + }, + { + "epoch": 2.5342105263157895, + "step": 963, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.536842105263158, + "grad_norm": 4.180750370025635, + "learning_rate": 6.237252716887307e-06, + "loss": 1.2451, + "step": 964 + }, + { + "epoch": 2.536842105263158, + "step": 964, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 7.252860069274902, + "learning_rate": 6.217514255384907e-06, + "loss": 1.1162, + "step": 965 + }, + { + "epoch": 2.5394736842105265, + "step": 965, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.542105263157895, + "grad_norm": 7.941829681396484, + "learning_rate": 6.197792976206887e-06, + "loss": 1.0415, + "step": 966 + }, + { + "epoch": 2.542105263157895, + "step": 966, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.544736842105263, + "grad_norm": 5.3874359130859375, + "learning_rate": 6.178088968939166e-06, + "loss": 1.0444, + "step": 967 + }, + { + "epoch": 2.544736842105263, + "step": 967, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.5473684210526315, + "grad_norm": 3.811061382293701, + "learning_rate": 6.158402323089184e-06, + "loss": 1.0552, + "step": 968 + }, + { + "epoch": 2.5473684210526315, + "step": 968, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.55, + "grad_norm": 4.76802921295166, + "learning_rate": 6.138733128085529e-06, + "loss": 0.958, + "step": 969 + }, + { + "epoch": 2.55, + "step": 969, + "train_accuracy": 0.90625 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 5.948366165161133, + "learning_rate": 6.119081473277502e-06, + "loss": 0.9478, + "step": 970 + }, + { + "epoch": 2.5526315789473686, + "step": 970, + "train_accuracy": 0.875 + }, + { + "epoch": 2.555263157894737, + "grad_norm": 8.939352989196777, + "learning_rate": 6.0994474479347435e-06, + "loss": 1.0188, + "step": 971 + }, + { + "epoch": 2.555263157894737, + "step": 971, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.557894736842105, + "grad_norm": 4.726570129394531, + "learning_rate": 6.079831141246792e-06, + "loss": 1.0972, + "step": 972 + }, + { + "epoch": 2.557894736842105, + "step": 972, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.5605263157894735, + "grad_norm": 6.62232780456543, + "learning_rate": 6.060232642322717e-06, + "loss": 1.1201, + "step": 973 + }, + { + "epoch": 2.5605263157894735, + "step": 973, + "train_accuracy": 0.90625 + }, + { + "epoch": 2.5631578947368423, + "grad_norm": 6.835370063781738, + "learning_rate": 6.040652040190672e-06, + "loss": 1.063, + "step": 974 + }, + { + "epoch": 2.5631578947368423, + "step": 974, + "train_accuracy": 0.75 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 9.736258506774902, + "learning_rate": 6.021089423797535e-06, + "loss": 1.2314, + "step": 975 + }, + { + "epoch": 2.5657894736842106, + "step": 975, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.568421052631579, + "grad_norm": 4.367083549499512, + "learning_rate": 6.001544882008461e-06, + "loss": 1.0264, + "step": 976 + }, + { + "epoch": 2.568421052631579, + "step": 976, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.5710526315789473, + "grad_norm": 6.833847522735596, + "learning_rate": 5.982018503606519e-06, + "loss": 1.5303, + "step": 977 + }, + { + "epoch": 2.5710526315789473, + "step": 977, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.5736842105263156, + "grad_norm": 4.34575080871582, + "learning_rate": 5.962510377292252e-06, + "loss": 1.0857, + "step": 978 + }, + { + "epoch": 2.5736842105263156, + "step": 978, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.5763157894736843, + "grad_norm": 4.48021125793457, + "learning_rate": 5.943020591683306e-06, + "loss": 1.2573, + "step": 979 + }, + { + "epoch": 2.5763157894736843, + "step": 979, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 7.216438293457031, + "learning_rate": 5.923549235313997e-06, + "loss": 1.3081, + "step": 980 + }, + { + "epoch": 2.5789473684210527, + "step": 980, + "train_accuracy": 0.875 + }, + { + "epoch": 2.581578947368421, + "grad_norm": 5.685892105102539, + "learning_rate": 5.904096396634935e-06, + "loss": 0.9727, + "step": 981 + }, + { + "epoch": 2.581578947368421, + "step": 981, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.5842105263157897, + "grad_norm": 3.4921514987945557, + "learning_rate": 5.884662164012616e-06, + "loss": 1.0112, + "step": 982 + }, + { + "epoch": 2.5842105263157897, + "step": 982, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.586842105263158, + "grad_norm": 3.5595004558563232, + "learning_rate": 5.8652466257289974e-06, + "loss": 0.9966, + "step": 983 + }, + { + "epoch": 2.586842105263158, + "step": 983, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.5894736842105264, + "grad_norm": 10.218847274780273, + "learning_rate": 5.845849869981137e-06, + "loss": 1.2466, + "step": 984 + }, + { + "epoch": 2.5894736842105264, + "step": 984, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 6.644645690917969, + "learning_rate": 5.826471984880754e-06, + "loss": 0.9751, + "step": 985 + }, + { + "epoch": 2.5921052631578947, + "step": 985, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.594736842105263, + "grad_norm": 7.35904598236084, + "learning_rate": 5.807113058453862e-06, + "loss": 1.2002, + "step": 986 + }, + { + "epoch": 2.594736842105263, + "step": 986, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.5973684210526313, + "grad_norm": 5.091441631317139, + "learning_rate": 5.7877731786403304e-06, + "loss": 1.1816, + "step": 987 + }, + { + "epoch": 2.5973684210526313, + "step": 987, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.6, + "grad_norm": 10.243165969848633, + "learning_rate": 5.768452433293532e-06, + "loss": 1.1782, + "step": 988 + }, + { + "epoch": 2.6, + "eval_accuracy": 0.7113147974014282, + "eval_max_score": 5.84375, + "eval_min_score": -10.25, + "eval_runtime": 151.3594, + "eval_samples_per_second": 18.743, + "eval_steps_per_second": 0.297, + "step": 988 + }, + { + "epoch": 2.6, + "step": 988, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.6026315789473684, + "grad_norm": 3.4999477863311768, + "learning_rate": 5.7491509101799055e-06, + "loss": 1.0269, + "step": 989 + }, + { + "epoch": 2.6026315789473684, + "step": 989, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 3.9146370887756348, + "learning_rate": 5.729868696978574e-06, + "loss": 1.1133, + "step": 990 + }, + { + "epoch": 2.6052631578947367, + "step": 990, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.6078947368421055, + "grad_norm": 7.038024425506592, + "learning_rate": 5.710605881280939e-06, + "loss": 1.1978, + "step": 991 + }, + { + "epoch": 2.6078947368421055, + "step": 991, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.610526315789474, + "grad_norm": 4.451397895812988, + "learning_rate": 5.6913625505902966e-06, + "loss": 1.0557, + "step": 992 + }, + { + "epoch": 2.610526315789474, + "step": 992, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.613157894736842, + "grad_norm": 4.300213813781738, + "learning_rate": 5.6721387923214215e-06, + "loss": 1.23, + "step": 993 + }, + { + "epoch": 2.613157894736842, + "step": 993, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.6157894736842104, + "grad_norm": 3.7951157093048096, + "learning_rate": 5.65293469380018e-06, + "loss": 1.144, + "step": 994 + }, + { + "epoch": 2.6157894736842104, + "step": 994, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 4.785224437713623, + "learning_rate": 5.633750342263136e-06, + "loss": 1.3882, + "step": 995 + }, + { + "epoch": 2.6184210526315788, + "step": 995, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.6210526315789475, + "grad_norm": 15.803621292114258, + "learning_rate": 5.614585824857148e-06, + "loss": 1.2593, + "step": 996 + }, + { + "epoch": 2.6210526315789475, + "step": 996, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.623684210526316, + "grad_norm": 4.082406997680664, + "learning_rate": 5.595441228638976e-06, + "loss": 0.9116, + "step": 997 + }, + { + "epoch": 2.623684210526316, + "step": 997, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.626315789473684, + "grad_norm": 9.289669036865234, + "learning_rate": 5.576316640574886e-06, + "loss": 1.2944, + "step": 998 + }, + { + "epoch": 2.626315789473684, + "step": 998, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.6289473684210525, + "grad_norm": 6.445914268493652, + "learning_rate": 5.557212147540254e-06, + "loss": 1.1318, + "step": 999 + }, + { + "epoch": 2.6289473684210525, + "step": 999, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 11.213533401489258, + "learning_rate": 5.538127836319176e-06, + "loss": 1.1318, + "step": 1000 + }, + { + "epoch": 2.6315789473684212, + "step": 1000, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.6342105263157896, + "grad_norm": 15.603263854980469, + "learning_rate": 5.519063793604067e-06, + "loss": 1.2778, + "step": 1001 + }, + { + "epoch": 2.6342105263157896, + "step": 1001, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.636842105263158, + "grad_norm": 10.475622177124023, + "learning_rate": 5.50002010599527e-06, + "loss": 1.126, + "step": 1002 + }, + { + "epoch": 2.636842105263158, + "step": 1002, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.639473684210526, + "grad_norm": 5.916659832000732, + "learning_rate": 5.480996860000664e-06, + "loss": 1.2539, + "step": 1003 + }, + { + "epoch": 2.639473684210526, + "step": 1003, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.6421052631578945, + "grad_norm": 6.055222511291504, + "learning_rate": 5.461994142035269e-06, + "loss": 1.0186, + "step": 1004 + }, + { + "epoch": 2.6421052631578945, + "step": 1004, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 17.587617874145508, + "learning_rate": 5.443012038420856e-06, + "loss": 1.2661, + "step": 1005 + }, + { + "epoch": 2.6447368421052633, + "step": 1005, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.6473684210526316, + "grad_norm": 4.647197246551514, + "learning_rate": 5.424050635385552e-06, + "loss": 1.0176, + "step": 1006 + }, + { + "epoch": 2.6473684210526316, + "step": 1006, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.65, + "grad_norm": 3.68725323677063, + "learning_rate": 5.405110019063449e-06, + "loss": 0.9331, + "step": 1007 + }, + { + "epoch": 2.65, + "step": 1007, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.6526315789473687, + "grad_norm": 5.881679534912109, + "learning_rate": 5.3861902754942104e-06, + "loss": 0.8833, + "step": 1008 + }, + { + "epoch": 2.6526315789473687, + "step": 1008, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.655263157894737, + "grad_norm": 10.934441566467285, + "learning_rate": 5.367291490622699e-06, + "loss": 1.2524, + "step": 1009 + }, + { + "epoch": 2.655263157894737, + "step": 1009, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 4.606790542602539, + "learning_rate": 5.348413750298542e-06, + "loss": 1.1841, + "step": 1010 + }, + { + "epoch": 2.6578947368421053, + "step": 1010, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.6605263157894736, + "grad_norm": 9.060807228088379, + "learning_rate": 5.329557140275802e-06, + "loss": 1.4224, + "step": 1011 + }, + { + "epoch": 2.6605263157894736, + "step": 1011, + "train_accuracy": 0.75 + }, + { + "epoch": 2.663157894736842, + "grad_norm": 3.719575881958008, + "learning_rate": 5.310721746212522e-06, + "loss": 1.0898, + "step": 1012 + }, + { + "epoch": 2.663157894736842, + "step": 1012, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.6657894736842103, + "grad_norm": 4.574690341949463, + "learning_rate": 5.291907653670402e-06, + "loss": 1.2852, + "step": 1013 + }, + { + "epoch": 2.6657894736842103, + "step": 1013, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.668421052631579, + "grad_norm": 5.748828411102295, + "learning_rate": 5.273114948114346e-06, + "loss": 1.0542, + "step": 1014 + }, + { + "epoch": 2.668421052631579, + "step": 1014, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 3.7456870079040527, + "learning_rate": 5.254343714912139e-06, + "loss": 1.1528, + "step": 1015 + }, + { + "epoch": 2.6710526315789473, + "step": 1015, + "train_accuracy": 0.671875 + }, + { + "epoch": 2.6736842105263157, + "grad_norm": 4.605350494384766, + "learning_rate": 5.2355940393339914e-06, + "loss": 1.3657, + "step": 1016 + }, + { + "epoch": 2.6736842105263157, + "step": 1016, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.6763157894736844, + "grad_norm": 9.041370391845703, + "learning_rate": 5.216866006552213e-06, + "loss": 1.1028, + "step": 1017 + }, + { + "epoch": 2.6763157894736844, + "step": 1017, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.6789473684210527, + "grad_norm": 4.867583274841309, + "learning_rate": 5.198159701640784e-06, + "loss": 1.1343, + "step": 1018 + }, + { + "epoch": 2.6789473684210527, + "step": 1018, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.681578947368421, + "grad_norm": 8.326457977294922, + "learning_rate": 5.179475209574991e-06, + "loss": 1.0972, + "step": 1019 + }, + { + "epoch": 2.681578947368421, + "step": 1019, + "train_accuracy": 0.75 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 3.207512140274048, + "learning_rate": 5.1608126152310286e-06, + "loss": 1.0972, + "step": 1020 + }, + { + "epoch": 2.6842105263157894, + "step": 1020, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.6868421052631577, + "grad_norm": 9.606292724609375, + "learning_rate": 5.142172003385622e-06, + "loss": 1.1665, + "step": 1021 + }, + { + "epoch": 2.6868421052631577, + "step": 1021, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.6894736842105265, + "grad_norm": 8.760821342468262, + "learning_rate": 5.123553458715635e-06, + "loss": 1.2441, + "step": 1022 + }, + { + "epoch": 2.6894736842105265, + "step": 1022, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.692105263157895, + "grad_norm": 4.255366802215576, + "learning_rate": 5.104957065797696e-06, + "loss": 1.0542, + "step": 1023 + }, + { + "epoch": 2.692105263157895, + "step": 1023, + "train_accuracy": 0.75 + }, + { + "epoch": 2.694736842105263, + "grad_norm": 3.35002064704895, + "learning_rate": 5.086382909107797e-06, + "loss": 0.8862, + "step": 1024 + }, + { + "epoch": 2.694736842105263, + "step": 1024, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 7.932205677032471, + "learning_rate": 5.067831073020928e-06, + "loss": 1.1333, + "step": 1025 + }, + { + "epoch": 2.6973684210526314, + "step": 1025, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7, + "grad_norm": 8.935729026794434, + "learning_rate": 5.049301641810682e-06, + "loss": 1.0669, + "step": 1026 + }, + { + "epoch": 2.7, + "step": 1026, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.7026315789473685, + "grad_norm": 4.217672348022461, + "learning_rate": 5.030794699648875e-06, + "loss": 1.1006, + "step": 1027 + }, + { + "epoch": 2.7026315789473685, + "step": 1027, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.705263157894737, + "grad_norm": 17.088626861572266, + "learning_rate": 5.012310330605167e-06, + "loss": 1.3882, + "step": 1028 + }, + { + "epoch": 2.705263157894737, + "step": 1028, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.707894736842105, + "grad_norm": 6.726995944976807, + "learning_rate": 4.9938486186466736e-06, + "loss": 1.0659, + "step": 1029 + }, + { + "epoch": 2.707894736842105, + "step": 1029, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 3.387362241744995, + "learning_rate": 4.975409647637591e-06, + "loss": 0.9692, + "step": 1030 + }, + { + "epoch": 2.7105263157894735, + "step": 1030, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.713157894736842, + "grad_norm": 5.338740825653076, + "learning_rate": 4.9569935013388125e-06, + "loss": 1.2466, + "step": 1031 + }, + { + "epoch": 2.713157894736842, + "step": 1031, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7157894736842105, + "grad_norm": 10.06348705291748, + "learning_rate": 4.938600263407546e-06, + "loss": 1.0942, + "step": 1032 + }, + { + "epoch": 2.7157894736842105, + "step": 1032, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.718421052631579, + "grad_norm": 12.942997932434082, + "learning_rate": 4.9202300173969364e-06, + "loss": 1.2822, + "step": 1033 + }, + { + "epoch": 2.718421052631579, + "step": 1033, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7210526315789476, + "grad_norm": 7.932915687561035, + "learning_rate": 4.901882846755687e-06, + "loss": 1.0498, + "step": 1034 + }, + { + "epoch": 2.7210526315789476, + "step": 1034, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 7.6717352867126465, + "learning_rate": 4.883558834827675e-06, + "loss": 1.2739, + "step": 1035 + }, + { + "epoch": 2.723684210526316, + "step": 1035, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.7263157894736842, + "grad_norm": 6.054356575012207, + "learning_rate": 4.865258064851579e-06, + "loss": 1.208, + "step": 1036 + }, + { + "epoch": 2.7263157894736842, + "step": 1036, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7289473684210526, + "grad_norm": 4.3364362716674805, + "learning_rate": 4.846980619960509e-06, + "loss": 1.1011, + "step": 1037 + }, + { + "epoch": 2.7289473684210526, + "step": 1037, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.731578947368421, + "grad_norm": 3.8560869693756104, + "learning_rate": 4.8287265831815924e-06, + "loss": 1.0205, + "step": 1038 + }, + { + "epoch": 2.731578947368421, + "step": 1038, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.734210526315789, + "grad_norm": 5.655264377593994, + "learning_rate": 4.810496037435654e-06, + "loss": 1.1133, + "step": 1039 + }, + { + "epoch": 2.734210526315789, + "step": 1039, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 4.983482360839844, + "learning_rate": 4.792289065536783e-06, + "loss": 0.8828, + "step": 1040 + }, + { + "epoch": 2.736842105263158, + "step": 1040, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7394736842105263, + "grad_norm": 3.325807809829712, + "learning_rate": 4.774105750192001e-06, + "loss": 1.0684, + "step": 1041 + }, + { + "epoch": 2.7394736842105263, + "step": 1041, + "train_accuracy": 0.75 + }, + { + "epoch": 2.7421052631578946, + "grad_norm": 4.7382893562316895, + "learning_rate": 4.7559461740008475e-06, + "loss": 1.2109, + "step": 1042 + }, + { + "epoch": 2.7421052631578946, + "step": 1042, + "train_accuracy": 0.75 + }, + { + "epoch": 2.7447368421052634, + "grad_norm": 5.645082950592041, + "learning_rate": 4.7378104194550485e-06, + "loss": 1.2959, + "step": 1043 + }, + { + "epoch": 2.7447368421052634, + "step": 1043, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.7473684210526317, + "grad_norm": 4.51600456237793, + "learning_rate": 4.719698568938092e-06, + "loss": 1.25, + "step": 1044 + }, + { + "epoch": 2.7473684210526317, + "step": 1044, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.75, + "grad_norm": 5.984616756439209, + "learning_rate": 4.701610704724906e-06, + "loss": 1.1694, + "step": 1045 + }, + { + "epoch": 2.75, + "step": 1045, + "train_accuracy": 0.9375 + }, + { + "epoch": 2.7526315789473683, + "grad_norm": 5.485406398773193, + "learning_rate": 4.6835469089814304e-06, + "loss": 0.8899, + "step": 1046 + }, + { + "epoch": 2.7526315789473683, + "step": 1046, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7552631578947366, + "grad_norm": 4.4941534996032715, + "learning_rate": 4.665507263764299e-06, + "loss": 1.2178, + "step": 1047 + }, + { + "epoch": 2.7552631578947366, + "step": 1047, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.7578947368421054, + "grad_norm": 6.920727252960205, + "learning_rate": 4.6474918510204145e-06, + "loss": 1.0352, + "step": 1048 + }, + { + "epoch": 2.7578947368421054, + "step": 1048, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7605263157894737, + "grad_norm": 5.5470356941223145, + "learning_rate": 4.629500752586625e-06, + "loss": 1.0195, + "step": 1049 + }, + { + "epoch": 2.7605263157894737, + "step": 1049, + "train_accuracy": 0.75 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 5.659350872039795, + "learning_rate": 4.611534050189304e-06, + "loss": 1.0259, + "step": 1050 + }, + { + "epoch": 2.763157894736842, + "step": 1050, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.765789473684211, + "grad_norm": 8.783293724060059, + "learning_rate": 4.593591825444028e-06, + "loss": 1.1479, + "step": 1051 + }, + { + "epoch": 2.765789473684211, + "step": 1051, + "train_accuracy": 0.75 + }, + { + "epoch": 2.768421052631579, + "grad_norm": 3.798257827758789, + "learning_rate": 4.575674159855156e-06, + "loss": 1.0312, + "step": 1052 + }, + { + "epoch": 2.768421052631579, + "step": 1052, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.7710526315789474, + "grad_norm": 5.9507155418396, + "learning_rate": 4.557781134815509e-06, + "loss": 0.9458, + "step": 1053 + }, + { + "epoch": 2.7710526315789474, + "step": 1053, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.7736842105263158, + "grad_norm": 8.45976448059082, + "learning_rate": 4.539912831605959e-06, + "loss": 1.3091, + "step": 1054 + }, + { + "epoch": 2.7736842105263158, + "step": 1054, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 9.173246383666992, + "learning_rate": 4.522069331395085e-06, + "loss": 1.127, + "step": 1055 + }, + { + "epoch": 2.776315789473684, + "step": 1055, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.7789473684210524, + "grad_norm": 5.204906940460205, + "learning_rate": 4.504250715238791e-06, + "loss": 1.0767, + "step": 1056 + }, + { + "epoch": 2.7789473684210524, + "step": 1056, + "train_accuracy": 0.75 + }, + { + "epoch": 2.781578947368421, + "grad_norm": 5.059564113616943, + "learning_rate": 4.486457064079943e-06, + "loss": 1.1562, + "step": 1057 + }, + { + "epoch": 2.781578947368421, + "step": 1057, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.7842105263157895, + "grad_norm": 3.4430267810821533, + "learning_rate": 4.468688458748006e-06, + "loss": 1.0962, + "step": 1058 + }, + { + "epoch": 2.7842105263157895, + "step": 1058, + "train_accuracy": 0.75 + }, + { + "epoch": 2.786842105263158, + "grad_norm": 10.936015129089355, + "learning_rate": 4.450944979958668e-06, + "loss": 1.3589, + "step": 1059 + }, + { + "epoch": 2.786842105263158, + "step": 1059, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 3.82942533493042, + "learning_rate": 4.433226708313475e-06, + "loss": 1.1372, + "step": 1060 + }, + { + "epoch": 2.7894736842105265, + "step": 1060, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.792105263157895, + "grad_norm": 2.9924509525299072, + "learning_rate": 4.415533724299471e-06, + "loss": 0.9639, + "step": 1061 + }, + { + "epoch": 2.792105263157895, + "step": 1061, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.794736842105263, + "grad_norm": 8.909473419189453, + "learning_rate": 4.397866108288828e-06, + "loss": 1.1021, + "step": 1062 + }, + { + "epoch": 2.794736842105263, + "step": 1062, + "train_accuracy": 0.890625 + }, + { + "epoch": 2.7973684210526315, + "grad_norm": 9.16897201538086, + "learning_rate": 4.380223940538478e-06, + "loss": 1.1138, + "step": 1063 + }, + { + "epoch": 2.7973684210526315, + "step": 1063, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.8, + "grad_norm": 4.704495906829834, + "learning_rate": 4.362607301189756e-06, + "loss": 1.0015, + "step": 1064 + }, + { + "epoch": 2.8, + "eval_accuracy": 0.7099048495292664, + "eval_max_score": 7.96875, + "eval_min_score": -8.875, + "eval_runtime": 151.1898, + "eval_samples_per_second": 18.764, + "eval_steps_per_second": 0.298, + "step": 1064 + }, + { + "epoch": 2.8, + "step": 1064, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 5.374510765075684, + "learning_rate": 4.345016270268029e-06, + "loss": 1.1904, + "step": 1065 + }, + { + "epoch": 2.8026315789473686, + "step": 1065, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.805263157894737, + "grad_norm": 6.463529586791992, + "learning_rate": 4.327450927682334e-06, + "loss": 1.1279, + "step": 1066 + }, + { + "epoch": 2.805263157894737, + "step": 1066, + "train_accuracy": 0.875 + }, + { + "epoch": 2.807894736842105, + "grad_norm": 4.455600738525391, + "learning_rate": 4.309911353225019e-06, + "loss": 0.9233, + "step": 1067 + }, + { + "epoch": 2.807894736842105, + "step": 1067, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.8105263157894735, + "grad_norm": 7.1986589431762695, + "learning_rate": 4.2923976265713765e-06, + "loss": 0.8882, + "step": 1068 + }, + { + "epoch": 2.8105263157894735, + "step": 1068, + "train_accuracy": 0.75 + }, + { + "epoch": 2.8131578947368423, + "grad_norm": 4.7391462326049805, + "learning_rate": 4.274909827279283e-06, + "loss": 1.2251, + "step": 1069 + }, + { + "epoch": 2.8131578947368423, + "step": 1069, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 4.475101470947266, + "learning_rate": 4.257448034788837e-06, + "loss": 1.0859, + "step": 1070 + }, + { + "epoch": 2.8157894736842106, + "step": 1070, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.818421052631579, + "grad_norm": 5.544163703918457, + "learning_rate": 4.240012328421998e-06, + "loss": 0.9385, + "step": 1071 + }, + { + "epoch": 2.818421052631579, + "step": 1071, + "train_accuracy": 0.875 + }, + { + "epoch": 2.8210526315789473, + "grad_norm": 3.5065155029296875, + "learning_rate": 4.222602787382223e-06, + "loss": 0.8945, + "step": 1072 + }, + { + "epoch": 2.8210526315789473, + "step": 1072, + "train_accuracy": 0.75 + }, + { + "epoch": 2.8236842105263156, + "grad_norm": 5.043632507324219, + "learning_rate": 4.2052194907541255e-06, + "loss": 1.311, + "step": 1073 + }, + { + "epoch": 2.8236842105263156, + "step": 1073, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.8263157894736843, + "grad_norm": 3.5553743839263916, + "learning_rate": 4.187862517503077e-06, + "loss": 0.9697, + "step": 1074 + }, + { + "epoch": 2.8263157894736843, + "step": 1074, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 3.987929344177246, + "learning_rate": 4.1705319464749e-06, + "loss": 1.0498, + "step": 1075 + }, + { + "epoch": 2.8289473684210527, + "step": 1075, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.831578947368421, + "grad_norm": 5.27581262588501, + "learning_rate": 4.153227856395452e-06, + "loss": 1.2754, + "step": 1076 + }, + { + "epoch": 2.831578947368421, + "step": 1076, + "train_accuracy": 0.75 + }, + { + "epoch": 2.8342105263157897, + "grad_norm": 4.249098300933838, + "learning_rate": 4.135950325870328e-06, + "loss": 0.9639, + "step": 1077 + }, + { + "epoch": 2.8342105263157897, + "step": 1077, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.836842105263158, + "grad_norm": 3.359182357788086, + "learning_rate": 4.118699433384446e-06, + "loss": 0.7979, + "step": 1078 + }, + { + "epoch": 2.836842105263158, + "step": 1078, + "train_accuracy": 0.875 + }, + { + "epoch": 2.8394736842105264, + "grad_norm": 5.7397589683532715, + "learning_rate": 4.101475257301746e-06, + "loss": 1.0815, + "step": 1079 + }, + { + "epoch": 2.8394736842105264, + "step": 1079, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 3.884835720062256, + "learning_rate": 4.084277875864776e-06, + "loss": 1.0981, + "step": 1080 + }, + { + "epoch": 2.8421052631578947, + "step": 1080, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.844736842105263, + "grad_norm": 5.433359622955322, + "learning_rate": 4.067107367194397e-06, + "loss": 1.2388, + "step": 1081 + }, + { + "epoch": 2.844736842105263, + "step": 1081, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.8473684210526313, + "grad_norm": 6.485168933868408, + "learning_rate": 4.049963809289368e-06, + "loss": 1.3252, + "step": 1082 + }, + { + "epoch": 2.8473684210526313, + "step": 1082, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.85, + "grad_norm": 3.8819828033447266, + "learning_rate": 4.032847280026051e-06, + "loss": 1.1392, + "step": 1083 + }, + { + "epoch": 2.85, + "step": 1083, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.8526315789473684, + "grad_norm": 4.981328964233398, + "learning_rate": 4.015757857157999e-06, + "loss": 1.2705, + "step": 1084 + }, + { + "epoch": 2.8526315789473684, + "step": 1084, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 7.025356769561768, + "learning_rate": 3.998695618315655e-06, + "loss": 1.02, + "step": 1085 + }, + { + "epoch": 2.8552631578947367, + "step": 1085, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.8578947368421055, + "grad_norm": 6.157021522521973, + "learning_rate": 3.9816606410059625e-06, + "loss": 1.2275, + "step": 1086 + }, + { + "epoch": 2.8578947368421055, + "step": 1086, + "train_accuracy": 0.75 + }, + { + "epoch": 2.860526315789474, + "grad_norm": 4.282430648803711, + "learning_rate": 3.964653002612031e-06, + "loss": 0.9609, + "step": 1087 + }, + { + "epoch": 2.860526315789474, + "step": 1087, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.863157894736842, + "grad_norm": 3.4297397136688232, + "learning_rate": 3.94767278039278e-06, + "loss": 1.103, + "step": 1088 + }, + { + "epoch": 2.863157894736842, + "step": 1088, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.8657894736842104, + "grad_norm": 3.6751058101654053, + "learning_rate": 3.930720051482585e-06, + "loss": 0.9619, + "step": 1089 + }, + { + "epoch": 2.8657894736842104, + "step": 1089, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 4.563840389251709, + "learning_rate": 3.9137948928909374e-06, + "loss": 1.3232, + "step": 1090 + }, + { + "epoch": 2.8684210526315788, + "step": 1090, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.8710526315789475, + "grad_norm": 4.432309150695801, + "learning_rate": 3.896897381502081e-06, + "loss": 1.0767, + "step": 1091 + }, + { + "epoch": 2.8710526315789475, + "step": 1091, + "train_accuracy": 0.65625 + }, + { + "epoch": 2.873684210526316, + "grad_norm": 7.510792255401611, + "learning_rate": 3.880027594074671e-06, + "loss": 1.2344, + "step": 1092 + }, + { + "epoch": 2.873684210526316, + "step": 1092, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.876315789473684, + "grad_norm": 3.3003294467926025, + "learning_rate": 3.863185607241425e-06, + "loss": 0.9473, + "step": 1093 + }, + { + "epoch": 2.876315789473684, + "step": 1093, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.8789473684210525, + "grad_norm": 4.405003547668457, + "learning_rate": 3.846371497508775e-06, + "loss": 1.1001, + "step": 1094 + }, + { + "epoch": 2.8789473684210525, + "step": 1094, + "train_accuracy": 0.75 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 4.802317142486572, + "learning_rate": 3.829585341256515e-06, + "loss": 1.0273, + "step": 1095 + }, + { + "epoch": 2.8815789473684212, + "step": 1095, + "train_accuracy": 0.875 + }, + { + "epoch": 2.8842105263157896, + "grad_norm": 3.6775407791137695, + "learning_rate": 3.812827214737459e-06, + "loss": 0.9351, + "step": 1096 + }, + { + "epoch": 2.8842105263157896, + "step": 1096, + "train_accuracy": 0.75 + }, + { + "epoch": 2.886842105263158, + "grad_norm": 3.8660459518432617, + "learning_rate": 3.796097194077093e-06, + "loss": 1.1362, + "step": 1097 + }, + { + "epoch": 2.886842105263158, + "step": 1097, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.889473684210526, + "grad_norm": 9.875020027160645, + "learning_rate": 3.77939535527323e-06, + "loss": 1.2524, + "step": 1098 + }, + { + "epoch": 2.889473684210526, + "step": 1098, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.8921052631578945, + "grad_norm": 3.817460298538208, + "learning_rate": 3.7627217741956625e-06, + "loss": 1.0151, + "step": 1099 + }, + { + "epoch": 2.8921052631578945, + "step": 1099, + "train_accuracy": 0.6875 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 7.155078411102295, + "learning_rate": 3.7460765265858213e-06, + "loss": 1.1729, + "step": 1100 + }, + { + "epoch": 2.8947368421052633, + "step": 1100, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.8973684210526316, + "grad_norm": 5.495885848999023, + "learning_rate": 3.729459688056427e-06, + "loss": 0.9888, + "step": 1101 + }, + { + "epoch": 2.8973684210526316, + "step": 1101, + "train_accuracy": 0.71875 + }, + { + "epoch": 2.9, + "grad_norm": 5.324273586273193, + "learning_rate": 3.712871334091154e-06, + "loss": 1.239, + "step": 1102 + }, + { + "epoch": 2.9, + "step": 1102, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.9026315789473687, + "grad_norm": 4.5909905433654785, + "learning_rate": 3.696311540044276e-06, + "loss": 1.0537, + "step": 1103 + }, + { + "epoch": 2.9026315789473687, + "step": 1103, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.905263157894737, + "grad_norm": 3.9828968048095703, + "learning_rate": 3.6797803811403354e-06, + "loss": 1.0264, + "step": 1104 + }, + { + "epoch": 2.905263157894737, + "step": 1104, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 4.042853355407715, + "learning_rate": 3.663277932473791e-06, + "loss": 0.9209, + "step": 1105 + }, + { + "epoch": 2.9078947368421053, + "step": 1105, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.9105263157894736, + "grad_norm": 6.130189418792725, + "learning_rate": 3.646804269008697e-06, + "loss": 1.2104, + "step": 1106 + }, + { + "epoch": 2.9105263157894736, + "step": 1106, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.913157894736842, + "grad_norm": 6.112072944641113, + "learning_rate": 3.630359465578324e-06, + "loss": 1.2646, + "step": 1107 + }, + { + "epoch": 2.913157894736842, + "step": 1107, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.9157894736842103, + "grad_norm": 8.988405227661133, + "learning_rate": 3.613943596884865e-06, + "loss": 1.1348, + "step": 1108 + }, + { + "epoch": 2.9157894736842103, + "step": 1108, + "train_accuracy": 0.65625 + }, + { + "epoch": 2.918421052631579, + "grad_norm": 7.373897552490234, + "learning_rate": 3.597556737499064e-06, + "loss": 1.3979, + "step": 1109 + }, + { + "epoch": 2.918421052631579, + "step": 1109, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 4.439242839813232, + "learning_rate": 3.5811989618598863e-06, + "loss": 0.8975, + "step": 1110 + }, + { + "epoch": 2.9210526315789473, + "step": 1110, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.9236842105263157, + "grad_norm": 8.199309349060059, + "learning_rate": 3.564870344274185e-06, + "loss": 1.2334, + "step": 1111 + }, + { + "epoch": 2.9236842105263157, + "step": 1111, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.9263157894736844, + "grad_norm": 4.312633037567139, + "learning_rate": 3.5485709589163576e-06, + "loss": 1.2168, + "step": 1112 + }, + { + "epoch": 2.9263157894736844, + "step": 1112, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.9289473684210527, + "grad_norm": 3.521815776824951, + "learning_rate": 3.532300879828013e-06, + "loss": 1.0327, + "step": 1113 + }, + { + "epoch": 2.9289473684210527, + "step": 1113, + "train_accuracy": 0.875 + }, + { + "epoch": 2.931578947368421, + "grad_norm": 5.077023983001709, + "learning_rate": 3.516060180917632e-06, + "loss": 1.041, + "step": 1114 + }, + { + "epoch": 2.931578947368421, + "step": 1114, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 4.366433143615723, + "learning_rate": 3.499848935960234e-06, + "loss": 1.0889, + "step": 1115 + }, + { + "epoch": 2.9342105263157894, + "step": 1115, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.9368421052631577, + "grad_norm": 5.330564498901367, + "learning_rate": 3.483667218597039e-06, + "loss": 1.4038, + "step": 1116 + }, + { + "epoch": 2.9368421052631577, + "step": 1116, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.9394736842105265, + "grad_norm": 3.7983169555664062, + "learning_rate": 3.467515102335136e-06, + "loss": 0.8452, + "step": 1117 + }, + { + "epoch": 2.9394736842105265, + "step": 1117, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.942105263157895, + "grad_norm": 4.798806667327881, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.9983, + "step": 1118 + }, + { + "epoch": 2.942105263157895, + "step": 1118, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.944736842105263, + "grad_norm": 3.3308746814727783, + "learning_rate": 3.435299966470903e-06, + "loss": 0.957, + "step": 1119 + }, + { + "epoch": 2.944736842105263, + "step": 1119, + "train_accuracy": 0.78125 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 5.4025044441223145, + "learning_rate": 3.4192370932090892e-06, + "loss": 1.1567, + "step": 1120 + }, + { + "epoch": 2.9473684210526314, + "step": 1120, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.95, + "grad_norm": 12.746258735656738, + "learning_rate": 3.4032041137289327e-06, + "loss": 1.1982, + "step": 1121 + }, + { + "epoch": 2.95, + "step": 1121, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.9526315789473685, + "grad_norm": 4.749011039733887, + "learning_rate": 3.387201100861869e-06, + "loss": 1.1455, + "step": 1122 + }, + { + "epoch": 2.9526315789473685, + "step": 1122, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.955263157894737, + "grad_norm": 4.1148200035095215, + "learning_rate": 3.371228127303203e-06, + "loss": 1.1035, + "step": 1123 + }, + { + "epoch": 2.955263157894737, + "step": 1123, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.957894736842105, + "grad_norm": 3.990781545639038, + "learning_rate": 3.355285265611784e-06, + "loss": 0.9458, + "step": 1124 + }, + { + "epoch": 2.957894736842105, + "step": 1124, + "train_accuracy": 0.796875 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 4.621012210845947, + "learning_rate": 3.339372588209672e-06, + "loss": 0.979, + "step": 1125 + }, + { + "epoch": 2.9605263157894735, + "step": 1125, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.963157894736842, + "grad_norm": 4.63665771484375, + "learning_rate": 3.323490167381823e-06, + "loss": 1.2349, + "step": 1126 + }, + { + "epoch": 2.963157894736842, + "step": 1126, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.9657894736842105, + "grad_norm": 3.3280749320983887, + "learning_rate": 3.307638075275731e-06, + "loss": 0.9976, + "step": 1127 + }, + { + "epoch": 2.9657894736842105, + "step": 1127, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.968421052631579, + "grad_norm": 3.5311524868011475, + "learning_rate": 3.2918163839011408e-06, + "loss": 1.0435, + "step": 1128 + }, + { + "epoch": 2.968421052631579, + "step": 1128, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.9710526315789476, + "grad_norm": 6.561064720153809, + "learning_rate": 3.2760251651296737e-06, + "loss": 1.1294, + "step": 1129 + }, + { + "epoch": 2.9710526315789476, + "step": 1129, + "train_accuracy": 0.734375 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 3.985593318939209, + "learning_rate": 3.2602644906945536e-06, + "loss": 1.168, + "step": 1130 + }, + { + "epoch": 2.973684210526316, + "step": 1130, + "train_accuracy": 0.75 + }, + { + "epoch": 2.9763157894736842, + "grad_norm": 3.7214221954345703, + "learning_rate": 3.244534432190225e-06, + "loss": 1.1406, + "step": 1131 + }, + { + "epoch": 2.9763157894736842, + "step": 1131, + "train_accuracy": 0.84375 + }, + { + "epoch": 2.9789473684210526, + "grad_norm": 5.923731327056885, + "learning_rate": 3.228835061072084e-06, + "loss": 1.1328, + "step": 1132 + }, + { + "epoch": 2.9789473684210526, + "step": 1132, + "train_accuracy": 0.703125 + }, + { + "epoch": 2.981578947368421, + "grad_norm": 4.362364768981934, + "learning_rate": 3.2131664486561022e-06, + "loss": 1.0903, + "step": 1133 + }, + { + "epoch": 2.981578947368421, + "step": 1133, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.984210526315789, + "grad_norm": 3.981119155883789, + "learning_rate": 3.197528666118549e-06, + "loss": 1.1641, + "step": 1134 + }, + { + "epoch": 2.984210526315789, + "step": 1134, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 7.54674768447876, + "learning_rate": 3.1819217844956216e-06, + "loss": 1.1636, + "step": 1135 + }, + { + "epoch": 2.986842105263158, + "step": 1135, + "train_accuracy": 0.8125 + }, + { + "epoch": 2.9894736842105263, + "grad_norm": 7.682521820068359, + "learning_rate": 3.1663458746831734e-06, + "loss": 1.1372, + "step": 1136 + }, + { + "epoch": 2.9894736842105263, + "step": 1136, + "train_accuracy": 0.828125 + }, + { + "epoch": 2.9921052631578946, + "grad_norm": 6.8332672119140625, + "learning_rate": 3.1508010074363384e-06, + "loss": 0.9878, + "step": 1137 + }, + { + "epoch": 2.9921052631578946, + "step": 1137, + "train_accuracy": 0.765625 + }, + { + "epoch": 2.9947368421052634, + "grad_norm": 10.780973434448242, + "learning_rate": 3.1352872533692603e-06, + "loss": 1.1616, + "step": 1138 + }, + { + "epoch": 2.9947368421052634, + "step": 1138, + "train_accuracy": 0.859375 + }, + { + "epoch": 2.9973684210526317, + "grad_norm": 6.914345741271973, + "learning_rate": 3.119804682954728e-06, + "loss": 1.1758, + "step": 1139 + }, + { + "epoch": 2.9973684210526317, + "step": 1139, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.0, + "grad_norm": 28.889413833618164, + "learning_rate": 3.1043533665238944e-06, + "loss": 0.9897, + "step": 1140 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.7127246856689453, + "eval_max_score": 6.8125, + "eval_min_score": -8.25, + "eval_runtime": 151.4052, + "eval_samples_per_second": 18.738, + "eval_steps_per_second": 0.297, + "step": 1140 + }, + { + "epoch": 3.0, + "step": 1140, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.0026315789473683, + "grad_norm": 91.07470703125, + "learning_rate": 3.0889333742659187e-06, + "loss": 0.9707, + "step": 1141 + }, + { + "epoch": 3.0026315789473683, + "step": 1141, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.0052631578947366, + "grad_norm": 6.031994819641113, + "learning_rate": 3.0735447762276872e-06, + "loss": 1.0366, + "step": 1142 + }, + { + "epoch": 3.0052631578947366, + "step": 1142, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.0078947368421054, + "grad_norm": 4.031905651092529, + "learning_rate": 3.0581876423134527e-06, + "loss": 0.7954, + "step": 1143 + }, + { + "epoch": 3.0078947368421054, + "step": 1143, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.0105263157894737, + "grad_norm": 7.002809047698975, + "learning_rate": 3.042862042284559e-06, + "loss": 0.7837, + "step": 1144 + }, + { + "epoch": 3.0105263157894737, + "step": 1144, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 4.162775039672852, + "learning_rate": 3.027568045759094e-06, + "loss": 1.0454, + "step": 1145 + }, + { + "epoch": 3.013157894736842, + "step": 1145, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.0157894736842104, + "grad_norm": 6.396974086761475, + "learning_rate": 3.0123057222115835e-06, + "loss": 1.0654, + "step": 1146 + }, + { + "epoch": 3.0157894736842104, + "step": 1146, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.018421052631579, + "grad_norm": 3.9046335220336914, + "learning_rate": 2.9970751409726785e-06, + "loss": 0.886, + "step": 1147 + }, + { + "epoch": 3.018421052631579, + "step": 1147, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.0210526315789474, + "grad_norm": 7.014838218688965, + "learning_rate": 2.981876371228836e-06, + "loss": 1.019, + "step": 1148 + }, + { + "epoch": 3.0210526315789474, + "step": 1148, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.0236842105263158, + "grad_norm": 6.357172012329102, + "learning_rate": 2.9667094820220044e-06, + "loss": 0.873, + "step": 1149 + }, + { + "epoch": 3.0236842105263158, + "step": 1149, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 3.373042583465576, + "learning_rate": 2.951574542249315e-06, + "loss": 0.958, + "step": 1150 + }, + { + "epoch": 3.026315789473684, + "step": 1150, + "train_accuracy": 0.875 + }, + { + "epoch": 3.028947368421053, + "grad_norm": 4.8992743492126465, + "learning_rate": 2.936471620662763e-06, + "loss": 0.8518, + "step": 1151 + }, + { + "epoch": 3.028947368421053, + "step": 1151, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.031578947368421, + "grad_norm": 3.5850627422332764, + "learning_rate": 2.9214007858688986e-06, + "loss": 0.9675, + "step": 1152 + }, + { + "epoch": 3.031578947368421, + "step": 1152, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.0342105263157895, + "grad_norm": 9.761622428894043, + "learning_rate": 2.906362106328515e-06, + "loss": 0.9331, + "step": 1153 + }, + { + "epoch": 3.0342105263157895, + "step": 1153, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.036842105263158, + "grad_norm": 4.369013786315918, + "learning_rate": 2.8913556503563356e-06, + "loss": 1.0645, + "step": 1154 + }, + { + "epoch": 3.036842105263158, + "step": 1154, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 3.727263927459717, + "learning_rate": 2.876381486120706e-06, + "loss": 0.8364, + "step": 1155 + }, + { + "epoch": 3.039473684210526, + "step": 1155, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.042105263157895, + "grad_norm": 3.8516135215759277, + "learning_rate": 2.861439681643283e-06, + "loss": 0.853, + "step": 1156 + }, + { + "epoch": 3.042105263157895, + "step": 1156, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.044736842105263, + "grad_norm": 4.059918403625488, + "learning_rate": 2.846530304798727e-06, + "loss": 1.0015, + "step": 1157 + }, + { + "epoch": 3.044736842105263, + "step": 1157, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.0473684210526315, + "grad_norm": 4.983440399169922, + "learning_rate": 2.831653423314389e-06, + "loss": 0.7947, + "step": 1158 + }, + { + "epoch": 3.0473684210526315, + "step": 1158, + "train_accuracy": 0.875 + }, + { + "epoch": 3.05, + "grad_norm": 7.605556011199951, + "learning_rate": 2.816809104770012e-06, + "loss": 0.9277, + "step": 1159 + }, + { + "epoch": 3.05, + "step": 1159, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 3.867741584777832, + "learning_rate": 2.8019974165974127e-06, + "loss": 0.9189, + "step": 1160 + }, + { + "epoch": 3.0526315789473686, + "step": 1160, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.055263157894737, + "grad_norm": 5.580239772796631, + "learning_rate": 2.787218426080184e-06, + "loss": 0.7227, + "step": 1161 + }, + { + "epoch": 3.055263157894737, + "step": 1161, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.057894736842105, + "grad_norm": 4.5618672370910645, + "learning_rate": 2.7724722003533945e-06, + "loss": 0.9521, + "step": 1162 + }, + { + "epoch": 3.057894736842105, + "step": 1162, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.0605263157894735, + "grad_norm": 7.543132305145264, + "learning_rate": 2.7577588064032533e-06, + "loss": 0.7461, + "step": 1163 + }, + { + "epoch": 3.0605263157894735, + "step": 1163, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.0631578947368423, + "grad_norm": 4.477904319763184, + "learning_rate": 2.7430783110668557e-06, + "loss": 0.9336, + "step": 1164 + }, + { + "epoch": 3.0631578947368423, + "step": 1164, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 4.662795543670654, + "learning_rate": 2.7284307810318257e-06, + "loss": 0.8987, + "step": 1165 + }, + { + "epoch": 3.0657894736842106, + "step": 1165, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.068421052631579, + "grad_norm": 5.89164400100708, + "learning_rate": 2.7138162828360628e-06, + "loss": 0.7316, + "step": 1166 + }, + { + "epoch": 3.068421052631579, + "step": 1166, + "train_accuracy": 0.875 + }, + { + "epoch": 3.0710526315789473, + "grad_norm": 10.350142478942871, + "learning_rate": 2.699234882867393e-06, + "loss": 1.0649, + "step": 1167 + }, + { + "epoch": 3.0710526315789473, + "step": 1167, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.0736842105263156, + "grad_norm": 4.684883117675781, + "learning_rate": 2.6846866473633126e-06, + "loss": 0.8745, + "step": 1168 + }, + { + "epoch": 3.0736842105263156, + "step": 1168, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.0763157894736843, + "grad_norm": 6.489368915557861, + "learning_rate": 2.6701716424106425e-06, + "loss": 0.8135, + "step": 1169 + }, + { + "epoch": 3.0763157894736843, + "step": 1169, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 5.5763702392578125, + "learning_rate": 2.6556899339452757e-06, + "loss": 0.8262, + "step": 1170 + }, + { + "epoch": 3.0789473684210527, + "step": 1170, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.081578947368421, + "grad_norm": 4.210776329040527, + "learning_rate": 2.641241587751824e-06, + "loss": 0.8428, + "step": 1171 + }, + { + "epoch": 3.081578947368421, + "step": 1171, + "train_accuracy": 0.765625 + }, + { + "epoch": 3.0842105263157893, + "grad_norm": 5.931998252868652, + "learning_rate": 2.626826669463377e-06, + "loss": 0.9688, + "step": 1172 + }, + { + "epoch": 3.0842105263157893, + "step": 1172, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.086842105263158, + "grad_norm": 4.297722816467285, + "learning_rate": 2.6124452445611458e-06, + "loss": 0.7803, + "step": 1173 + }, + { + "epoch": 3.086842105263158, + "step": 1173, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.0894736842105264, + "grad_norm": 4.3388495445251465, + "learning_rate": 2.5980973783742236e-06, + "loss": 0.7305, + "step": 1174 + }, + { + "epoch": 3.0894736842105264, + "step": 1174, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 6.159016132354736, + "learning_rate": 2.583783136079231e-06, + "loss": 0.8511, + "step": 1175 + }, + { + "epoch": 3.0921052631578947, + "step": 1175, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.094736842105263, + "grad_norm": 7.548987865447998, + "learning_rate": 2.5695025827000752e-06, + "loss": 0.9966, + "step": 1176 + }, + { + "epoch": 3.094736842105263, + "step": 1176, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.0973684210526318, + "grad_norm": 8.062982559204102, + "learning_rate": 2.555255783107603e-06, + "loss": 0.8833, + "step": 1177 + }, + { + "epoch": 3.0973684210526318, + "step": 1177, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.1, + "grad_norm": 6.597980976104736, + "learning_rate": 2.5410428020193568e-06, + "loss": 0.7385, + "step": 1178 + }, + { + "epoch": 3.1, + "step": 1178, + "train_accuracy": 0.875 + }, + { + "epoch": 3.1026315789473684, + "grad_norm": 11.51430892944336, + "learning_rate": 2.5268637039992296e-06, + "loss": 1.0493, + "step": 1179 + }, + { + "epoch": 3.1026315789473684, + "step": 1179, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 9.351119041442871, + "learning_rate": 2.5127185534572173e-06, + "loss": 0.938, + "step": 1180 + }, + { + "epoch": 3.1052631578947367, + "step": 1180, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.1078947368421055, + "grad_norm": 7.316296577453613, + "learning_rate": 2.4986074146490967e-06, + "loss": 0.8232, + "step": 1181 + }, + { + "epoch": 3.1078947368421055, + "step": 1181, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.110526315789474, + "grad_norm": 6.263866424560547, + "learning_rate": 2.4845303516761442e-06, + "loss": 0.9543, + "step": 1182 + }, + { + "epoch": 3.110526315789474, + "step": 1182, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.113157894736842, + "grad_norm": 6.839787006378174, + "learning_rate": 2.4704874284848425e-06, + "loss": 0.9771, + "step": 1183 + }, + { + "epoch": 3.113157894736842, + "step": 1183, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.1157894736842104, + "grad_norm": 6.001226902008057, + "learning_rate": 2.456478708866591e-06, + "loss": 0.8281, + "step": 1184 + }, + { + "epoch": 3.1157894736842104, + "step": 1184, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.1184210526315788, + "grad_norm": 5.544653415679932, + "learning_rate": 2.4425042564574186e-06, + "loss": 0.7673, + "step": 1185 + }, + { + "epoch": 3.1184210526315788, + "step": 1185, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.1210526315789475, + "grad_norm": 11.175527572631836, + "learning_rate": 2.4285641347376887e-06, + "loss": 0.8721, + "step": 1186 + }, + { + "epoch": 3.1210526315789475, + "step": 1186, + "train_accuracy": 0.765625 + }, + { + "epoch": 3.123684210526316, + "grad_norm": 6.080649375915527, + "learning_rate": 2.4146584070318145e-06, + "loss": 0.9204, + "step": 1187 + }, + { + "epoch": 3.123684210526316, + "step": 1187, + "train_accuracy": 0.875 + }, + { + "epoch": 3.126315789473684, + "grad_norm": 8.601665496826172, + "learning_rate": 2.400787136507975e-06, + "loss": 0.8398, + "step": 1188 + }, + { + "epoch": 3.126315789473684, + "step": 1188, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.1289473684210525, + "grad_norm": 9.983627319335938, + "learning_rate": 2.3869503861778176e-06, + "loss": 0.8701, + "step": 1189 + }, + { + "epoch": 3.1289473684210525, + "step": 1189, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 3.987053394317627, + "learning_rate": 2.373148218896182e-06, + "loss": 0.6858, + "step": 1190 + }, + { + "epoch": 3.1315789473684212, + "step": 1190, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.1342105263157896, + "grad_norm": 6.030102729797363, + "learning_rate": 2.35938069736081e-06, + "loss": 0.8005, + "step": 1191 + }, + { + "epoch": 3.1342105263157896, + "step": 1191, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.136842105263158, + "grad_norm": 4.159539222717285, + "learning_rate": 2.3456478841120634e-06, + "loss": 0.8113, + "step": 1192 + }, + { + "epoch": 3.136842105263158, + "step": 1192, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.139473684210526, + "grad_norm": 5.042561054229736, + "learning_rate": 2.331949841532636e-06, + "loss": 0.8757, + "step": 1193 + }, + { + "epoch": 3.139473684210526, + "step": 1193, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.1421052631578945, + "grad_norm": 12.693655967712402, + "learning_rate": 2.318286631847272e-06, + "loss": 0.9448, + "step": 1194 + }, + { + "epoch": 3.1421052631578945, + "step": 1194, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.1447368421052633, + "grad_norm": 4.802595615386963, + "learning_rate": 2.3046583171224835e-06, + "loss": 0.8074, + "step": 1195 + }, + { + "epoch": 3.1447368421052633, + "step": 1195, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.1473684210526316, + "grad_norm": 5.915050983428955, + "learning_rate": 2.2910649592662724e-06, + "loss": 1.0713, + "step": 1196 + }, + { + "epoch": 3.1473684210526316, + "step": 1196, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.15, + "grad_norm": 4.1914472579956055, + "learning_rate": 2.2775066200278383e-06, + "loss": 0.7119, + "step": 1197 + }, + { + "epoch": 3.15, + "step": 1197, + "train_accuracy": 0.875 + }, + { + "epoch": 3.1526315789473682, + "grad_norm": 7.356982231140137, + "learning_rate": 2.2639833609973182e-06, + "loss": 0.8606, + "step": 1198 + }, + { + "epoch": 3.1526315789473682, + "step": 1198, + "train_accuracy": 0.96875 + }, + { + "epoch": 3.155263157894737, + "grad_norm": 4.53258752822876, + "learning_rate": 2.250495243605475e-06, + "loss": 0.6907, + "step": 1199 + }, + { + "epoch": 3.155263157894737, + "step": 1199, + "train_accuracy": 0.875 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 6.945924758911133, + "learning_rate": 2.2370423291234543e-06, + "loss": 0.7888, + "step": 1200 + }, + { + "epoch": 3.1578947368421053, + "step": 1200, + "train_accuracy": 0.875 + }, + { + "epoch": 3.1605263157894736, + "grad_norm": 8.83996868133545, + "learning_rate": 2.2236246786624794e-06, + "loss": 0.9097, + "step": 1201 + }, + { + "epoch": 3.1605263157894736, + "step": 1201, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.163157894736842, + "grad_norm": 6.74652099609375, + "learning_rate": 2.210242353173586e-06, + "loss": 1.0459, + "step": 1202 + }, + { + "epoch": 3.163157894736842, + "step": 1202, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.1657894736842107, + "grad_norm": 5.417960166931152, + "learning_rate": 2.196895413447343e-06, + "loss": 0.835, + "step": 1203 + }, + { + "epoch": 3.1657894736842107, + "step": 1203, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.168421052631579, + "grad_norm": 5.827296733856201, + "learning_rate": 2.1835839201135743e-06, + "loss": 0.8953, + "step": 1204 + }, + { + "epoch": 3.168421052631579, + "step": 1204, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.1710526315789473, + "grad_norm": 13.554121971130371, + "learning_rate": 2.170307933641087e-06, + "loss": 0.9985, + "step": 1205 + }, + { + "epoch": 3.1710526315789473, + "step": 1205, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.1736842105263157, + "grad_norm": 6.409447193145752, + "learning_rate": 2.157067514337392e-06, + "loss": 1.0347, + "step": 1206 + }, + { + "epoch": 3.1736842105263157, + "step": 1206, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.1763157894736844, + "grad_norm": 7.8958234786987305, + "learning_rate": 2.143862722348434e-06, + "loss": 0.9468, + "step": 1207 + }, + { + "epoch": 3.1763157894736844, + "step": 1207, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.1789473684210527, + "grad_norm": 5.160824298858643, + "learning_rate": 2.1306936176583206e-06, + "loss": 0.8315, + "step": 1208 + }, + { + "epoch": 3.1789473684210527, + "step": 1208, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.181578947368421, + "grad_norm": 6.878368377685547, + "learning_rate": 2.117560260089039e-06, + "loss": 1.0015, + "step": 1209 + }, + { + "epoch": 3.181578947368421, + "step": 1209, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.1842105263157894, + "grad_norm": 6.8526930809021, + "learning_rate": 2.1044627093001966e-06, + "loss": 0.9253, + "step": 1210 + }, + { + "epoch": 3.1842105263157894, + "step": 1210, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.1868421052631577, + "grad_norm": 6.191732883453369, + "learning_rate": 2.091401024788745e-06, + "loss": 0.8765, + "step": 1211 + }, + { + "epoch": 3.1868421052631577, + "step": 1211, + "train_accuracy": 0.875 + }, + { + "epoch": 3.1894736842105265, + "grad_norm": 7.804241180419922, + "learning_rate": 2.078375265888707e-06, + "loss": 0.9224, + "step": 1212 + }, + { + "epoch": 3.1894736842105265, + "step": 1212, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.192105263157895, + "grad_norm": 5.588228225708008, + "learning_rate": 2.0653854917709115e-06, + "loss": 1.0356, + "step": 1213 + }, + { + "epoch": 3.192105263157895, + "step": 1213, + "train_accuracy": 0.875 + }, + { + "epoch": 3.194736842105263, + "grad_norm": 9.253522872924805, + "learning_rate": 2.0524317614427225e-06, + "loss": 1.0024, + "step": 1214 + }, + { + "epoch": 3.194736842105263, + "step": 1214, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.1973684210526314, + "grad_norm": 4.661523818969727, + "learning_rate": 2.039514133747771e-06, + "loss": 0.6868, + "step": 1215 + }, + { + "epoch": 3.1973684210526314, + "step": 1215, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.2, + "grad_norm": 6.566859245300293, + "learning_rate": 2.0266326673656877e-06, + "loss": 1.1104, + "step": 1216 + }, + { + "epoch": 3.2, + "eval_accuracy": 0.7225942611694336, + "eval_max_score": 10.875, + "eval_min_score": -13.625, + "eval_runtime": 151.3303, + "eval_samples_per_second": 18.747, + "eval_steps_per_second": 0.297, + "step": 1216 + }, + { + "epoch": 3.2, + "step": 1216, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.2026315789473685, + "grad_norm": 4.405065536499023, + "learning_rate": 2.013787420811839e-06, + "loss": 0.7971, + "step": 1217 + }, + { + "epoch": 3.2026315789473685, + "step": 1217, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.205263157894737, + "grad_norm": 4.326594829559326, + "learning_rate": 2.0009784524370577e-06, + "loss": 0.7188, + "step": 1218 + }, + { + "epoch": 3.205263157894737, + "step": 1218, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.207894736842105, + "grad_norm": 7.73793363571167, + "learning_rate": 1.988205820427378e-06, + "loss": 0.9402, + "step": 1219 + }, + { + "epoch": 3.207894736842105, + "step": 1219, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 5.0551910400390625, + "learning_rate": 1.9754695828037707e-06, + "loss": 0.792, + "step": 1220 + }, + { + "epoch": 3.2105263157894735, + "step": 1220, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.213157894736842, + "grad_norm": 5.002174377441406, + "learning_rate": 1.962769797421895e-06, + "loss": 0.7297, + "step": 1221 + }, + { + "epoch": 3.213157894736842, + "step": 1221, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.2157894736842105, + "grad_norm": 6.284675121307373, + "learning_rate": 1.9501065219717984e-06, + "loss": 0.7141, + "step": 1222 + }, + { + "epoch": 3.2157894736842105, + "step": 1222, + "train_accuracy": 0.875 + }, + { + "epoch": 3.218421052631579, + "grad_norm": 4.388381004333496, + "learning_rate": 1.937479813977703e-06, + "loss": 0.8003, + "step": 1223 + }, + { + "epoch": 3.218421052631579, + "step": 1223, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.221052631578947, + "grad_norm": 4.708600997924805, + "learning_rate": 1.9248897307976977e-06, + "loss": 0.7725, + "step": 1224 + }, + { + "epoch": 3.221052631578947, + "step": 1224, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 3.932790994644165, + "learning_rate": 1.9123363296235207e-06, + "loss": 0.7332, + "step": 1225 + }, + { + "epoch": 3.223684210526316, + "step": 1225, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.2263157894736842, + "grad_norm": 4.5415754318237305, + "learning_rate": 1.8998196674802561e-06, + "loss": 0.7847, + "step": 1226 + }, + { + "epoch": 3.2263157894736842, + "step": 1226, + "train_accuracy": 0.765625 + }, + { + "epoch": 3.2289473684210526, + "grad_norm": 5.433136463165283, + "learning_rate": 1.88733980122612e-06, + "loss": 0.9407, + "step": 1227 + }, + { + "epoch": 3.2289473684210526, + "step": 1227, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.231578947368421, + "grad_norm": 6.181934356689453, + "learning_rate": 1.8748967875521574e-06, + "loss": 1.1606, + "step": 1228 + }, + { + "epoch": 3.231578947368421, + "step": 1228, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.2342105263157896, + "grad_norm": 9.84251880645752, + "learning_rate": 1.8624906829820278e-06, + "loss": 1.0325, + "step": 1229 + }, + { + "epoch": 3.2342105263157896, + "step": 1229, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 5.358395576477051, + "learning_rate": 1.8501215438717057e-06, + "loss": 0.7742, + "step": 1230 + }, + { + "epoch": 3.236842105263158, + "step": 1230, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.2394736842105263, + "grad_norm": 5.701262950897217, + "learning_rate": 1.83778942640927e-06, + "loss": 0.9922, + "step": 1231 + }, + { + "epoch": 3.2394736842105263, + "step": 1231, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.2421052631578946, + "grad_norm": 6.035477161407471, + "learning_rate": 1.8254943866146013e-06, + "loss": 0.9014, + "step": 1232 + }, + { + "epoch": 3.2421052631578946, + "step": 1232, + "train_accuracy": 0.875 + }, + { + "epoch": 3.2447368421052634, + "grad_norm": 4.604367256164551, + "learning_rate": 1.81323648033917e-06, + "loss": 0.7578, + "step": 1233 + }, + { + "epoch": 3.2447368421052634, + "step": 1233, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.2473684210526317, + "grad_norm": 6.747530460357666, + "learning_rate": 1.8010157632657544e-06, + "loss": 0.9478, + "step": 1234 + }, + { + "epoch": 3.2473684210526317, + "step": 1234, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.25, + "grad_norm": 5.973506450653076, + "learning_rate": 1.7888322909081978e-06, + "loss": 0.8875, + "step": 1235 + }, + { + "epoch": 3.25, + "step": 1235, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.2526315789473683, + "grad_norm": 5.428391933441162, + "learning_rate": 1.7766861186111573e-06, + "loss": 0.8809, + "step": 1236 + }, + { + "epoch": 3.2526315789473683, + "step": 1236, + "train_accuracy": 0.875 + }, + { + "epoch": 3.2552631578947366, + "grad_norm": 5.573856353759766, + "learning_rate": 1.764577301549849e-06, + "loss": 0.8752, + "step": 1237 + }, + { + "epoch": 3.2552631578947366, + "step": 1237, + "train_accuracy": 0.875 + }, + { + "epoch": 3.2578947368421054, + "grad_norm": 4.994183540344238, + "learning_rate": 1.7525058947298025e-06, + "loss": 0.7871, + "step": 1238 + }, + { + "epoch": 3.2578947368421054, + "step": 1238, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.2605263157894737, + "grad_norm": 5.398833751678467, + "learning_rate": 1.7404719529866032e-06, + "loss": 0.8843, + "step": 1239 + }, + { + "epoch": 3.2605263157894737, + "step": 1239, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 7.436490058898926, + "learning_rate": 1.7284755309856505e-06, + "loss": 0.9875, + "step": 1240 + }, + { + "epoch": 3.263157894736842, + "step": 1240, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.2657894736842104, + "grad_norm": 8.08215618133545, + "learning_rate": 1.716516683221906e-06, + "loss": 1.0361, + "step": 1241 + }, + { + "epoch": 3.2657894736842104, + "step": 1241, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.268421052631579, + "grad_norm": 6.869689464569092, + "learning_rate": 1.7045954640196448e-06, + "loss": 0.9634, + "step": 1242 + }, + { + "epoch": 3.268421052631579, + "step": 1242, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.2710526315789474, + "grad_norm": 9.234119415283203, + "learning_rate": 1.6927119275322135e-06, + "loss": 0.8782, + "step": 1243 + }, + { + "epoch": 3.2710526315789474, + "step": 1243, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.2736842105263158, + "grad_norm": 6.922120571136475, + "learning_rate": 1.6808661277417793e-06, + "loss": 0.8464, + "step": 1244 + }, + { + "epoch": 3.2736842105263158, + "step": 1244, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.276315789473684, + "grad_norm": 9.614771842956543, + "learning_rate": 1.6690581184590859e-06, + "loss": 0.9414, + "step": 1245 + }, + { + "epoch": 3.276315789473684, + "step": 1245, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.2789473684210524, + "grad_norm": 6.084325313568115, + "learning_rate": 1.657287953323211e-06, + "loss": 0.9082, + "step": 1246 + }, + { + "epoch": 3.2789473684210524, + "step": 1246, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.281578947368421, + "grad_norm": 8.449655532836914, + "learning_rate": 1.6455556858013222e-06, + "loss": 0.8462, + "step": 1247 + }, + { + "epoch": 3.281578947368421, + "step": 1247, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.2842105263157895, + "grad_norm": 5.102181434631348, + "learning_rate": 1.6338613691884308e-06, + "loss": 0.9702, + "step": 1248 + }, + { + "epoch": 3.2842105263157895, + "step": 1248, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.286842105263158, + "grad_norm": 5.518160820007324, + "learning_rate": 1.6222050566071545e-06, + "loss": 0.8591, + "step": 1249 + }, + { + "epoch": 3.286842105263158, + "step": 1249, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 5.898723125457764, + "learning_rate": 1.6105868010074744e-06, + "loss": 0.8779, + "step": 1250 + }, + { + "epoch": 3.2894736842105265, + "step": 1250, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.292105263157895, + "grad_norm": 8.095972061157227, + "learning_rate": 1.5990066551664906e-06, + "loss": 0.9385, + "step": 1251 + }, + { + "epoch": 3.292105263157895, + "step": 1251, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.294736842105263, + "grad_norm": 6.1568074226379395, + "learning_rate": 1.587464671688187e-06, + "loss": 0.9697, + "step": 1252 + }, + { + "epoch": 3.294736842105263, + "step": 1252, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.2973684210526315, + "grad_norm": 6.438913345336914, + "learning_rate": 1.5759609030031986e-06, + "loss": 0.8687, + "step": 1253 + }, + { + "epoch": 3.2973684210526315, + "step": 1253, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.3, + "grad_norm": 6.652143478393555, + "learning_rate": 1.5644954013685486e-06, + "loss": 0.7432, + "step": 1254 + }, + { + "epoch": 3.3, + "step": 1254, + "train_accuracy": 0.96875 + }, + { + "epoch": 3.3026315789473686, + "grad_norm": 6.559475421905518, + "learning_rate": 1.5530682188674506e-06, + "loss": 0.8599, + "step": 1255 + }, + { + "epoch": 3.3026315789473686, + "step": 1255, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.305263157894737, + "grad_norm": 5.773578643798828, + "learning_rate": 1.5416794074090258e-06, + "loss": 0.772, + "step": 1256 + }, + { + "epoch": 3.305263157894737, + "step": 1256, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.307894736842105, + "grad_norm": 5.565701484680176, + "learning_rate": 1.530329018728114e-06, + "loss": 0.8916, + "step": 1257 + }, + { + "epoch": 3.307894736842105, + "step": 1257, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.3105263157894735, + "grad_norm": 5.266923904418945, + "learning_rate": 1.5190171043849955e-06, + "loss": 0.9146, + "step": 1258 + }, + { + "epoch": 3.3105263157894735, + "step": 1258, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3131578947368423, + "grad_norm": 6.593441486358643, + "learning_rate": 1.5077437157651942e-06, + "loss": 0.8623, + "step": 1259 + }, + { + "epoch": 3.3131578947368423, + "step": 1259, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 5.083078861236572, + "learning_rate": 1.4965089040792114e-06, + "loss": 0.8208, + "step": 1260 + }, + { + "epoch": 3.3157894736842106, + "step": 1260, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.318421052631579, + "grad_norm": 10.047627449035645, + "learning_rate": 1.4853127203623253e-06, + "loss": 0.9258, + "step": 1261 + }, + { + "epoch": 3.318421052631579, + "step": 1261, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.3210526315789473, + "grad_norm": 7.929150581359863, + "learning_rate": 1.4741552154743243e-06, + "loss": 1.1812, + "step": 1262 + }, + { + "epoch": 3.3210526315789473, + "step": 1262, + "train_accuracy": 0.875 + }, + { + "epoch": 3.3236842105263156, + "grad_norm": 5.663976669311523, + "learning_rate": 1.4630364400993123e-06, + "loss": 0.8647, + "step": 1263 + }, + { + "epoch": 3.3236842105263156, + "step": 1263, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.3263157894736843, + "grad_norm": 6.1754961013793945, + "learning_rate": 1.451956444745445e-06, + "loss": 0.7129, + "step": 1264 + }, + { + "epoch": 3.3263157894736843, + "step": 1264, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.3289473684210527, + "grad_norm": 5.069157600402832, + "learning_rate": 1.440915279744729e-06, + "loss": 0.854, + "step": 1265 + }, + { + "epoch": 3.3289473684210527, + "step": 1265, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.331578947368421, + "grad_norm": 5.302091121673584, + "learning_rate": 1.4299129952527678e-06, + "loss": 0.7041, + "step": 1266 + }, + { + "epoch": 3.331578947368421, + "step": 1266, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.3342105263157893, + "grad_norm": 7.704717636108398, + "learning_rate": 1.4189496412485593e-06, + "loss": 0.7681, + "step": 1267 + }, + { + "epoch": 3.3342105263157893, + "step": 1267, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.336842105263158, + "grad_norm": 4.745948314666748, + "learning_rate": 1.408025267534242e-06, + "loss": 0.7314, + "step": 1268 + }, + { + "epoch": 3.336842105263158, + "step": 1268, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.3394736842105264, + "grad_norm": 5.072338581085205, + "learning_rate": 1.3971399237348936e-06, + "loss": 0.9341, + "step": 1269 + }, + { + "epoch": 3.3394736842105264, + "step": 1269, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 9.687994956970215, + "learning_rate": 1.38629365929829e-06, + "loss": 0.9022, + "step": 1270 + }, + { + "epoch": 3.3421052631578947, + "step": 1270, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.344736842105263, + "grad_norm": 7.591251373291016, + "learning_rate": 1.3754865234946835e-06, + "loss": 1.0562, + "step": 1271 + }, + { + "epoch": 3.344736842105263, + "step": 1271, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.3473684210526318, + "grad_norm": 5.41000509262085, + "learning_rate": 1.364718565416584e-06, + "loss": 0.9058, + "step": 1272 + }, + { + "epoch": 3.3473684210526318, + "step": 1272, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.35, + "grad_norm": 12.85086441040039, + "learning_rate": 1.3539898339785307e-06, + "loss": 0.9321, + "step": 1273 + }, + { + "epoch": 3.35, + "step": 1273, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.3526315789473684, + "grad_norm": 9.143210411071777, + "learning_rate": 1.343300377916873e-06, + "loss": 1.1162, + "step": 1274 + }, + { + "epoch": 3.3526315789473684, + "step": 1274, + "train_accuracy": 0.875 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 6.773848056793213, + "learning_rate": 1.3326502457895452e-06, + "loss": 0.7704, + "step": 1275 + }, + { + "epoch": 3.3552631578947367, + "step": 1275, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.3578947368421055, + "grad_norm": 4.66011905670166, + "learning_rate": 1.3220394859758512e-06, + "loss": 0.7253, + "step": 1276 + }, + { + "epoch": 3.3578947368421055, + "step": 1276, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.360526315789474, + "grad_norm": 4.582793235778809, + "learning_rate": 1.3114681466762424e-06, + "loss": 0.7734, + "step": 1277 + }, + { + "epoch": 3.360526315789474, + "step": 1277, + "train_accuracy": 0.875 + }, + { + "epoch": 3.363157894736842, + "grad_norm": 11.202856063842773, + "learning_rate": 1.300936275912098e-06, + "loss": 0.906, + "step": 1278 + }, + { + "epoch": 3.363157894736842, + "step": 1278, + "train_accuracy": 0.765625 + }, + { + "epoch": 3.3657894736842104, + "grad_norm": 6.078808784484863, + "learning_rate": 1.2904439215255049e-06, + "loss": 0.9971, + "step": 1279 + }, + { + "epoch": 3.3657894736842104, + "step": 1279, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 7.51846981048584, + "learning_rate": 1.279991131179048e-06, + "loss": 0.8589, + "step": 1280 + }, + { + "epoch": 3.3684210526315788, + "step": 1280, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3710526315789475, + "grad_norm": 7.458414554595947, + "learning_rate": 1.269577952355583e-06, + "loss": 0.7031, + "step": 1281 + }, + { + "epoch": 3.3710526315789475, + "step": 1281, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.373684210526316, + "grad_norm": 8.030908584594727, + "learning_rate": 1.25920443235803e-06, + "loss": 0.9263, + "step": 1282 + }, + { + "epoch": 3.373684210526316, + "step": 1282, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.376315789473684, + "grad_norm": 8.650345802307129, + "learning_rate": 1.2488706183091526e-06, + "loss": 0.6409, + "step": 1283 + }, + { + "epoch": 3.376315789473684, + "step": 1283, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.3789473684210525, + "grad_norm": 5.0357985496521, + "learning_rate": 1.2385765571513498e-06, + "loss": 0.9023, + "step": 1284 + }, + { + "epoch": 3.3789473684210525, + "step": 1284, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.3815789473684212, + "grad_norm": 4.378666877746582, + "learning_rate": 1.2283222956464336e-06, + "loss": 0.7954, + "step": 1285 + }, + { + "epoch": 3.3815789473684212, + "step": 1285, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3842105263157896, + "grad_norm": 4.937459468841553, + "learning_rate": 1.2181078803754254e-06, + "loss": 0.8413, + "step": 1286 + }, + { + "epoch": 3.3842105263157896, + "step": 1286, + "train_accuracy": 0.875 + }, + { + "epoch": 3.386842105263158, + "grad_norm": 7.140615463256836, + "learning_rate": 1.2079333577383422e-06, + "loss": 1.0811, + "step": 1287 + }, + { + "epoch": 3.386842105263158, + "step": 1287, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.389473684210526, + "grad_norm": 7.88900899887085, + "learning_rate": 1.1977987739539798e-06, + "loss": 1.021, + "step": 1288 + }, + { + "epoch": 3.389473684210526, + "step": 1288, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3921052631578945, + "grad_norm": 8.951826095581055, + "learning_rate": 1.1877041750597174e-06, + "loss": 0.8384, + "step": 1289 + }, + { + "epoch": 3.3921052631578945, + "step": 1289, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 4.733591556549072, + "learning_rate": 1.177649606911283e-06, + "loss": 0.875, + "step": 1290 + }, + { + "epoch": 3.3947368421052633, + "step": 1290, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.3973684210526316, + "grad_norm": 6.041803359985352, + "learning_rate": 1.1676351151825803e-06, + "loss": 0.7407, + "step": 1291 + }, + { + "epoch": 3.3973684210526316, + "step": 1291, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.4, + "grad_norm": 4.956209182739258, + "learning_rate": 1.157660745365442e-06, + "loss": 0.7407, + "step": 1292 + }, + { + "epoch": 3.4, + "eval_accuracy": 0.7088474035263062, + "eval_max_score": 11.3125, + "eval_min_score": -13.875, + "eval_runtime": 151.4716, + "eval_samples_per_second": 18.73, + "eval_steps_per_second": 0.297, + "step": 1292 + }, + { + "epoch": 3.4, + "step": 1292, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.4026315789473682, + "grad_norm": 5.408447742462158, + "learning_rate": 1.1477265427694616e-06, + "loss": 0.8413, + "step": 1293 + }, + { + "epoch": 3.4026315789473682, + "step": 1293, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.405263157894737, + "grad_norm": 6.186283588409424, + "learning_rate": 1.1378325525217516e-06, + "loss": 0.8872, + "step": 1294 + }, + { + "epoch": 3.405263157894737, + "step": 1294, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4078947368421053, + "grad_norm": 6.49485445022583, + "learning_rate": 1.127978819566773e-06, + "loss": 0.7852, + "step": 1295 + }, + { + "epoch": 3.4078947368421053, + "step": 1295, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.4105263157894736, + "grad_norm": 5.673926830291748, + "learning_rate": 1.1181653886660959e-06, + "loss": 0.7651, + "step": 1296 + }, + { + "epoch": 3.4105263157894736, + "step": 1296, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.413157894736842, + "grad_norm": 5.944647789001465, + "learning_rate": 1.1083923043982303e-06, + "loss": 1.0059, + "step": 1297 + }, + { + "epoch": 3.413157894736842, + "step": 1297, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.4157894736842107, + "grad_norm": 7.421634674072266, + "learning_rate": 1.098659611158399e-06, + "loss": 0.9417, + "step": 1298 + }, + { + "epoch": 3.4157894736842107, + "step": 1298, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.418421052631579, + "grad_norm": 7.423999309539795, + "learning_rate": 1.0889673531583466e-06, + "loss": 0.9941, + "step": 1299 + }, + { + "epoch": 3.418421052631579, + "step": 1299, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 8.161972999572754, + "learning_rate": 1.0793155744261352e-06, + "loss": 1.062, + "step": 1300 + }, + { + "epoch": 3.4210526315789473, + "step": 1300, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4236842105263157, + "grad_norm": 6.31049108505249, + "learning_rate": 1.0697043188059475e-06, + "loss": 0.7502, + "step": 1301 + }, + { + "epoch": 3.4236842105263157, + "step": 1301, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.4263157894736844, + "grad_norm": 4.8111772537231445, + "learning_rate": 1.0601336299578834e-06, + "loss": 0.7729, + "step": 1302 + }, + { + "epoch": 3.4263157894736844, + "step": 1302, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.4289473684210527, + "grad_norm": 7.701916694641113, + "learning_rate": 1.0506035513577683e-06, + "loss": 0.8699, + "step": 1303 + }, + { + "epoch": 3.4289473684210527, + "step": 1303, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.431578947368421, + "grad_norm": 4.927713871002197, + "learning_rate": 1.0411141262969482e-06, + "loss": 0.8628, + "step": 1304 + }, + { + "epoch": 3.431578947368421, + "step": 1304, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4342105263157894, + "grad_norm": 5.331511497497559, + "learning_rate": 1.0316653978820956e-06, + "loss": 0.8447, + "step": 1305 + }, + { + "epoch": 3.4342105263157894, + "step": 1305, + "train_accuracy": 0.875 + }, + { + "epoch": 3.4368421052631577, + "grad_norm": 5.308257102966309, + "learning_rate": 1.0222574090350169e-06, + "loss": 0.7975, + "step": 1306 + }, + { + "epoch": 3.4368421052631577, + "step": 1306, + "train_accuracy": 0.875 + }, + { + "epoch": 3.4394736842105265, + "grad_norm": 5.9786272048950195, + "learning_rate": 1.0128902024924547e-06, + "loss": 0.8286, + "step": 1307 + }, + { + "epoch": 3.4394736842105265, + "step": 1307, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.442105263157895, + "grad_norm": 4.495826244354248, + "learning_rate": 1.0035638208058907e-06, + "loss": 0.7522, + "step": 1308 + }, + { + "epoch": 3.442105263157895, + "step": 1308, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.444736842105263, + "grad_norm": 6.9757208824157715, + "learning_rate": 9.942783063413596e-07, + "loss": 0.9727, + "step": 1309 + }, + { + "epoch": 3.444736842105263, + "step": 1309, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 5.75640344619751, + "learning_rate": 9.850337012792499e-07, + "loss": 0.8916, + "step": 1310 + }, + { + "epoch": 3.4473684210526314, + "step": 1310, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.45, + "grad_norm": 6.0769524574279785, + "learning_rate": 9.75830047614117e-07, + "loss": 0.802, + "step": 1311 + }, + { + "epoch": 3.45, + "step": 1311, + "train_accuracy": 0.96875 + }, + { + "epoch": 3.4526315789473685, + "grad_norm": 5.191233158111572, + "learning_rate": 9.666673871544884e-07, + "loss": 0.8542, + "step": 1312 + }, + { + "epoch": 3.4526315789473685, + "step": 1312, + "train_accuracy": 0.875 + }, + { + "epoch": 3.455263157894737, + "grad_norm": 10.107154846191406, + "learning_rate": 9.57545761522677e-07, + "loss": 0.6987, + "step": 1313 + }, + { + "epoch": 3.455263157894737, + "step": 1313, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.457894736842105, + "grad_norm": 5.977207660675049, + "learning_rate": 9.484652121545901e-07, + "loss": 0.8047, + "step": 1314 + }, + { + "epoch": 3.457894736842105, + "step": 1314, + "train_accuracy": 0.875 + }, + { + "epoch": 3.4605263157894735, + "grad_norm": 5.891763687133789, + "learning_rate": 9.394257802995432e-07, + "loss": 0.7876, + "step": 1315 + }, + { + "epoch": 3.4605263157894735, + "step": 1315, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.463157894736842, + "grad_norm": 7.937765121459961, + "learning_rate": 9.304275070200697e-07, + "loss": 0.8418, + "step": 1316 + }, + { + "epoch": 3.463157894736842, + "step": 1316, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.4657894736842105, + "grad_norm": 7.354846000671387, + "learning_rate": 9.214704331917356e-07, + "loss": 1.0989, + "step": 1317 + }, + { + "epoch": 3.4657894736842105, + "step": 1317, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.468421052631579, + "grad_norm": 5.987973213195801, + "learning_rate": 9.125545995029616e-07, + "loss": 0.7869, + "step": 1318 + }, + { + "epoch": 3.468421052631579, + "step": 1318, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4710526315789476, + "grad_norm": 5.579070568084717, + "learning_rate": 9.036800464548157e-07, + "loss": 0.77, + "step": 1319 + }, + { + "epoch": 3.4710526315789476, + "step": 1319, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 7.5642523765563965, + "learning_rate": 8.948468143608624e-07, + "loss": 0.9525, + "step": 1320 + }, + { + "epoch": 3.473684210526316, + "step": 1320, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.4763157894736842, + "grad_norm": 6.69650936126709, + "learning_rate": 8.860549433469445e-07, + "loss": 0.9917, + "step": 1321 + }, + { + "epoch": 3.4763157894736842, + "step": 1321, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4789473684210526, + "grad_norm": 6.561551570892334, + "learning_rate": 8.773044733510338e-07, + "loss": 0.9126, + "step": 1322 + }, + { + "epoch": 3.4789473684210526, + "step": 1322, + "train_accuracy": 0.875 + }, + { + "epoch": 3.481578947368421, + "grad_norm": 4.962501525878906, + "learning_rate": 8.685954441230182e-07, + "loss": 0.8538, + "step": 1323 + }, + { + "epoch": 3.481578947368421, + "step": 1323, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4842105263157896, + "grad_norm": 5.327600479125977, + "learning_rate": 8.599278952245504e-07, + "loss": 0.9084, + "step": 1324 + }, + { + "epoch": 3.4842105263157896, + "step": 1324, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 7.786504745483398, + "learning_rate": 8.513018660288475e-07, + "loss": 1.0999, + "step": 1325 + }, + { + "epoch": 3.486842105263158, + "step": 1325, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.4894736842105263, + "grad_norm": 7.61739444732666, + "learning_rate": 8.427173957205193e-07, + "loss": 0.7703, + "step": 1326 + }, + { + "epoch": 3.4894736842105263, + "step": 1326, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.4921052631578946, + "grad_norm": 5.046051502227783, + "learning_rate": 8.341745232953913e-07, + "loss": 0.8911, + "step": 1327 + }, + { + "epoch": 3.4921052631578946, + "step": 1327, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4947368421052634, + "grad_norm": 6.484220027923584, + "learning_rate": 8.256732875603269e-07, + "loss": 0.9453, + "step": 1328 + }, + { + "epoch": 3.4947368421052634, + "step": 1328, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.4973684210526317, + "grad_norm": 5.907304286956787, + "learning_rate": 8.172137271330494e-07, + "loss": 0.7571, + "step": 1329 + }, + { + "epoch": 3.4973684210526317, + "step": 1329, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.5, + "grad_norm": 5.444192409515381, + "learning_rate": 8.087958804419682e-07, + "loss": 0.7827, + "step": 1330 + }, + { + "epoch": 3.5, + "step": 1330, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.5026315789473683, + "grad_norm": 9.199472427368164, + "learning_rate": 8.004197857260043e-07, + "loss": 1.0444, + "step": 1331 + }, + { + "epoch": 3.5026315789473683, + "step": 1331, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.5052631578947366, + "grad_norm": 5.791006565093994, + "learning_rate": 7.920854810344159e-07, + "loss": 0.915, + "step": 1332 + }, + { + "epoch": 3.5052631578947366, + "step": 1332, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.5078947368421054, + "grad_norm": 4.752233982086182, + "learning_rate": 7.837930042266262e-07, + "loss": 0.7422, + "step": 1333 + }, + { + "epoch": 3.5078947368421054, + "step": 1333, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.5105263157894737, + "grad_norm": 6.618746757507324, + "learning_rate": 7.755423929720496e-07, + "loss": 0.7988, + "step": 1334 + }, + { + "epoch": 3.5105263157894737, + "step": 1334, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.513157894736842, + "grad_norm": 5.868110179901123, + "learning_rate": 7.673336847499235e-07, + "loss": 0.8191, + "step": 1335 + }, + { + "epoch": 3.513157894736842, + "step": 1335, + "train_accuracy": 0.875 + }, + { + "epoch": 3.515789473684211, + "grad_norm": 7.067508220672607, + "learning_rate": 7.591669168491355e-07, + "loss": 0.9233, + "step": 1336 + }, + { + "epoch": 3.515789473684211, + "step": 1336, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.518421052631579, + "grad_norm": 5.433889389038086, + "learning_rate": 7.510421263680567e-07, + "loss": 0.8657, + "step": 1337 + }, + { + "epoch": 3.518421052631579, + "step": 1337, + "train_accuracy": 0.875 + }, + { + "epoch": 3.5210526315789474, + "grad_norm": 6.573571681976318, + "learning_rate": 7.429593502143684e-07, + "loss": 0.9448, + "step": 1338 + }, + { + "epoch": 3.5210526315789474, + "step": 1338, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.5236842105263158, + "grad_norm": 10.058076858520508, + "learning_rate": 7.349186251049012e-07, + "loss": 0.9507, + "step": 1339 + }, + { + "epoch": 3.5236842105263158, + "step": 1339, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 7.550402641296387, + "learning_rate": 7.26919987565462e-07, + "loss": 0.8738, + "step": 1340 + }, + { + "epoch": 3.526315789473684, + "step": 1340, + "train_accuracy": 0.71875 + }, + { + "epoch": 3.5289473684210524, + "grad_norm": 7.782090663909912, + "learning_rate": 7.189634739306705e-07, + "loss": 1.0415, + "step": 1341 + }, + { + "epoch": 3.5289473684210524, + "step": 1341, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.531578947368421, + "grad_norm": 4.73071813583374, + "learning_rate": 7.110491203437985e-07, + "loss": 0.7476, + "step": 1342 + }, + { + "epoch": 3.531578947368421, + "step": 1342, + "train_accuracy": 0.875 + }, + { + "epoch": 3.5342105263157895, + "grad_norm": 5.576818466186523, + "learning_rate": 7.031769627565944e-07, + "loss": 0.8374, + "step": 1343 + }, + { + "epoch": 3.5342105263157895, + "step": 1343, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.536842105263158, + "grad_norm": 6.697592735290527, + "learning_rate": 6.953470369291349e-07, + "loss": 1.1177, + "step": 1344 + }, + { + "epoch": 3.536842105263158, + "step": 1344, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.5394736842105265, + "grad_norm": 6.47061014175415, + "learning_rate": 6.875593784296453e-07, + "loss": 0.7524, + "step": 1345 + }, + { + "epoch": 3.5394736842105265, + "step": 1345, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.542105263157895, + "grad_norm": 4.739287376403809, + "learning_rate": 6.798140226343597e-07, + "loss": 0.7229, + "step": 1346 + }, + { + "epoch": 3.542105263157895, + "step": 1346, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.544736842105263, + "grad_norm": 5.039734363555908, + "learning_rate": 6.721110047273338e-07, + "loss": 0.9312, + "step": 1347 + }, + { + "epoch": 3.544736842105263, + "step": 1347, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.5473684210526315, + "grad_norm": 5.539333820343018, + "learning_rate": 6.644503597003127e-07, + "loss": 0.8936, + "step": 1348 + }, + { + "epoch": 3.5473684210526315, + "step": 1348, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.55, + "grad_norm": 6.957169055938721, + "learning_rate": 6.568321223525453e-07, + "loss": 0.855, + "step": 1349 + }, + { + "epoch": 3.55, + "step": 1349, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 6.129119873046875, + "learning_rate": 6.492563272906527e-07, + "loss": 0.875, + "step": 1350 + }, + { + "epoch": 3.5526315789473686, + "step": 1350, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.555263157894737, + "grad_norm": 5.711894512176514, + "learning_rate": 6.417230089284443e-07, + "loss": 0.6982, + "step": 1351 + }, + { + "epoch": 3.555263157894737, + "step": 1351, + "train_accuracy": 0.875 + }, + { + "epoch": 3.557894736842105, + "grad_norm": 6.5783281326293945, + "learning_rate": 6.342322014867874e-07, + "loss": 0.7727, + "step": 1352 + }, + { + "epoch": 3.557894736842105, + "step": 1352, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.5605263157894735, + "grad_norm": 4.89669942855835, + "learning_rate": 6.267839389934272e-07, + "loss": 0.689, + "step": 1353 + }, + { + "epoch": 3.5605263157894735, + "step": 1353, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.5631578947368423, + "grad_norm": 6.40932559967041, + "learning_rate": 6.193782552828564e-07, + "loss": 0.6511, + "step": 1354 + }, + { + "epoch": 3.5631578947368423, + "step": 1354, + "train_accuracy": 0.875 + }, + { + "epoch": 3.5657894736842106, + "grad_norm": 4.5982537269592285, + "learning_rate": 6.120151839961363e-07, + "loss": 0.6316, + "step": 1355 + }, + { + "epoch": 3.5657894736842106, + "step": 1355, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.568421052631579, + "grad_norm": 5.86193323135376, + "learning_rate": 6.046947585807695e-07, + "loss": 0.8232, + "step": 1356 + }, + { + "epoch": 3.568421052631579, + "step": 1356, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.5710526315789473, + "grad_norm": 5.416151523590088, + "learning_rate": 5.974170122905243e-07, + "loss": 0.8751, + "step": 1357 + }, + { + "epoch": 3.5710526315789473, + "step": 1357, + "train_accuracy": 0.875 + }, + { + "epoch": 3.5736842105263156, + "grad_norm": 5.604320526123047, + "learning_rate": 5.901819781853047e-07, + "loss": 0.698, + "step": 1358 + }, + { + "epoch": 3.5736842105263156, + "step": 1358, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.5763157894736843, + "grad_norm": 6.725011348724365, + "learning_rate": 5.829896891309806e-07, + "loss": 0.865, + "step": 1359 + }, + { + "epoch": 3.5763157894736843, + "step": 1359, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 9.378782272338867, + "learning_rate": 5.758401777992572e-07, + "loss": 0.8921, + "step": 1360 + }, + { + "epoch": 3.5789473684210527, + "step": 1360, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.581578947368421, + "grad_norm": 5.4865007400512695, + "learning_rate": 5.687334766675123e-07, + "loss": 0.7825, + "step": 1361 + }, + { + "epoch": 3.581578947368421, + "step": 1361, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.5842105263157897, + "grad_norm": 6.844651222229004, + "learning_rate": 5.616696180186553e-07, + "loss": 0.8823, + "step": 1362 + }, + { + "epoch": 3.5842105263157897, + "step": 1362, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.586842105263158, + "grad_norm": 6.643460273742676, + "learning_rate": 5.546486339409773e-07, + "loss": 0.6953, + "step": 1363 + }, + { + "epoch": 3.586842105263158, + "step": 1363, + "train_accuracy": 0.734375 + }, + { + "epoch": 3.5894736842105264, + "grad_norm": 7.437619686126709, + "learning_rate": 5.476705563280116e-07, + "loss": 0.9561, + "step": 1364 + }, + { + "epoch": 3.5894736842105264, + "step": 1364, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.5921052631578947, + "grad_norm": 4.695252895355225, + "learning_rate": 5.407354168783807e-07, + "loss": 0.666, + "step": 1365 + }, + { + "epoch": 3.5921052631578947, + "step": 1365, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.594736842105263, + "grad_norm": 5.690319061279297, + "learning_rate": 5.33843247095659e-07, + "loss": 0.915, + "step": 1366 + }, + { + "epoch": 3.594736842105263, + "step": 1366, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.5973684210526313, + "grad_norm": 6.777496337890625, + "learning_rate": 5.269940782882233e-07, + "loss": 0.7043, + "step": 1367 + }, + { + "epoch": 3.5973684210526313, + "step": 1367, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.6, + "grad_norm": 6.535483360290527, + "learning_rate": 5.20187941569118e-07, + "loss": 0.8772, + "step": 1368 + }, + { + "epoch": 3.6, + "eval_accuracy": 0.7109622955322266, + "eval_max_score": 11.125, + "eval_min_score": -13.4375, + "eval_runtime": 151.4455, + "eval_samples_per_second": 18.733, + "eval_steps_per_second": 0.297, + "step": 1368 + }, + { + "epoch": 3.6, + "step": 1368, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.6026315789473684, + "grad_norm": 6.642302989959717, + "learning_rate": 5.134248678559072e-07, + "loss": 0.769, + "step": 1369 + }, + { + "epoch": 3.6026315789473684, + "step": 1369, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 5.750308036804199, + "learning_rate": 5.067048878705383e-07, + "loss": 0.8142, + "step": 1370 + }, + { + "epoch": 3.6052631578947367, + "step": 1370, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.6078947368421055, + "grad_norm": 5.423074245452881, + "learning_rate": 5.000280321392004e-07, + "loss": 0.7988, + "step": 1371 + }, + { + "epoch": 3.6078947368421055, + "step": 1371, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.610526315789474, + "grad_norm": 5.945747375488281, + "learning_rate": 4.933943309921862e-07, + "loss": 1.0742, + "step": 1372 + }, + { + "epoch": 3.610526315789474, + "step": 1372, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.613157894736842, + "grad_norm": 7.78245210647583, + "learning_rate": 4.868038145637566e-07, + "loss": 0.9067, + "step": 1373 + }, + { + "epoch": 3.613157894736842, + "step": 1373, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.6157894736842104, + "grad_norm": 7.036494731903076, + "learning_rate": 4.802565127919967e-07, + "loss": 0.8911, + "step": 1374 + }, + { + "epoch": 3.6157894736842104, + "step": 1374, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 4.94215726852417, + "learning_rate": 4.73752455418689e-07, + "loss": 0.7522, + "step": 1375 + }, + { + "epoch": 3.6184210526315788, + "step": 1375, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.6210526315789475, + "grad_norm": 6.040024757385254, + "learning_rate": 4.6729167198917293e-07, + "loss": 1.0239, + "step": 1376 + }, + { + "epoch": 3.6210526315789475, + "step": 1376, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.623684210526316, + "grad_norm": 4.882493495941162, + "learning_rate": 4.6087419185220973e-07, + "loss": 0.9067, + "step": 1377 + }, + { + "epoch": 3.623684210526316, + "step": 1377, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.626315789473684, + "grad_norm": 6.596987724304199, + "learning_rate": 4.545000441598557e-07, + "loss": 0.9243, + "step": 1378 + }, + { + "epoch": 3.626315789473684, + "step": 1378, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.6289473684210525, + "grad_norm": 6.559911727905273, + "learning_rate": 4.481692578673169e-07, + "loss": 0.894, + "step": 1379 + }, + { + "epoch": 3.6289473684210525, + "step": 1379, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 4.946629524230957, + "learning_rate": 4.418818617328369e-07, + "loss": 0.8372, + "step": 1380 + }, + { + "epoch": 3.6315789473684212, + "step": 1380, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6342105263157896, + "grad_norm": 4.552062034606934, + "learning_rate": 4.356378843175446e-07, + "loss": 0.8743, + "step": 1381 + }, + { + "epoch": 3.6342105263157896, + "step": 1381, + "train_accuracy": 0.875 + }, + { + "epoch": 3.636842105263158, + "grad_norm": 5.209609031677246, + "learning_rate": 4.2943735398534246e-07, + "loss": 0.8052, + "step": 1382 + }, + { + "epoch": 3.636842105263158, + "step": 1382, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.639473684210526, + "grad_norm": 7.821629524230957, + "learning_rate": 4.232802989027629e-07, + "loss": 1.0181, + "step": 1383 + }, + { + "epoch": 3.639473684210526, + "step": 1383, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.6421052631578945, + "grad_norm": 8.805258750915527, + "learning_rate": 4.171667470388574e-07, + "loss": 1.1377, + "step": 1384 + }, + { + "epoch": 3.6421052631578945, + "step": 1384, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.6447368421052633, + "grad_norm": 6.199164390563965, + "learning_rate": 4.110967261650489e-07, + "loss": 0.9255, + "step": 1385 + }, + { + "epoch": 3.6447368421052633, + "step": 1385, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6473684210526316, + "grad_norm": 4.666980266571045, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.7688, + "step": 1386 + }, + { + "epoch": 3.6473684210526316, + "step": 1386, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.65, + "grad_norm": 6.014039993286133, + "learning_rate": 3.990873874846013e-07, + "loss": 0.9492, + "step": 1387 + }, + { + "epoch": 3.65, + "step": 1387, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.6526315789473687, + "grad_norm": 4.869380950927734, + "learning_rate": 3.931481242315993e-07, + "loss": 0.7153, + "step": 1388 + }, + { + "epoch": 3.6526315789473687, + "step": 1388, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.655263157894737, + "grad_norm": 5.03730583190918, + "learning_rate": 3.8725250107571887e-07, + "loss": 0.8467, + "step": 1389 + }, + { + "epoch": 3.655263157894737, + "step": 1389, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.6578947368421053, + "grad_norm": 7.302511692047119, + "learning_rate": 3.814005447984315e-07, + "loss": 0.6509, + "step": 1390 + }, + { + "epoch": 3.6578947368421053, + "step": 1390, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.6605263157894736, + "grad_norm": 7.450629711151123, + "learning_rate": 3.7559228198283505e-07, + "loss": 0.8364, + "step": 1391 + }, + { + "epoch": 3.6605263157894736, + "step": 1391, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.663157894736842, + "grad_norm": 7.157631874084473, + "learning_rate": 3.6982773901355626e-07, + "loss": 0.8926, + "step": 1392 + }, + { + "epoch": 3.663157894736842, + "step": 1392, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.6657894736842103, + "grad_norm": 6.8965840339660645, + "learning_rate": 3.641069420766108e-07, + "loss": 0.9307, + "step": 1393 + }, + { + "epoch": 3.6657894736842103, + "step": 1393, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.668421052631579, + "grad_norm": 5.144319534301758, + "learning_rate": 3.58429917159302e-07, + "loss": 0.8076, + "step": 1394 + }, + { + "epoch": 3.668421052631579, + "step": 1394, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6710526315789473, + "grad_norm": 5.101710319519043, + "learning_rate": 3.5279669005008786e-07, + "loss": 0.7998, + "step": 1395 + }, + { + "epoch": 3.6710526315789473, + "step": 1395, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.6736842105263157, + "grad_norm": 5.39324426651001, + "learning_rate": 3.4720728633847235e-07, + "loss": 0.771, + "step": 1396 + }, + { + "epoch": 3.6736842105263157, + "step": 1396, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.6763157894736844, + "grad_norm": 8.216167449951172, + "learning_rate": 3.416617314148896e-07, + "loss": 0.7695, + "step": 1397 + }, + { + "epoch": 3.6763157894736844, + "step": 1397, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6789473684210527, + "grad_norm": 4.266907691955566, + "learning_rate": 3.36160050470582e-07, + "loss": 0.8184, + "step": 1398 + }, + { + "epoch": 3.6789473684210527, + "step": 1398, + "train_accuracy": 0.875 + }, + { + "epoch": 3.681578947368421, + "grad_norm": 9.431413650512695, + "learning_rate": 3.3070226849749367e-07, + "loss": 0.8953, + "step": 1399 + }, + { + "epoch": 3.681578947368421, + "step": 1399, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 7.751030445098877, + "learning_rate": 3.252884102881515e-07, + "loss": 1.0225, + "step": 1400 + }, + { + "epoch": 3.6842105263157894, + "step": 1400, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.6868421052631577, + "grad_norm": 5.091083526611328, + "learning_rate": 3.199185004355543e-07, + "loss": 0.687, + "step": 1401 + }, + { + "epoch": 3.6868421052631577, + "step": 1401, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.6894736842105265, + "grad_norm": 5.090567588806152, + "learning_rate": 3.1459256333306044e-07, + "loss": 0.834, + "step": 1402 + }, + { + "epoch": 3.6894736842105265, + "step": 1402, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.692105263157895, + "grad_norm": 6.143687725067139, + "learning_rate": 3.093106231742793e-07, + "loss": 0.9241, + "step": 1403 + }, + { + "epoch": 3.692105263157895, + "step": 1403, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.694736842105263, + "grad_norm": 5.69082498550415, + "learning_rate": 3.04072703952959e-07, + "loss": 0.9238, + "step": 1404 + }, + { + "epoch": 3.694736842105263, + "step": 1404, + "train_accuracy": 0.875 + }, + { + "epoch": 3.6973684210526314, + "grad_norm": 4.9670491218566895, + "learning_rate": 2.988788294628764e-07, + "loss": 0.8506, + "step": 1405 + }, + { + "epoch": 3.6973684210526314, + "step": 1405, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.7, + "grad_norm": 6.865701198577881, + "learning_rate": 2.93729023297733e-07, + "loss": 0.8594, + "step": 1406 + }, + { + "epoch": 3.7, + "step": 1406, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.7026315789473685, + "grad_norm": 7.327455520629883, + "learning_rate": 2.8862330885104485e-07, + "loss": 0.917, + "step": 1407 + }, + { + "epoch": 3.7026315789473685, + "step": 1407, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.705263157894737, + "grad_norm": 4.919546604156494, + "learning_rate": 2.8356170931603587e-07, + "loss": 0.6611, + "step": 1408 + }, + { + "epoch": 3.705263157894737, + "step": 1408, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.707894736842105, + "grad_norm": 6.415260314941406, + "learning_rate": 2.78544247685536e-07, + "loss": 1.0083, + "step": 1409 + }, + { + "epoch": 3.707894736842105, + "step": 1409, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 7.830519676208496, + "learning_rate": 2.735709467518699e-07, + "loss": 0.6174, + "step": 1410 + }, + { + "epoch": 3.7105263157894735, + "step": 1410, + "train_accuracy": 0.75 + }, + { + "epoch": 3.713157894736842, + "grad_norm": 6.5190229415893555, + "learning_rate": 2.6864182910676275e-07, + "loss": 1.0508, + "step": 1411 + }, + { + "epoch": 3.713157894736842, + "step": 1411, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.7157894736842105, + "grad_norm": 5.143101692199707, + "learning_rate": 2.637569171412302e-07, + "loss": 0.9497, + "step": 1412 + }, + { + "epoch": 3.7157894736842105, + "step": 1412, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.718421052631579, + "grad_norm": 8.717037200927734, + "learning_rate": 2.5891623304547644e-07, + "loss": 1.0107, + "step": 1413 + }, + { + "epoch": 3.718421052631579, + "step": 1413, + "train_accuracy": 0.875 + }, + { + "epoch": 3.7210526315789476, + "grad_norm": 10.112276077270508, + "learning_rate": 2.54119798808804e-07, + "loss": 0.8257, + "step": 1414 + }, + { + "epoch": 3.7210526315789476, + "step": 1414, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.723684210526316, + "grad_norm": 6.264548301696777, + "learning_rate": 2.4936763621949743e-07, + "loss": 1.1211, + "step": 1415 + }, + { + "epoch": 3.723684210526316, + "step": 1415, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.7263157894736842, + "grad_norm": 5.747198581695557, + "learning_rate": 2.446597668647366e-07, + "loss": 0.6389, + "step": 1416 + }, + { + "epoch": 3.7263157894736842, + "step": 1416, + "train_accuracy": 0.875 + }, + { + "epoch": 3.7289473684210526, + "grad_norm": 4.447943210601807, + "learning_rate": 2.399962121304966e-07, + "loss": 0.7246, + "step": 1417 + }, + { + "epoch": 3.7289473684210526, + "step": 1417, + "train_accuracy": 0.875 + }, + { + "epoch": 3.731578947368421, + "grad_norm": 6.314094066619873, + "learning_rate": 2.3537699320144493e-07, + "loss": 0.7841, + "step": 1418 + }, + { + "epoch": 3.731578947368421, + "step": 1418, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.734210526315789, + "grad_norm": 5.254788398742676, + "learning_rate": 2.3080213106085104e-07, + "loss": 0.8262, + "step": 1419 + }, + { + "epoch": 3.734210526315789, + "step": 1419, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 5.414982318878174, + "learning_rate": 2.2627164649049128e-07, + "loss": 0.7561, + "step": 1420 + }, + { + "epoch": 3.736842105263158, + "step": 1420, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.7394736842105263, + "grad_norm": 6.738826751708984, + "learning_rate": 2.2178556007054876e-07, + "loss": 0.7996, + "step": 1421 + }, + { + "epoch": 3.7394736842105263, + "step": 1421, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.7421052631578946, + "grad_norm": 9.523528099060059, + "learning_rate": 2.1734389217952455e-07, + "loss": 1.0396, + "step": 1422 + }, + { + "epoch": 3.7421052631578946, + "step": 1422, + "train_accuracy": 0.875 + }, + { + "epoch": 3.7447368421052634, + "grad_norm": 9.194483757019043, + "learning_rate": 2.129466629941468e-07, + "loss": 0.7188, + "step": 1423 + }, + { + "epoch": 3.7447368421052634, + "step": 1423, + "train_accuracy": 0.875 + }, + { + "epoch": 3.7473684210526317, + "grad_norm": 5.894323348999023, + "learning_rate": 2.0859389248927275e-07, + "loss": 0.8308, + "step": 1424 + }, + { + "epoch": 3.7473684210526317, + "step": 1424, + "train_accuracy": 0.765625 + }, + { + "epoch": 3.75, + "grad_norm": 8.306107521057129, + "learning_rate": 2.0428560043780355e-07, + "loss": 0.9111, + "step": 1425 + }, + { + "epoch": 3.75, + "step": 1425, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.7526315789473683, + "grad_norm": 6.247003078460693, + "learning_rate": 2.000218064105919e-07, + "loss": 0.8145, + "step": 1426 + }, + { + "epoch": 3.7526315789473683, + "step": 1426, + "train_accuracy": 0.875 + }, + { + "epoch": 3.7552631578947366, + "grad_norm": 7.513801574707031, + "learning_rate": 1.9580252977635327e-07, + "loss": 0.9038, + "step": 1427 + }, + { + "epoch": 3.7552631578947366, + "step": 1427, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.7578947368421054, + "grad_norm": 4.72861909866333, + "learning_rate": 1.9162778970157947e-07, + "loss": 0.6511, + "step": 1428 + }, + { + "epoch": 3.7578947368421054, + "step": 1428, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.7605263157894737, + "grad_norm": 6.055332183837891, + "learning_rate": 1.8749760515044957e-07, + "loss": 0.8096, + "step": 1429 + }, + { + "epoch": 3.7605263157894737, + "step": 1429, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 5.819669723510742, + "learning_rate": 1.834119948847457e-07, + "loss": 0.9575, + "step": 1430 + }, + { + "epoch": 3.763157894736842, + "step": 1430, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.765789473684211, + "grad_norm": 6.1318511962890625, + "learning_rate": 1.793709774637653e-07, + "loss": 0.8774, + "step": 1431 + }, + { + "epoch": 3.765789473684211, + "step": 1431, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.768421052631579, + "grad_norm": 5.945221900939941, + "learning_rate": 1.7537457124423896e-07, + "loss": 0.7466, + "step": 1432 + }, + { + "epoch": 3.768421052631579, + "step": 1432, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.7710526315789474, + "grad_norm": 9.892115592956543, + "learning_rate": 1.7142279438024713e-07, + "loss": 0.9128, + "step": 1433 + }, + { + "epoch": 3.7710526315789474, + "step": 1433, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.7736842105263158, + "grad_norm": 6.725366115570068, + "learning_rate": 1.6751566482313463e-07, + "loss": 0.9673, + "step": 1434 + }, + { + "epoch": 3.7736842105263158, + "step": 1434, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.776315789473684, + "grad_norm": 7.097107410430908, + "learning_rate": 1.6365320032143527e-07, + "loss": 1.1465, + "step": 1435 + }, + { + "epoch": 3.776315789473684, + "step": 1435, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.7789473684210524, + "grad_norm": 5.92283296585083, + "learning_rate": 1.5983541842078398e-07, + "loss": 0.7777, + "step": 1436 + }, + { + "epoch": 3.7789473684210524, + "step": 1436, + "train_accuracy": 0.875 + }, + { + "epoch": 3.781578947368421, + "grad_norm": 5.0878143310546875, + "learning_rate": 1.560623364638414e-07, + "loss": 0.6826, + "step": 1437 + }, + { + "epoch": 3.781578947368421, + "step": 1437, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.7842105263157895, + "grad_norm": 7.022207260131836, + "learning_rate": 1.523339715902139e-07, + "loss": 0.8191, + "step": 1438 + }, + { + "epoch": 3.7842105263157895, + "step": 1438, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.786842105263158, + "grad_norm": 6.04548454284668, + "learning_rate": 1.486503407363782e-07, + "loss": 0.9409, + "step": 1439 + }, + { + "epoch": 3.786842105263158, + "step": 1439, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 10.105904579162598, + "learning_rate": 1.450114606355979e-07, + "loss": 1.0864, + "step": 1440 + }, + { + "epoch": 3.7894736842105265, + "step": 1440, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.792105263157895, + "grad_norm": 7.647596836090088, + "learning_rate": 1.41417347817856e-07, + "loss": 0.8865, + "step": 1441 + }, + { + "epoch": 3.792105263157895, + "step": 1441, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.794736842105263, + "grad_norm": 6.55628776550293, + "learning_rate": 1.3786801860977138e-07, + "loss": 1.1155, + "step": 1442 + }, + { + "epoch": 3.794736842105263, + "step": 1442, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.7973684210526315, + "grad_norm": 6.652747631072998, + "learning_rate": 1.3436348913453578e-07, + "loss": 0.877, + "step": 1443 + }, + { + "epoch": 3.7973684210526315, + "step": 1443, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.8, + "grad_norm": 5.4360032081604, + "learning_rate": 1.3090377531182364e-07, + "loss": 0.7061, + "step": 1444 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.7134296894073486, + "eval_max_score": 11.5625, + "eval_min_score": -13.25, + "eval_runtime": 151.4119, + "eval_samples_per_second": 18.737, + "eval_steps_per_second": 0.297, + "step": 1444 + }, + { + "epoch": 3.8, + "step": 1444, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.8026315789473686, + "grad_norm": 8.141218185424805, + "learning_rate": 1.2748889285774002e-07, + "loss": 0.7708, + "step": 1445 + }, + { + "epoch": 3.8026315789473686, + "step": 1445, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.805263157894737, + "grad_norm": 9.344526290893555, + "learning_rate": 1.241188572847296e-07, + "loss": 1.0728, + "step": 1446 + }, + { + "epoch": 3.805263157894737, + "step": 1446, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.807894736842105, + "grad_norm": 7.375489234924316, + "learning_rate": 1.2079368390152446e-07, + "loss": 0.8359, + "step": 1447 + }, + { + "epoch": 3.807894736842105, + "step": 1447, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8105263157894735, + "grad_norm": 7.120669841766357, + "learning_rate": 1.1751338781305854e-07, + "loss": 0.6541, + "step": 1448 + }, + { + "epoch": 3.8105263157894735, + "step": 1448, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.8131578947368423, + "grad_norm": 6.789575576782227, + "learning_rate": 1.142779839204089e-07, + "loss": 0.9011, + "step": 1449 + }, + { + "epoch": 3.8131578947368423, + "step": 1449, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 4.636354923248291, + "learning_rate": 1.110874869207268e-07, + "loss": 0.8013, + "step": 1450 + }, + { + "epoch": 3.8157894736842106, + "step": 1450, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.818421052631579, + "grad_norm": 5.722029685974121, + "learning_rate": 1.079419113071678e-07, + "loss": 0.8926, + "step": 1451 + }, + { + "epoch": 3.818421052631579, + "step": 1451, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.8210526315789473, + "grad_norm": 7.031396865844727, + "learning_rate": 1.0484127136882627e-07, + "loss": 1.0366, + "step": 1452 + }, + { + "epoch": 3.8210526315789473, + "step": 1452, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8236842105263156, + "grad_norm": 4.9844651222229, + "learning_rate": 1.0178558119067316e-07, + "loss": 0.7656, + "step": 1453 + }, + { + "epoch": 3.8236842105263156, + "step": 1453, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.8263157894736843, + "grad_norm": 8.02027416229248, + "learning_rate": 9.877485465349057e-08, + "loss": 1.1079, + "step": 1454 + }, + { + "epoch": 3.8263157894736843, + "step": 1454, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.8289473684210527, + "grad_norm": 6.716357707977295, + "learning_rate": 9.580910543380839e-08, + "loss": 0.8298, + "step": 1455 + }, + { + "epoch": 3.8289473684210527, + "step": 1455, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.831578947368421, + "grad_norm": 10.892178535461426, + "learning_rate": 9.288834700384441e-08, + "loss": 0.9429, + "step": 1456 + }, + { + "epoch": 3.831578947368421, + "step": 1456, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.8342105263157897, + "grad_norm": 10.586390495300293, + "learning_rate": 9.001259263143769e-08, + "loss": 0.9136, + "step": 1457 + }, + { + "epoch": 3.8342105263157897, + "step": 1457, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.836842105263158, + "grad_norm": 8.095308303833008, + "learning_rate": 8.718185537999302e-08, + "loss": 0.7913, + "step": 1458 + }, + { + "epoch": 3.836842105263158, + "step": 1458, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.8394736842105264, + "grad_norm": 5.505589008331299, + "learning_rate": 8.439614810842211e-08, + "loss": 0.9375, + "step": 1459 + }, + { + "epoch": 3.8394736842105264, + "step": 1459, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 5.2070231437683105, + "learning_rate": 8.165548347108254e-08, + "loss": 0.7466, + "step": 1460 + }, + { + "epoch": 3.8421052631578947, + "step": 1460, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.844736842105263, + "grad_norm": 4.662224769592285, + "learning_rate": 7.895987391771998e-08, + "loss": 0.6652, + "step": 1461 + }, + { + "epoch": 3.844736842105263, + "step": 1461, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.8473684210526313, + "grad_norm": 5.000279426574707, + "learning_rate": 7.630933169341493e-08, + "loss": 0.8708, + "step": 1462 + }, + { + "epoch": 3.8473684210526313, + "step": 1462, + "train_accuracy": 0.78125 + }, + { + "epoch": 3.85, + "grad_norm": 8.080231666564941, + "learning_rate": 7.370386883852165e-08, + "loss": 1.1318, + "step": 1463 + }, + { + "epoch": 3.85, + "step": 1463, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.8526315789473684, + "grad_norm": 10.200188636779785, + "learning_rate": 7.114349718862045e-08, + "loss": 0.906, + "step": 1464 + }, + { + "epoch": 3.8526315789473684, + "step": 1464, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.8552631578947367, + "grad_norm": 5.127605438232422, + "learning_rate": 6.862822837445882e-08, + "loss": 0.8738, + "step": 1465 + }, + { + "epoch": 3.8552631578947367, + "step": 1465, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.8578947368421055, + "grad_norm": 7.480038642883301, + "learning_rate": 6.61580738218992e-08, + "loss": 1.1709, + "step": 1466 + }, + { + "epoch": 3.8578947368421055, + "step": 1466, + "train_accuracy": 0.875 + }, + { + "epoch": 3.860526315789474, + "grad_norm": 5.7492852210998535, + "learning_rate": 6.373304475186803e-08, + "loss": 0.8499, + "step": 1467 + }, + { + "epoch": 3.860526315789474, + "step": 1467, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.863157894736842, + "grad_norm": 4.850841522216797, + "learning_rate": 6.13531521803068e-08, + "loss": 0.5709, + "step": 1468 + }, + { + "epoch": 3.863157894736842, + "step": 1468, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8657894736842104, + "grad_norm": 7.0631422996521, + "learning_rate": 5.9018406918118774e-08, + "loss": 1.0337, + "step": 1469 + }, + { + "epoch": 3.8657894736842104, + "step": 1469, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 6.850086212158203, + "learning_rate": 5.672881957111909e-08, + "loss": 0.877, + "step": 1470 + }, + { + "epoch": 3.8684210526315788, + "step": 1470, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8710526315789475, + "grad_norm": 5.157078742980957, + "learning_rate": 5.448440053999138e-08, + "loss": 0.6685, + "step": 1471 + }, + { + "epoch": 3.8710526315789475, + "step": 1471, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.873684210526316, + "grad_norm": 7.3163862228393555, + "learning_rate": 5.228516002023565e-08, + "loss": 0.8005, + "step": 1472 + }, + { + "epoch": 3.873684210526316, + "step": 1472, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.876315789473684, + "grad_norm": 6.442539215087891, + "learning_rate": 5.013110800212384e-08, + "loss": 0.8091, + "step": 1473 + }, + { + "epoch": 3.876315789473684, + "step": 1473, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.8789473684210525, + "grad_norm": 5.667739391326904, + "learning_rate": 4.802225427065654e-08, + "loss": 0.9438, + "step": 1474 + }, + { + "epoch": 3.8789473684210525, + "step": 1474, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 6.527713775634766, + "learning_rate": 4.5958608405515246e-08, + "loss": 0.8308, + "step": 1475 + }, + { + "epoch": 3.8815789473684212, + "step": 1475, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8842105263157896, + "grad_norm": 6.002090930938721, + "learning_rate": 4.394017978101905e-08, + "loss": 0.8254, + "step": 1476 + }, + { + "epoch": 3.8842105263157896, + "step": 1476, + "train_accuracy": 0.875 + }, + { + "epoch": 3.886842105263158, + "grad_norm": 13.156556129455566, + "learning_rate": 4.196697756608581e-08, + "loss": 0.9932, + "step": 1477 + }, + { + "epoch": 3.886842105263158, + "step": 1477, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.889473684210526, + "grad_norm": 7.244754791259766, + "learning_rate": 4.0039010724184366e-08, + "loss": 0.937, + "step": 1478 + }, + { + "epoch": 3.889473684210526, + "step": 1478, + "train_accuracy": 0.8125 + }, + { + "epoch": 3.8921052631578945, + "grad_norm": 5.323999404907227, + "learning_rate": 3.815628801330129e-08, + "loss": 0.8892, + "step": 1479 + }, + { + "epoch": 3.8921052631578945, + "step": 1479, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 6.918015003204346, + "learning_rate": 3.6318817985894206e-08, + "loss": 1.0068, + "step": 1480 + }, + { + "epoch": 3.8947368421052633, + "step": 1480, + "train_accuracy": 0.875 + }, + { + "epoch": 3.8973684210526316, + "grad_norm": 6.255131244659424, + "learning_rate": 3.45266089888574e-08, + "loss": 0.7729, + "step": 1481 + }, + { + "epoch": 3.8973684210526316, + "step": 1481, + "train_accuracy": 0.875 + }, + { + "epoch": 3.9, + "grad_norm": 7.700157165527344, + "learning_rate": 3.277966916347963e-08, + "loss": 0.8022, + "step": 1482 + }, + { + "epoch": 3.9, + "step": 1482, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9026315789473687, + "grad_norm": 6.339673042297363, + "learning_rate": 3.1078006445414145e-08, + "loss": 0.7759, + "step": 1483 + }, + { + "epoch": 3.9026315789473687, + "step": 1483, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.905263157894737, + "grad_norm": 5.586291790008545, + "learning_rate": 2.9421628564634265e-08, + "loss": 0.957, + "step": 1484 + }, + { + "epoch": 3.905263157894737, + "step": 1484, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9078947368421053, + "grad_norm": 6.360084533691406, + "learning_rate": 2.7810543045405648e-08, + "loss": 1.064, + "step": 1485 + }, + { + "epoch": 3.9078947368421053, + "step": 1485, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.9105263157894736, + "grad_norm": 5.326403617858887, + "learning_rate": 2.6244757206247417e-08, + "loss": 0.739, + "step": 1486 + }, + { + "epoch": 3.9105263157894736, + "step": 1486, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.913157894736842, + "grad_norm": 6.157349586486816, + "learning_rate": 2.4724278159898863e-08, + "loss": 0.8335, + "step": 1487 + }, + { + "epoch": 3.913157894736842, + "step": 1487, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.9157894736842103, + "grad_norm": 8.328683853149414, + "learning_rate": 2.3249112813291676e-08, + "loss": 1.0186, + "step": 1488 + }, + { + "epoch": 3.9157894736842103, + "step": 1488, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.918421052631579, + "grad_norm": 8.057480812072754, + "learning_rate": 2.1819267867512206e-08, + "loss": 0.8733, + "step": 1489 + }, + { + "epoch": 3.918421052631579, + "step": 1489, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 5.9042439460754395, + "learning_rate": 2.0434749817777045e-08, + "loss": 0.8164, + "step": 1490 + }, + { + "epoch": 3.9210526315789473, + "step": 1490, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.9236842105263157, + "grad_norm": 6.103878021240234, + "learning_rate": 1.9095564953398594e-08, + "loss": 0.9336, + "step": 1491 + }, + { + "epoch": 3.9236842105263157, + "step": 1491, + "train_accuracy": 0.9375 + }, + { + "epoch": 3.9263157894736844, + "grad_norm": 10.032098770141602, + "learning_rate": 1.780171935775954e-08, + "loss": 0.8813, + "step": 1492 + }, + { + "epoch": 3.9263157894736844, + "step": 1492, + "train_accuracy": 0.953125 + }, + { + "epoch": 3.9289473684210527, + "grad_norm": 5.073022365570068, + "learning_rate": 1.6553218908286207e-08, + "loss": 0.7488, + "step": 1493 + }, + { + "epoch": 3.9289473684210527, + "step": 1493, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.931578947368421, + "grad_norm": 5.193045139312744, + "learning_rate": 1.535006927641636e-08, + "loss": 0.8813, + "step": 1494 + }, + { + "epoch": 3.931578947368421, + "step": 1494, + "train_accuracy": 0.921875 + }, + { + "epoch": 3.9342105263157894, + "grad_norm": 7.310925006866455, + "learning_rate": 1.4192275927580323e-08, + "loss": 0.8572, + "step": 1495 + }, + { + "epoch": 3.9342105263157894, + "step": 1495, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9368421052631577, + "grad_norm": 5.533489227294922, + "learning_rate": 1.3079844121171026e-08, + "loss": 0.8955, + "step": 1496 + }, + { + "epoch": 3.9368421052631577, + "step": 1496, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.9394736842105265, + "grad_norm": 6.022321701049805, + "learning_rate": 1.2012778910521773e-08, + "loss": 0.8384, + "step": 1497 + }, + { + "epoch": 3.9394736842105265, + "step": 1497, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.942105263157895, + "grad_norm": 5.136116981506348, + "learning_rate": 1.099108514288627e-08, + "loss": 0.8408, + "step": 1498 + }, + { + "epoch": 3.942105263157895, + "step": 1498, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.944736842105263, + "grad_norm": 7.58246374130249, + "learning_rate": 1.0014767459409769e-08, + "loss": 0.9106, + "step": 1499 + }, + { + "epoch": 3.944736842105263, + "step": 1499, + "train_accuracy": 0.71875 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 7.496955394744873, + "learning_rate": 9.083830295114614e-09, + "loss": 1.0283, + "step": 1500 + }, + { + "epoch": 3.9473684210526314, + "step": 1500, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.95, + "grad_norm": 5.4122314453125, + "learning_rate": 8.198277878879168e-09, + "loss": 1.0747, + "step": 1501 + }, + { + "epoch": 3.95, + "step": 1501, + "train_accuracy": 0.90625 + }, + { + "epoch": 3.9526315789473685, + "grad_norm": 8.616875648498535, + "learning_rate": 7.3581142334122614e-09, + "loss": 0.7441, + "step": 1502 + }, + { + "epoch": 3.9526315789473685, + "step": 1502, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.955263157894737, + "grad_norm": 5.382155418395996, + "learning_rate": 6.563343175243209e-09, + "loss": 0.7217, + "step": 1503 + }, + { + "epoch": 3.955263157894737, + "step": 1503, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.957894736842105, + "grad_norm": 5.414727210998535, + "learning_rate": 5.813968314700713e-09, + "loss": 0.6335, + "step": 1504 + }, + { + "epoch": 3.957894736842105, + "step": 1504, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.9605263157894735, + "grad_norm": 6.143625259399414, + "learning_rate": 5.109993055896212e-09, + "loss": 1.1191, + "step": 1505 + }, + { + "epoch": 3.9605263157894735, + "step": 1505, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.963157894736842, + "grad_norm": 7.317595481872559, + "learning_rate": 4.451420596706113e-09, + "loss": 0.9097, + "step": 1506 + }, + { + "epoch": 3.963157894736842, + "step": 1506, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9657894736842105, + "grad_norm": 6.752077579498291, + "learning_rate": 3.838253928765135e-09, + "loss": 1.0115, + "step": 1507 + }, + { + "epoch": 3.9657894736842105, + "step": 1507, + "train_accuracy": 0.875 + }, + { + "epoch": 3.968421052631579, + "grad_norm": 6.5832953453063965, + "learning_rate": 3.2704958374440987e-09, + "loss": 0.6831, + "step": 1508 + }, + { + "epoch": 3.968421052631579, + "step": 1508, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.9710526315789476, + "grad_norm": 7.169082164764404, + "learning_rate": 2.7481489018410525e-09, + "loss": 0.8301, + "step": 1509 + }, + { + "epoch": 3.9710526315789476, + "step": 1509, + "train_accuracy": 0.828125 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 6.229421615600586, + "learning_rate": 2.2712154947701627e-09, + "loss": 1.0732, + "step": 1510 + }, + { + "epoch": 3.973684210526316, + "step": 1510, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.9763157894736842, + "grad_norm": 9.258220672607422, + "learning_rate": 1.8396977827495056e-09, + "loss": 0.8176, + "step": 1511 + }, + { + "epoch": 3.9763157894736842, + "step": 1511, + "train_accuracy": 0.859375 + }, + { + "epoch": 3.9789473684210526, + "grad_norm": 6.452820777893066, + "learning_rate": 1.453597725992184e-09, + "loss": 1.0376, + "step": 1512 + }, + { + "epoch": 3.9789473684210526, + "step": 1512, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.981578947368421, + "grad_norm": 6.723094463348389, + "learning_rate": 1.1129170783974463e-09, + "loss": 0.8823, + "step": 1513 + }, + { + "epoch": 3.981578947368421, + "step": 1513, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.984210526315789, + "grad_norm": 5.933262825012207, + "learning_rate": 8.176573875406935e-10, + "loss": 0.959, + "step": 1514 + }, + { + "epoch": 3.984210526315789, + "step": 1514, + "train_accuracy": 0.796875 + }, + { + "epoch": 3.986842105263158, + "grad_norm": 6.924864768981934, + "learning_rate": 5.67819994670149e-10, + "loss": 0.8926, + "step": 1515 + }, + { + "epoch": 3.986842105263158, + "step": 1515, + "train_accuracy": 0.890625 + }, + { + "epoch": 3.9894736842105263, + "grad_norm": 9.446735382080078, + "learning_rate": 3.634060346957569e-10, + "loss": 0.9897, + "step": 1516 + }, + { + "epoch": 3.9894736842105263, + "step": 1516, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9921052631578946, + "grad_norm": 6.521078586578369, + "learning_rate": 2.0441643618918095e-10, + "loss": 1.0225, + "step": 1517 + }, + { + "epoch": 3.9921052631578946, + "step": 1517, + "train_accuracy": 0.84375 + }, + { + "epoch": 3.9947368421052634, + "grad_norm": 9.099228858947754, + "learning_rate": 9.085192137714415e-11, + "loss": 1.061, + "step": 1518 + }, + { + "epoch": 3.9947368421052634, + "step": 1518, + "train_accuracy": 0.875 + }, + { + "epoch": 3.9973684210526317, + "grad_norm": 5.087467193603516, + "learning_rate": 2.2713006138097572e-11, + "loss": 0.8418, + "step": 1519 + }, + { + "epoch": 3.9973684210526317, + "step": 1519, + "train_accuracy": 0.9375 + }, + { + "epoch": 4.0, + "grad_norm": 4.034855842590332, + "learning_rate": 0.0, + "loss": 0.6743, + "step": 1520 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.7158970832824707, + "eval_max_score": 11.5, + "eval_min_score": -13.125, + "eval_runtime": 151.4884, + "eval_samples_per_second": 18.728, + "eval_steps_per_second": 0.297, + "step": 1520 + }, + { + "epoch": 4.0, + "step": 1520, + "total_flos": 1.2414554645001667e+18, + "train_loss": 1.1865725868626644, + "train_runtime": 25390.1743, + "train_samples_per_second": 3.822, + "train_steps_per_second": 0.06 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.7173070311546326, + "eval_max_score": 10.875, + "eval_min_score": -13.625, + "eval_runtime": 149.5213, + "eval_samples_per_second": 18.974, + "eval_steps_per_second": 0.301, + "step": 1520 + } + ], + "logging_steps": 1.0, + "max_steps": 1520, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2414554645001667e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}