{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11865211200759374, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023730422401518748, "grad_norm": 0.19874855875968933, "learning_rate": 9.980000000000001e-06, "loss": 2.3055, "step": 1 }, { "epoch": 0.00047460844803037496, "grad_norm": 0.3101324141025543, "learning_rate": 9.960000000000001e-06, "loss": 0.8548, "step": 2 }, { "epoch": 0.0007119126720455624, "grad_norm": 0.18851338326931, "learning_rate": 9.940000000000001e-06, "loss": 1.3783, "step": 3 }, { "epoch": 0.0009492168960607499, "grad_norm": 0.5474942326545715, "learning_rate": 9.920000000000002e-06, "loss": 2.5833, "step": 4 }, { "epoch": 0.0011865211200759373, "grad_norm": 0.28883224725723267, "learning_rate": 9.9e-06, "loss": 2.4045, "step": 5 }, { "epoch": 0.0014238253440911248, "grad_norm": 0.4798765778541565, "learning_rate": 9.88e-06, "loss": 1.2589, "step": 6 }, { "epoch": 0.0016611295681063123, "grad_norm": 0.48112648725509644, "learning_rate": 9.86e-06, "loss": 2.9112, "step": 7 }, { "epoch": 0.0018984337921214998, "grad_norm": 0.8760956525802612, "learning_rate": 9.84e-06, "loss": 2.3175, "step": 8 }, { "epoch": 0.0021357380161366873, "grad_norm": 0.44891074299812317, "learning_rate": 9.820000000000001e-06, "loss": 1.3196, "step": 9 }, { "epoch": 0.0023730422401518746, "grad_norm": 0.3395187556743622, "learning_rate": 9.800000000000001e-06, "loss": 1.588, "step": 10 }, { "epoch": 0.0026103464641670624, "grad_norm": 0.587505578994751, "learning_rate": 9.780000000000001e-06, "loss": 1.2509, "step": 11 }, { "epoch": 0.0028476506881822496, "grad_norm": 0.4817255437374115, "learning_rate": 9.760000000000001e-06, "loss": 1.3541, "step": 12 }, { "epoch": 0.003084954912197437, "grad_norm": 0.312285840511322, "learning_rate": 9.74e-06, "loss": 1.8525, "step": 13 }, { "epoch": 0.0033222591362126247, "grad_norm": 0.7621486783027649, "learning_rate": 9.72e-06, "loss": 1.3197, "step": 14 }, { "epoch": 0.003559563360227812, "grad_norm": 0.5916957259178162, "learning_rate": 9.7e-06, "loss": 1.0805, "step": 15 }, { "epoch": 0.0037968675842429997, "grad_norm": 0.7447299361228943, "learning_rate": 9.68e-06, "loss": 2.5366, "step": 16 }, { "epoch": 0.004034171808258187, "grad_norm": 0.38069915771484375, "learning_rate": 9.66e-06, "loss": 1.6038, "step": 17 }, { "epoch": 0.004271476032273375, "grad_norm": 0.4375569224357605, "learning_rate": 9.640000000000001e-06, "loss": 2.8857, "step": 18 }, { "epoch": 0.004508780256288562, "grad_norm": 0.2266787886619568, "learning_rate": 9.620000000000001e-06, "loss": 2.045, "step": 19 }, { "epoch": 0.004746084480303749, "grad_norm": 0.3714630901813507, "learning_rate": 9.600000000000001e-06, "loss": 1.6152, "step": 20 }, { "epoch": 0.0049833887043189366, "grad_norm": 0.45660167932510376, "learning_rate": 9.58e-06, "loss": 2.1712, "step": 21 }, { "epoch": 0.005220692928334125, "grad_norm": 0.3804182708263397, "learning_rate": 9.56e-06, "loss": 1.7326, "step": 22 }, { "epoch": 0.005457997152349312, "grad_norm": 0.5945218205451965, "learning_rate": 9.54e-06, "loss": 2.8499, "step": 23 }, { "epoch": 0.005695301376364499, "grad_norm": 0.49986812472343445, "learning_rate": 9.52e-06, "loss": 1.887, "step": 24 }, { "epoch": 0.005932605600379687, "grad_norm": 0.41545894742012024, "learning_rate": 9.5e-06, "loss": 1.0656, "step": 25 }, { "epoch": 0.006169909824394874, "grad_norm": 0.6399343609809875, "learning_rate": 9.48e-06, "loss": 1.8672, "step": 26 }, { "epoch": 0.006407214048410062, "grad_norm": 0.3765011727809906, "learning_rate": 9.460000000000001e-06, "loss": 0.9629, "step": 27 }, { "epoch": 0.006644518272425249, "grad_norm": 0.48257043957710266, "learning_rate": 9.440000000000001e-06, "loss": 1.9923, "step": 28 }, { "epoch": 0.006881822496440437, "grad_norm": 0.5844452977180481, "learning_rate": 9.42e-06, "loss": 2.0896, "step": 29 }, { "epoch": 0.007119126720455624, "grad_norm": 0.6092790961265564, "learning_rate": 9.4e-06, "loss": 2.4446, "step": 30 }, { "epoch": 0.007356430944470811, "grad_norm": 0.6862596869468689, "learning_rate": 9.38e-06, "loss": 1.9678, "step": 31 }, { "epoch": 0.007593735168485999, "grad_norm": 0.5207828879356384, "learning_rate": 9.360000000000002e-06, "loss": 1.611, "step": 32 }, { "epoch": 0.007831039392501186, "grad_norm": 0.5294057130813599, "learning_rate": 9.340000000000002e-06, "loss": 1.8447, "step": 33 }, { "epoch": 0.008068343616516375, "grad_norm": 0.4702499210834503, "learning_rate": 9.32e-06, "loss": 2.035, "step": 34 }, { "epoch": 0.008305647840531562, "grad_norm": 0.5244491696357727, "learning_rate": 9.3e-06, "loss": 1.4395, "step": 35 }, { "epoch": 0.00854295206454675, "grad_norm": 0.644615650177002, "learning_rate": 9.280000000000001e-06, "loss": 1.7666, "step": 36 }, { "epoch": 0.008780256288561937, "grad_norm": 0.5552751421928406, "learning_rate": 9.260000000000001e-06, "loss": 1.5753, "step": 37 }, { "epoch": 0.009017560512577124, "grad_norm": 0.2882249057292938, "learning_rate": 9.240000000000001e-06, "loss": 1.5176, "step": 38 }, { "epoch": 0.009254864736592311, "grad_norm": 0.38219153881073, "learning_rate": 9.220000000000002e-06, "loss": 1.7808, "step": 39 }, { "epoch": 0.009492168960607499, "grad_norm": 0.6532744765281677, "learning_rate": 9.200000000000002e-06, "loss": 1.8763, "step": 40 }, { "epoch": 0.009729473184622686, "grad_norm": 0.7350605726242065, "learning_rate": 9.180000000000002e-06, "loss": 2.5947, "step": 41 }, { "epoch": 0.009966777408637873, "grad_norm": 0.44435158371925354, "learning_rate": 9.16e-06, "loss": 2.4829, "step": 42 }, { "epoch": 0.01020408163265306, "grad_norm": 0.3937893509864807, "learning_rate": 9.14e-06, "loss": 2.106, "step": 43 }, { "epoch": 0.01044138585666825, "grad_norm": 0.8209621906280518, "learning_rate": 9.12e-06, "loss": 1.8711, "step": 44 }, { "epoch": 0.010678690080683437, "grad_norm": 1.220982313156128, "learning_rate": 9.100000000000001e-06, "loss": 2.4524, "step": 45 }, { "epoch": 0.010915994304698624, "grad_norm": 0.4203420877456665, "learning_rate": 9.080000000000001e-06, "loss": 1.775, "step": 46 }, { "epoch": 0.011153298528713811, "grad_norm": 0.28424403071403503, "learning_rate": 9.060000000000001e-06, "loss": 1.7594, "step": 47 }, { "epoch": 0.011390602752728999, "grad_norm": 0.7268348336219788, "learning_rate": 9.040000000000002e-06, "loss": 0.9244, "step": 48 }, { "epoch": 0.011627906976744186, "grad_norm": 0.4843103885650635, "learning_rate": 9.020000000000002e-06, "loss": 1.9955, "step": 49 }, { "epoch": 0.011865211200759373, "grad_norm": 0.3405970335006714, "learning_rate": 9e-06, "loss": 1.6384, "step": 50 }, { "epoch": 0.01210251542477456, "grad_norm": 1.1265524625778198, "learning_rate": 8.98e-06, "loss": 1.7042, "step": 51 }, { "epoch": 0.012339819648789748, "grad_norm": 0.13442721962928772, "learning_rate": 8.96e-06, "loss": 1.3843, "step": 52 }, { "epoch": 0.012577123872804937, "grad_norm": 0.4972473382949829, "learning_rate": 8.94e-06, "loss": 0.8437, "step": 53 }, { "epoch": 0.012814428096820124, "grad_norm": 0.28667712211608887, "learning_rate": 8.920000000000001e-06, "loss": 0.9238, "step": 54 }, { "epoch": 0.013051732320835311, "grad_norm": 0.2998306155204773, "learning_rate": 8.900000000000001e-06, "loss": 2.2332, "step": 55 }, { "epoch": 0.013289036544850499, "grad_norm": 0.6133009195327759, "learning_rate": 8.880000000000001e-06, "loss": 2.2289, "step": 56 }, { "epoch": 0.013526340768865686, "grad_norm": 0.48921748995780945, "learning_rate": 8.860000000000002e-06, "loss": 0.6597, "step": 57 }, { "epoch": 0.013763644992880873, "grad_norm": 0.8161422610282898, "learning_rate": 8.84e-06, "loss": 1.3381, "step": 58 }, { "epoch": 0.01400094921689606, "grad_norm": 0.4998335540294647, "learning_rate": 8.82e-06, "loss": 1.5505, "step": 59 }, { "epoch": 0.014238253440911248, "grad_norm": 0.8967633843421936, "learning_rate": 8.8e-06, "loss": 1.2807, "step": 60 }, { "epoch": 0.014475557664926435, "grad_norm": 0.8106015920639038, "learning_rate": 8.78e-06, "loss": 1.2792, "step": 61 }, { "epoch": 0.014712861888941622, "grad_norm": 0.6022857427597046, "learning_rate": 8.76e-06, "loss": 2.023, "step": 62 }, { "epoch": 0.014950166112956811, "grad_norm": 0.6330555081367493, "learning_rate": 8.740000000000001e-06, "loss": 0.931, "step": 63 }, { "epoch": 0.015187470336971999, "grad_norm": 0.6975427269935608, "learning_rate": 8.720000000000001e-06, "loss": 1.5096, "step": 64 }, { "epoch": 0.015424774560987186, "grad_norm": 0.9109779596328735, "learning_rate": 8.700000000000001e-06, "loss": 1.4295, "step": 65 }, { "epoch": 0.01566207878500237, "grad_norm": 0.5679107904434204, "learning_rate": 8.68e-06, "loss": 1.6368, "step": 66 }, { "epoch": 0.01589938300901756, "grad_norm": 0.4659746587276459, "learning_rate": 8.66e-06, "loss": 1.292, "step": 67 }, { "epoch": 0.01613668723303275, "grad_norm": 0.6395153403282166, "learning_rate": 8.64e-06, "loss": 2.0611, "step": 68 }, { "epoch": 0.016373991457047935, "grad_norm": 0.37920185923576355, "learning_rate": 8.62e-06, "loss": 1.1706, "step": 69 }, { "epoch": 0.016611295681063124, "grad_norm": 0.2950354218482971, "learning_rate": 8.6e-06, "loss": 1.4619, "step": 70 }, { "epoch": 0.01684859990507831, "grad_norm": 0.664068877696991, "learning_rate": 8.580000000000001e-06, "loss": 2.3014, "step": 71 }, { "epoch": 0.0170859041290935, "grad_norm": 0.4015841782093048, "learning_rate": 8.560000000000001e-06, "loss": 2.6305, "step": 72 }, { "epoch": 0.017323208353108684, "grad_norm": 0.8365151882171631, "learning_rate": 8.540000000000001e-06, "loss": 2.4402, "step": 73 }, { "epoch": 0.017560512577123873, "grad_norm": 0.4587593376636505, "learning_rate": 8.52e-06, "loss": 1.1948, "step": 74 }, { "epoch": 0.01779781680113906, "grad_norm": 0.5275629162788391, "learning_rate": 8.5e-06, "loss": 1.2889, "step": 75 }, { "epoch": 0.018035121025154248, "grad_norm": 0.5322698354721069, "learning_rate": 8.48e-06, "loss": 2.5775, "step": 76 }, { "epoch": 0.018272425249169437, "grad_norm": 0.7846812009811401, "learning_rate": 8.46e-06, "loss": 1.2905, "step": 77 }, { "epoch": 0.018509729473184623, "grad_norm": 0.4759507179260254, "learning_rate": 8.44e-06, "loss": 1.1162, "step": 78 }, { "epoch": 0.01874703369719981, "grad_norm": 0.596358597278595, "learning_rate": 8.42e-06, "loss": 1.6618, "step": 79 }, { "epoch": 0.018984337921214997, "grad_norm": 0.5133060812950134, "learning_rate": 8.400000000000001e-06, "loss": 1.9102, "step": 80 }, { "epoch": 0.019221642145230186, "grad_norm": 0.8989421129226685, "learning_rate": 8.380000000000001e-06, "loss": 2.1852, "step": 81 }, { "epoch": 0.01945894636924537, "grad_norm": 0.4702143669128418, "learning_rate": 8.36e-06, "loss": 1.2668, "step": 82 }, { "epoch": 0.01969625059326056, "grad_norm": 1.2159205675125122, "learning_rate": 8.34e-06, "loss": 1.1101, "step": 83 }, { "epoch": 0.019933554817275746, "grad_norm": 0.5116935968399048, "learning_rate": 8.32e-06, "loss": 1.222, "step": 84 }, { "epoch": 0.020170859041290935, "grad_norm": 1.3710129261016846, "learning_rate": 8.3e-06, "loss": 3.1337, "step": 85 }, { "epoch": 0.02040816326530612, "grad_norm": 0.7467148303985596, "learning_rate": 8.28e-06, "loss": 2.1887, "step": 86 }, { "epoch": 0.02064546748932131, "grad_norm": 0.6403272151947021, "learning_rate": 8.26e-06, "loss": 1.2013, "step": 87 }, { "epoch": 0.0208827717133365, "grad_norm": 0.4310401678085327, "learning_rate": 8.24e-06, "loss": 1.4044, "step": 88 }, { "epoch": 0.021120075937351684, "grad_norm": 0.6710259318351746, "learning_rate": 8.220000000000001e-06, "loss": 2.5627, "step": 89 }, { "epoch": 0.021357380161366873, "grad_norm": 0.5828210115432739, "learning_rate": 8.2e-06, "loss": 1.7674, "step": 90 }, { "epoch": 0.02159468438538206, "grad_norm": 0.5417571663856506, "learning_rate": 8.18e-06, "loss": 2.2125, "step": 91 }, { "epoch": 0.021831988609397248, "grad_norm": 0.5095130205154419, "learning_rate": 8.16e-06, "loss": 3.1429, "step": 92 }, { "epoch": 0.022069292833412434, "grad_norm": 0.2329273670911789, "learning_rate": 8.14e-06, "loss": 1.84, "step": 93 }, { "epoch": 0.022306597057427623, "grad_norm": 0.4823262691497803, "learning_rate": 8.120000000000002e-06, "loss": 1.7104, "step": 94 }, { "epoch": 0.022543901281442808, "grad_norm": 0.6276722550392151, "learning_rate": 8.1e-06, "loss": 0.7729, "step": 95 }, { "epoch": 0.022781205505457997, "grad_norm": 0.642092764377594, "learning_rate": 8.08e-06, "loss": 2.1965, "step": 96 }, { "epoch": 0.023018509729473186, "grad_norm": 0.34217798709869385, "learning_rate": 8.06e-06, "loss": 0.9981, "step": 97 }, { "epoch": 0.023255813953488372, "grad_norm": 0.47839802503585815, "learning_rate": 8.040000000000001e-06, "loss": 1.594, "step": 98 }, { "epoch": 0.02349311817750356, "grad_norm": 1.0722686052322388, "learning_rate": 8.020000000000001e-06, "loss": 2.0473, "step": 99 }, { "epoch": 0.023730422401518746, "grad_norm": 0.7368118166923523, "learning_rate": 8.000000000000001e-06, "loss": 2.4467, "step": 100 }, { "epoch": 0.023967726625533935, "grad_norm": 1.064618706703186, "learning_rate": 7.980000000000002e-06, "loss": 1.9095, "step": 101 }, { "epoch": 0.02420503084954912, "grad_norm": 0.43928399682044983, "learning_rate": 7.960000000000002e-06, "loss": 1.7245, "step": 102 }, { "epoch": 0.02444233507356431, "grad_norm": 0.6588628888130188, "learning_rate": 7.94e-06, "loss": 0.8569, "step": 103 }, { "epoch": 0.024679639297579496, "grad_norm": 0.5403575301170349, "learning_rate": 7.92e-06, "loss": 1.2521, "step": 104 }, { "epoch": 0.024916943521594685, "grad_norm": 0.6686379313468933, "learning_rate": 7.9e-06, "loss": 1.6869, "step": 105 }, { "epoch": 0.025154247745609874, "grad_norm": 0.3803173005580902, "learning_rate": 7.88e-06, "loss": 2.1051, "step": 106 }, { "epoch": 0.02539155196962506, "grad_norm": 0.12461218237876892, "learning_rate": 7.860000000000001e-06, "loss": 1.5027, "step": 107 }, { "epoch": 0.025628856193640248, "grad_norm": 0.47309207916259766, "learning_rate": 7.840000000000001e-06, "loss": 1.156, "step": 108 }, { "epoch": 0.025866160417655434, "grad_norm": 0.5996202826499939, "learning_rate": 7.820000000000001e-06, "loss": 1.8978, "step": 109 }, { "epoch": 0.026103464641670623, "grad_norm": 1.3080424070358276, "learning_rate": 7.800000000000002e-06, "loss": 1.2111, "step": 110 }, { "epoch": 0.02634076886568581, "grad_norm": 0.5753285884857178, "learning_rate": 7.78e-06, "loss": 1.2555, "step": 111 }, { "epoch": 0.026578073089700997, "grad_norm": 0.4644084870815277, "learning_rate": 7.76e-06, "loss": 0.5773, "step": 112 }, { "epoch": 0.026815377313716183, "grad_norm": 0.7260199189186096, "learning_rate": 7.74e-06, "loss": 1.3806, "step": 113 }, { "epoch": 0.027052681537731372, "grad_norm": 3.247457265853882, "learning_rate": 7.72e-06, "loss": 3.6028, "step": 114 }, { "epoch": 0.027289985761746557, "grad_norm": 0.6819869875907898, "learning_rate": 7.7e-06, "loss": 1.3892, "step": 115 }, { "epoch": 0.027527289985761746, "grad_norm": 0.5939836502075195, "learning_rate": 7.680000000000001e-06, "loss": 0.8986, "step": 116 }, { "epoch": 0.027764594209776935, "grad_norm": 0.9432902336120605, "learning_rate": 7.660000000000001e-06, "loss": 1.9631, "step": 117 }, { "epoch": 0.02800189843379212, "grad_norm": 0.7726979851722717, "learning_rate": 7.640000000000001e-06, "loss": 1.8061, "step": 118 }, { "epoch": 0.02823920265780731, "grad_norm": 1.1715900897979736, "learning_rate": 7.620000000000001e-06, "loss": 1.6768, "step": 119 }, { "epoch": 0.028476506881822496, "grad_norm": 0.422097772359848, "learning_rate": 7.600000000000001e-06, "loss": 1.2938, "step": 120 }, { "epoch": 0.028713811105837685, "grad_norm": 0.4177633225917816, "learning_rate": 7.58e-06, "loss": 2.4758, "step": 121 }, { "epoch": 0.02895111532985287, "grad_norm": 0.8148231506347656, "learning_rate": 7.5600000000000005e-06, "loss": 1.6307, "step": 122 }, { "epoch": 0.02918841955386806, "grad_norm": 0.36970993876457214, "learning_rate": 7.540000000000001e-06, "loss": 1.923, "step": 123 }, { "epoch": 0.029425723777883245, "grad_norm": 0.9572102427482605, "learning_rate": 7.520000000000001e-06, "loss": 2.4889, "step": 124 }, { "epoch": 0.029663028001898434, "grad_norm": 1.023463487625122, "learning_rate": 7.500000000000001e-06, "loss": 0.753, "step": 125 }, { "epoch": 0.029900332225913623, "grad_norm": 0.7689042091369629, "learning_rate": 7.48e-06, "loss": 1.8955, "step": 126 }, { "epoch": 0.03013763644992881, "grad_norm": 0.8892776370048523, "learning_rate": 7.4600000000000006e-06, "loss": 1.907, "step": 127 }, { "epoch": 0.030374940673943997, "grad_norm": 0.6348279118537903, "learning_rate": 7.440000000000001e-06, "loss": 0.9907, "step": 128 }, { "epoch": 0.030612244897959183, "grad_norm": 0.8271303772926331, "learning_rate": 7.420000000000001e-06, "loss": 0.8349, "step": 129 }, { "epoch": 0.030849549121974372, "grad_norm": 0.567034900188446, "learning_rate": 7.4e-06, "loss": 1.5324, "step": 130 }, { "epoch": 0.031086853345989558, "grad_norm": 0.7354723215103149, "learning_rate": 7.3800000000000005e-06, "loss": 1.8572, "step": 131 }, { "epoch": 0.03132415757000474, "grad_norm": 0.7156671285629272, "learning_rate": 7.360000000000001e-06, "loss": 1.6312, "step": 132 }, { "epoch": 0.03156146179401993, "grad_norm": 0.36890867352485657, "learning_rate": 7.340000000000001e-06, "loss": 0.8384, "step": 133 }, { "epoch": 0.03179876601803512, "grad_norm": 0.6410567760467529, "learning_rate": 7.32e-06, "loss": 1.1728, "step": 134 }, { "epoch": 0.03203607024205031, "grad_norm": 0.5395240187644958, "learning_rate": 7.3e-06, "loss": 1.3754, "step": 135 }, { "epoch": 0.0322733744660655, "grad_norm": 0.9049538373947144, "learning_rate": 7.280000000000001e-06, "loss": 1.1323, "step": 136 }, { "epoch": 0.03251067869008068, "grad_norm": 0.7486042380332947, "learning_rate": 7.260000000000001e-06, "loss": 2.6436, "step": 137 }, { "epoch": 0.03274798291409587, "grad_norm": 0.2955069839954376, "learning_rate": 7.24e-06, "loss": 1.6258, "step": 138 }, { "epoch": 0.03298528713811106, "grad_norm": 0.5585281848907471, "learning_rate": 7.22e-06, "loss": 1.0526, "step": 139 }, { "epoch": 0.03322259136212625, "grad_norm": 0.6172239780426025, "learning_rate": 7.2000000000000005e-06, "loss": 2.0741, "step": 140 }, { "epoch": 0.03345989558614143, "grad_norm": 1.0919370651245117, "learning_rate": 7.180000000000001e-06, "loss": 2.6546, "step": 141 }, { "epoch": 0.03369719981015662, "grad_norm": 0.6277972459793091, "learning_rate": 7.16e-06, "loss": 1.4516, "step": 142 }, { "epoch": 0.03393450403417181, "grad_norm": 1.262568712234497, "learning_rate": 7.14e-06, "loss": 1.3782, "step": 143 }, { "epoch": 0.034171808258187, "grad_norm": 0.9131320714950562, "learning_rate": 7.1200000000000004e-06, "loss": 1.3814, "step": 144 }, { "epoch": 0.03440911248220219, "grad_norm": 0.4619258642196655, "learning_rate": 7.100000000000001e-06, "loss": 2.8978, "step": 145 }, { "epoch": 0.03464641670621737, "grad_norm": 0.17557378113269806, "learning_rate": 7.08e-06, "loss": 1.5882, "step": 146 }, { "epoch": 0.03488372093023256, "grad_norm": 0.6798960566520691, "learning_rate": 7.06e-06, "loss": 2.5057, "step": 147 }, { "epoch": 0.03512102515424775, "grad_norm": 0.7229902744293213, "learning_rate": 7.04e-06, "loss": 1.195, "step": 148 }, { "epoch": 0.035358329378262936, "grad_norm": 0.5356243848800659, "learning_rate": 7.0200000000000006e-06, "loss": 2.3255, "step": 149 }, { "epoch": 0.03559563360227812, "grad_norm": 0.735577404499054, "learning_rate": 7e-06, "loss": 0.5763, "step": 150 }, { "epoch": 0.03583293782629331, "grad_norm": 0.7191525101661682, "learning_rate": 6.98e-06, "loss": 1.5225, "step": 151 }, { "epoch": 0.036070242050308496, "grad_norm": 0.5553634166717529, "learning_rate": 6.96e-06, "loss": 2.0026, "step": 152 }, { "epoch": 0.036307546274323685, "grad_norm": 0.9029828310012817, "learning_rate": 6.9400000000000005e-06, "loss": 0.8248, "step": 153 }, { "epoch": 0.036544850498338874, "grad_norm": 0.8566996455192566, "learning_rate": 6.92e-06, "loss": 1.9291, "step": 154 }, { "epoch": 0.036782154722354056, "grad_norm": 0.677769124507904, "learning_rate": 6.9e-06, "loss": 1.5933, "step": 155 }, { "epoch": 0.037019458946369245, "grad_norm": 0.5915787220001221, "learning_rate": 6.88e-06, "loss": 0.9965, "step": 156 }, { "epoch": 0.037256763170384434, "grad_norm": 0.511048436164856, "learning_rate": 6.860000000000001e-06, "loss": 1.8256, "step": 157 }, { "epoch": 0.03749406739439962, "grad_norm": 0.49128812551498413, "learning_rate": 6.8400000000000014e-06, "loss": 1.6187, "step": 158 }, { "epoch": 0.037731371618414805, "grad_norm": 0.554414689540863, "learning_rate": 6.820000000000001e-06, "loss": 1.1991, "step": 159 }, { "epoch": 0.037968675842429994, "grad_norm": 0.9458298683166504, "learning_rate": 6.800000000000001e-06, "loss": 2.2793, "step": 160 }, { "epoch": 0.03820598006644518, "grad_norm": 2.0072669982910156, "learning_rate": 6.780000000000001e-06, "loss": 1.89, "step": 161 }, { "epoch": 0.03844328429046037, "grad_norm": 1.146154761314392, "learning_rate": 6.760000000000001e-06, "loss": 2.607, "step": 162 }, { "epoch": 0.038680588514475554, "grad_norm": 0.6168065667152405, "learning_rate": 6.740000000000001e-06, "loss": 1.459, "step": 163 }, { "epoch": 0.03891789273849074, "grad_norm": 0.4497089684009552, "learning_rate": 6.720000000000001e-06, "loss": 1.2466, "step": 164 }, { "epoch": 0.03915519696250593, "grad_norm": 0.7007705569267273, "learning_rate": 6.700000000000001e-06, "loss": 2.4729, "step": 165 }, { "epoch": 0.03939250118652112, "grad_norm": 0.8613377809524536, "learning_rate": 6.680000000000001e-06, "loss": 2.2219, "step": 166 }, { "epoch": 0.03962980541053631, "grad_norm": 0.539036750793457, "learning_rate": 6.660000000000001e-06, "loss": 0.5526, "step": 167 }, { "epoch": 0.03986710963455149, "grad_norm": 0.6085006594657898, "learning_rate": 6.640000000000001e-06, "loss": 2.1238, "step": 168 }, { "epoch": 0.04010441385856668, "grad_norm": 1.8861020803451538, "learning_rate": 6.620000000000001e-06, "loss": 2.6934, "step": 169 }, { "epoch": 0.04034171808258187, "grad_norm": 0.5803074240684509, "learning_rate": 6.600000000000001e-06, "loss": 1.2254, "step": 170 }, { "epoch": 0.04057902230659706, "grad_norm": 1.0310598611831665, "learning_rate": 6.5800000000000005e-06, "loss": 1.1757, "step": 171 }, { "epoch": 0.04081632653061224, "grad_norm": 0.9412042498588562, "learning_rate": 6.560000000000001e-06, "loss": 1.3748, "step": 172 }, { "epoch": 0.04105363075462743, "grad_norm": 0.6556461453437805, "learning_rate": 6.540000000000001e-06, "loss": 1.0809, "step": 173 }, { "epoch": 0.04129093497864262, "grad_norm": 0.4990858733654022, "learning_rate": 6.520000000000001e-06, "loss": 1.2934, "step": 174 }, { "epoch": 0.04152823920265781, "grad_norm": 0.6699053645133972, "learning_rate": 6.5000000000000004e-06, "loss": 2.4814, "step": 175 }, { "epoch": 0.041765543426673, "grad_norm": 0.8843134641647339, "learning_rate": 6.480000000000001e-06, "loss": 1.9615, "step": 176 }, { "epoch": 0.04200284765068818, "grad_norm": 0.6757798790931702, "learning_rate": 6.460000000000001e-06, "loss": 1.7584, "step": 177 }, { "epoch": 0.04224015187470337, "grad_norm": 0.46641185879707336, "learning_rate": 6.440000000000001e-06, "loss": 1.3054, "step": 178 }, { "epoch": 0.04247745609871856, "grad_norm": 0.7104260325431824, "learning_rate": 6.42e-06, "loss": 1.1761, "step": 179 }, { "epoch": 0.04271476032273375, "grad_norm": 0.3888971209526062, "learning_rate": 6.4000000000000006e-06, "loss": 2.2055, "step": 180 }, { "epoch": 0.04295206454674893, "grad_norm": 1.1661548614501953, "learning_rate": 6.380000000000001e-06, "loss": 1.1329, "step": 181 }, { "epoch": 0.04318936877076412, "grad_norm": 0.40143099427223206, "learning_rate": 6.360000000000001e-06, "loss": 1.0135, "step": 182 }, { "epoch": 0.04342667299477931, "grad_norm": 0.692574679851532, "learning_rate": 6.34e-06, "loss": 1.3553, "step": 183 }, { "epoch": 0.043663977218794496, "grad_norm": 0.6210284233093262, "learning_rate": 6.3200000000000005e-06, "loss": 1.4226, "step": 184 }, { "epoch": 0.043901281442809685, "grad_norm": 0.6218913197517395, "learning_rate": 6.300000000000001e-06, "loss": 1.8475, "step": 185 }, { "epoch": 0.04413858566682487, "grad_norm": 0.6004898548126221, "learning_rate": 6.280000000000001e-06, "loss": 1.8829, "step": 186 }, { "epoch": 0.044375889890840056, "grad_norm": 0.8992873430252075, "learning_rate": 6.26e-06, "loss": 1.0599, "step": 187 }, { "epoch": 0.044613194114855245, "grad_norm": 0.5328947305679321, "learning_rate": 6.24e-06, "loss": 0.7189, "step": 188 }, { "epoch": 0.044850498338870434, "grad_norm": 0.5015738010406494, "learning_rate": 6.220000000000001e-06, "loss": 2.0798, "step": 189 }, { "epoch": 0.045087802562885616, "grad_norm": 0.47242438793182373, "learning_rate": 6.200000000000001e-06, "loss": 1.3423, "step": 190 }, { "epoch": 0.045325106786900805, "grad_norm": 0.9236852526664734, "learning_rate": 6.18e-06, "loss": 1.2769, "step": 191 }, { "epoch": 0.045562411010915994, "grad_norm": 0.47883141040802, "learning_rate": 6.16e-06, "loss": 1.5209, "step": 192 }, { "epoch": 0.04579971523493118, "grad_norm": 0.760746955871582, "learning_rate": 6.1400000000000005e-06, "loss": 1.6353, "step": 193 }, { "epoch": 0.04603701945894637, "grad_norm": 0.5924590826034546, "learning_rate": 6.120000000000001e-06, "loss": 1.3738, "step": 194 }, { "epoch": 0.046274323682961555, "grad_norm": 0.7453778386116028, "learning_rate": 6.1e-06, "loss": 0.8792, "step": 195 }, { "epoch": 0.046511627906976744, "grad_norm": 0.8486724495887756, "learning_rate": 6.08e-06, "loss": 1.7324, "step": 196 }, { "epoch": 0.04674893213099193, "grad_norm": 0.5349167585372925, "learning_rate": 6.0600000000000004e-06, "loss": 1.1938, "step": 197 }, { "epoch": 0.04698623635500712, "grad_norm": 0.4522935748100281, "learning_rate": 6.040000000000001e-06, "loss": 1.282, "step": 198 }, { "epoch": 0.047223540579022304, "grad_norm": 0.6473549008369446, "learning_rate": 6.02e-06, "loss": 0.9753, "step": 199 }, { "epoch": 0.04746084480303749, "grad_norm": 0.8137974739074707, "learning_rate": 6e-06, "loss": 1.6364, "step": 200 }, { "epoch": 0.04769814902705268, "grad_norm": 0.48920387029647827, "learning_rate": 5.98e-06, "loss": 1.8822, "step": 201 }, { "epoch": 0.04793545325106787, "grad_norm": 0.7745688557624817, "learning_rate": 5.9600000000000005e-06, "loss": 1.1183, "step": 202 }, { "epoch": 0.04817275747508306, "grad_norm": 1.2582346200942993, "learning_rate": 5.94e-06, "loss": 2.3109, "step": 203 }, { "epoch": 0.04841006169909824, "grad_norm": 0.6862124800682068, "learning_rate": 5.92e-06, "loss": 1.972, "step": 204 }, { "epoch": 0.04864736592311343, "grad_norm": 0.2611483931541443, "learning_rate": 5.9e-06, "loss": 1.657, "step": 205 }, { "epoch": 0.04888467014712862, "grad_norm": 0.6788746118545532, "learning_rate": 5.8800000000000005e-06, "loss": 1.4215, "step": 206 }, { "epoch": 0.04912197437114381, "grad_norm": 0.7344043254852295, "learning_rate": 5.86e-06, "loss": 1.8161, "step": 207 }, { "epoch": 0.04935927859515899, "grad_norm": 0.9730067849159241, "learning_rate": 5.84e-06, "loss": 2.009, "step": 208 }, { "epoch": 0.04959658281917418, "grad_norm": 0.5687656998634338, "learning_rate": 5.82e-06, "loss": 1.371, "step": 209 }, { "epoch": 0.04983388704318937, "grad_norm": 0.9074623584747314, "learning_rate": 5.8e-06, "loss": 0.9096, "step": 210 }, { "epoch": 0.05007119126720456, "grad_norm": 0.5359849333763123, "learning_rate": 5.78e-06, "loss": 1.7314, "step": 211 }, { "epoch": 0.05030849549121975, "grad_norm": 0.5226656198501587, "learning_rate": 5.76e-06, "loss": 0.7389, "step": 212 }, { "epoch": 0.05054579971523493, "grad_norm": 0.8956894874572754, "learning_rate": 5.74e-06, "loss": 1.1937, "step": 213 }, { "epoch": 0.05078310393925012, "grad_norm": 1.2422311305999756, "learning_rate": 5.72e-06, "loss": 1.8545, "step": 214 }, { "epoch": 0.05102040816326531, "grad_norm": 0.5634399056434631, "learning_rate": 5.7e-06, "loss": 1.446, "step": 215 }, { "epoch": 0.051257712387280496, "grad_norm": 0.7447414398193359, "learning_rate": 5.68e-06, "loss": 1.7231, "step": 216 }, { "epoch": 0.05149501661129568, "grad_norm": 1.131425380706787, "learning_rate": 5.66e-06, "loss": 1.4652, "step": 217 }, { "epoch": 0.05173232083531087, "grad_norm": 0.8668593764305115, "learning_rate": 5.64e-06, "loss": 2.0443, "step": 218 }, { "epoch": 0.051969625059326056, "grad_norm": 1.2437993288040161, "learning_rate": 5.620000000000001e-06, "loss": 2.1323, "step": 219 }, { "epoch": 0.052206929283341245, "grad_norm": 0.914955198764801, "learning_rate": 5.600000000000001e-06, "loss": 2.2205, "step": 220 }, { "epoch": 0.05244423350735643, "grad_norm": 1.0641348361968994, "learning_rate": 5.580000000000001e-06, "loss": 2.4159, "step": 221 }, { "epoch": 0.05268153773137162, "grad_norm": 0.17957435548305511, "learning_rate": 5.560000000000001e-06, "loss": 1.1683, "step": 222 }, { "epoch": 0.052918841955386806, "grad_norm": 1.058635950088501, "learning_rate": 5.540000000000001e-06, "loss": 1.8851, "step": 223 }, { "epoch": 0.053156146179401995, "grad_norm": 0.6216885447502136, "learning_rate": 5.5200000000000005e-06, "loss": 2.0638, "step": 224 }, { "epoch": 0.053393450403417184, "grad_norm": 0.32197320461273193, "learning_rate": 5.500000000000001e-06, "loss": 4.2387, "step": 225 }, { "epoch": 0.053630754627432366, "grad_norm": 0.5334311127662659, "learning_rate": 5.480000000000001e-06, "loss": 1.199, "step": 226 }, { "epoch": 0.053868058851447555, "grad_norm": 0.8656753301620483, "learning_rate": 5.460000000000001e-06, "loss": 1.0053, "step": 227 }, { "epoch": 0.054105363075462744, "grad_norm": 0.9633522629737854, "learning_rate": 5.4400000000000004e-06, "loss": 2.6078, "step": 228 }, { "epoch": 0.05434266729947793, "grad_norm": 0.8262597322463989, "learning_rate": 5.420000000000001e-06, "loss": 2.4638, "step": 229 }, { "epoch": 0.054579971523493115, "grad_norm": 0.9816875457763672, "learning_rate": 5.400000000000001e-06, "loss": 2.3545, "step": 230 }, { "epoch": 0.054817275747508304, "grad_norm": 0.7079796195030212, "learning_rate": 5.380000000000001e-06, "loss": 1.7035, "step": 231 }, { "epoch": 0.05505457997152349, "grad_norm": 0.2551076412200928, "learning_rate": 5.36e-06, "loss": 2.3941, "step": 232 }, { "epoch": 0.05529188419553868, "grad_norm": 0.8667798042297363, "learning_rate": 5.3400000000000005e-06, "loss": 2.5876, "step": 233 }, { "epoch": 0.05552918841955387, "grad_norm": 0.48972687125205994, "learning_rate": 5.320000000000001e-06, "loss": 1.9343, "step": 234 }, { "epoch": 0.05576649264356905, "grad_norm": 1.166282057762146, "learning_rate": 5.300000000000001e-06, "loss": 1.5903, "step": 235 }, { "epoch": 0.05600379686758424, "grad_norm": 1.0136897563934326, "learning_rate": 5.28e-06, "loss": 1.9003, "step": 236 }, { "epoch": 0.05624110109159943, "grad_norm": 0.9301249384880066, "learning_rate": 5.2600000000000005e-06, "loss": 2.2514, "step": 237 }, { "epoch": 0.05647840531561462, "grad_norm": 0.6378384232521057, "learning_rate": 5.240000000000001e-06, "loss": 1.0407, "step": 238 }, { "epoch": 0.0567157095396298, "grad_norm": 0.7191042900085449, "learning_rate": 5.220000000000001e-06, "loss": 1.8487, "step": 239 }, { "epoch": 0.05695301376364499, "grad_norm": 0.8724852204322815, "learning_rate": 5.2e-06, "loss": 1.9766, "step": 240 }, { "epoch": 0.05719031798766018, "grad_norm": 1.2465623617172241, "learning_rate": 5.18e-06, "loss": 2.0123, "step": 241 }, { "epoch": 0.05742762221167537, "grad_norm": 0.6831521987915039, "learning_rate": 5.1600000000000006e-06, "loss": 2.5164, "step": 242 }, { "epoch": 0.05766492643569056, "grad_norm": 0.9678359627723694, "learning_rate": 5.140000000000001e-06, "loss": 1.1767, "step": 243 }, { "epoch": 0.05790223065970574, "grad_norm": 0.7171100378036499, "learning_rate": 5.12e-06, "loss": 1.5144, "step": 244 }, { "epoch": 0.05813953488372093, "grad_norm": 1.8189458847045898, "learning_rate": 5.1e-06, "loss": 0.8019, "step": 245 }, { "epoch": 0.05837683910773612, "grad_norm": 0.8519846796989441, "learning_rate": 5.0800000000000005e-06, "loss": 0.5605, "step": 246 }, { "epoch": 0.05861414333175131, "grad_norm": 0.8352647423744202, "learning_rate": 5.060000000000001e-06, "loss": 1.83, "step": 247 }, { "epoch": 0.05885144755576649, "grad_norm": 0.8553798198699951, "learning_rate": 5.04e-06, "loss": 0.8078, "step": 248 }, { "epoch": 0.05908875177978168, "grad_norm": 0.5836890935897827, "learning_rate": 5.02e-06, "loss": 1.805, "step": 249 }, { "epoch": 0.05932605600379687, "grad_norm": 1.2686045169830322, "learning_rate": 5e-06, "loss": 2.9273, "step": 250 }, { "epoch": 0.05956336022781206, "grad_norm": 1.0224002599716187, "learning_rate": 4.980000000000001e-06, "loss": 1.5689, "step": 251 }, { "epoch": 0.059800664451827246, "grad_norm": 0.6084955334663391, "learning_rate": 4.960000000000001e-06, "loss": 1.4633, "step": 252 }, { "epoch": 0.06003796867584243, "grad_norm": 1.0595405101776123, "learning_rate": 4.94e-06, "loss": 1.799, "step": 253 }, { "epoch": 0.06027527289985762, "grad_norm": 0.6184794306755066, "learning_rate": 4.92e-06, "loss": 0.936, "step": 254 }, { "epoch": 0.060512577123872806, "grad_norm": 0.8224856853485107, "learning_rate": 4.9000000000000005e-06, "loss": 1.2481, "step": 255 }, { "epoch": 0.060749881347887995, "grad_norm": 0.4918515086174011, "learning_rate": 4.880000000000001e-06, "loss": 1.6237, "step": 256 }, { "epoch": 0.06098718557190318, "grad_norm": 0.7177821397781372, "learning_rate": 4.86e-06, "loss": 1.6721, "step": 257 }, { "epoch": 0.061224489795918366, "grad_norm": 0.654577374458313, "learning_rate": 4.84e-06, "loss": 1.6336, "step": 258 }, { "epoch": 0.061461794019933555, "grad_norm": 0.6433861255645752, "learning_rate": 4.8200000000000004e-06, "loss": 0.7603, "step": 259 }, { "epoch": 0.061699098243948744, "grad_norm": 0.8999320864677429, "learning_rate": 4.800000000000001e-06, "loss": 1.5193, "step": 260 }, { "epoch": 0.06193640246796393, "grad_norm": 1.071006417274475, "learning_rate": 4.78e-06, "loss": 2.1792, "step": 261 }, { "epoch": 0.062173706691979115, "grad_norm": 0.950939416885376, "learning_rate": 4.76e-06, "loss": 1.7475, "step": 262 }, { "epoch": 0.062411010915994304, "grad_norm": 0.6791463494300842, "learning_rate": 4.74e-06, "loss": 1.5734, "step": 263 }, { "epoch": 0.06264831514000949, "grad_norm": 0.8757117986679077, "learning_rate": 4.7200000000000005e-06, "loss": 0.9807, "step": 264 }, { "epoch": 0.06288561936402468, "grad_norm": 1.6328972578048706, "learning_rate": 4.7e-06, "loss": 1.3517, "step": 265 }, { "epoch": 0.06312292358803986, "grad_norm": 1.2624143362045288, "learning_rate": 4.680000000000001e-06, "loss": 1.935, "step": 266 }, { "epoch": 0.06336022781205505, "grad_norm": 0.8143572211265564, "learning_rate": 4.66e-06, "loss": 1.2027, "step": 267 }, { "epoch": 0.06359753203607024, "grad_norm": 1.05904221534729, "learning_rate": 4.6400000000000005e-06, "loss": 2.0852, "step": 268 }, { "epoch": 0.06383483626008543, "grad_norm": 0.5743423104286194, "learning_rate": 4.620000000000001e-06, "loss": 1.649, "step": 269 }, { "epoch": 0.06407214048410062, "grad_norm": 1.2721295356750488, "learning_rate": 4.600000000000001e-06, "loss": 1.7225, "step": 270 }, { "epoch": 0.06430944470811581, "grad_norm": 0.8935225009918213, "learning_rate": 4.58e-06, "loss": 1.7969, "step": 271 }, { "epoch": 0.064546748932131, "grad_norm": 0.4470404088497162, "learning_rate": 4.56e-06, "loss": 1.4834, "step": 272 }, { "epoch": 0.06478405315614617, "grad_norm": 0.41209957003593445, "learning_rate": 4.540000000000001e-06, "loss": 1.5577, "step": 273 }, { "epoch": 0.06502135738016136, "grad_norm": 1.3055024147033691, "learning_rate": 4.520000000000001e-06, "loss": 2.0445, "step": 274 }, { "epoch": 0.06525866160417655, "grad_norm": 1.182127594947815, "learning_rate": 4.5e-06, "loss": 2.5339, "step": 275 }, { "epoch": 0.06549596582819174, "grad_norm": 1.030988335609436, "learning_rate": 4.48e-06, "loss": 2.8977, "step": 276 }, { "epoch": 0.06573327005220693, "grad_norm": 1.023729920387268, "learning_rate": 4.4600000000000005e-06, "loss": 3.7561, "step": 277 }, { "epoch": 0.06597057427622212, "grad_norm": 0.8441396951675415, "learning_rate": 4.440000000000001e-06, "loss": 0.9105, "step": 278 }, { "epoch": 0.06620787850023731, "grad_norm": 0.781200647354126, "learning_rate": 4.42e-06, "loss": 1.1585, "step": 279 }, { "epoch": 0.0664451827242525, "grad_norm": 1.0872159004211426, "learning_rate": 4.4e-06, "loss": 1.8595, "step": 280 }, { "epoch": 0.06668248694826769, "grad_norm": 0.7548374533653259, "learning_rate": 4.38e-06, "loss": 1.5494, "step": 281 }, { "epoch": 0.06691979117228286, "grad_norm": 0.7101930975914001, "learning_rate": 4.360000000000001e-06, "loss": 0.9779, "step": 282 }, { "epoch": 0.06715709539629805, "grad_norm": 2.0798416137695312, "learning_rate": 4.34e-06, "loss": 2.2541, "step": 283 }, { "epoch": 0.06739439962031324, "grad_norm": 0.45076173543930054, "learning_rate": 4.32e-06, "loss": 2.0375, "step": 284 }, { "epoch": 0.06763170384432843, "grad_norm": 1.132407784461975, "learning_rate": 4.3e-06, "loss": 1.8917, "step": 285 }, { "epoch": 0.06786900806834362, "grad_norm": 0.4013515114784241, "learning_rate": 4.2800000000000005e-06, "loss": 0.3564, "step": 286 }, { "epoch": 0.0681063122923588, "grad_norm": 0.7896368503570557, "learning_rate": 4.26e-06, "loss": 1.5522, "step": 287 }, { "epoch": 0.068343616516374, "grad_norm": 0.5332828760147095, "learning_rate": 4.24e-06, "loss": 0.56, "step": 288 }, { "epoch": 0.06858092074038918, "grad_norm": 0.6611258387565613, "learning_rate": 4.22e-06, "loss": 1.5975, "step": 289 }, { "epoch": 0.06881822496440437, "grad_norm": 0.8199064135551453, "learning_rate": 4.2000000000000004e-06, "loss": 2.1785, "step": 290 }, { "epoch": 0.06905552918841955, "grad_norm": 0.8175731301307678, "learning_rate": 4.18e-06, "loss": 2.1028, "step": 291 }, { "epoch": 0.06929283341243474, "grad_norm": 0.7817385196685791, "learning_rate": 4.16e-06, "loss": 1.6518, "step": 292 }, { "epoch": 0.06953013763644993, "grad_norm": 0.7397461533546448, "learning_rate": 4.14e-06, "loss": 1.4586, "step": 293 }, { "epoch": 0.06976744186046512, "grad_norm": 0.39525383710861206, "learning_rate": 4.12e-06, "loss": 2.1693, "step": 294 }, { "epoch": 0.0700047460844803, "grad_norm": 0.16452622413635254, "learning_rate": 4.1e-06, "loss": 2.0744, "step": 295 }, { "epoch": 0.0702420503084955, "grad_norm": 0.49008700251579285, "learning_rate": 4.08e-06, "loss": 1.5107, "step": 296 }, { "epoch": 0.07047935453251068, "grad_norm": 0.6975173354148865, "learning_rate": 4.060000000000001e-06, "loss": 1.4174, "step": 297 }, { "epoch": 0.07071665875652587, "grad_norm": 0.6213851571083069, "learning_rate": 4.04e-06, "loss": 1.4038, "step": 298 }, { "epoch": 0.07095396298054106, "grad_norm": 0.5320644378662109, "learning_rate": 4.0200000000000005e-06, "loss": 0.8553, "step": 299 }, { "epoch": 0.07119126720455624, "grad_norm": 1.0871286392211914, "learning_rate": 4.000000000000001e-06, "loss": 2.0419, "step": 300 }, { "epoch": 0.07142857142857142, "grad_norm": 0.3801209330558777, "learning_rate": 3.980000000000001e-06, "loss": 0.5386, "step": 301 }, { "epoch": 0.07166587565258661, "grad_norm": 0.39513254165649414, "learning_rate": 3.96e-06, "loss": 2.0508, "step": 302 }, { "epoch": 0.0719031798766018, "grad_norm": 0.45403411984443665, "learning_rate": 3.94e-06, "loss": 1.3929, "step": 303 }, { "epoch": 0.07214048410061699, "grad_norm": 0.9575373530387878, "learning_rate": 3.920000000000001e-06, "loss": 2.3445, "step": 304 }, { "epoch": 0.07237778832463218, "grad_norm": 0.7246173620223999, "learning_rate": 3.900000000000001e-06, "loss": 1.1485, "step": 305 }, { "epoch": 0.07261509254864737, "grad_norm": 0.4268713891506195, "learning_rate": 3.88e-06, "loss": 1.7111, "step": 306 }, { "epoch": 0.07285239677266256, "grad_norm": 0.3135124742984772, "learning_rate": 3.86e-06, "loss": 1.4731, "step": 307 }, { "epoch": 0.07308970099667775, "grad_norm": 0.647866427898407, "learning_rate": 3.8400000000000005e-06, "loss": 0.9345, "step": 308 }, { "epoch": 0.07332700522069292, "grad_norm": 0.6480103135108948, "learning_rate": 3.820000000000001e-06, "loss": 0.5553, "step": 309 }, { "epoch": 0.07356430944470811, "grad_norm": 0.7226047515869141, "learning_rate": 3.8000000000000005e-06, "loss": 1.1579, "step": 310 }, { "epoch": 0.0738016136687233, "grad_norm": 0.6003654599189758, "learning_rate": 3.7800000000000002e-06, "loss": 1.2483, "step": 311 }, { "epoch": 0.07403891789273849, "grad_norm": 0.6066475510597229, "learning_rate": 3.7600000000000004e-06, "loss": 0.7838, "step": 312 }, { "epoch": 0.07427622211675368, "grad_norm": 0.5357272624969482, "learning_rate": 3.74e-06, "loss": 1.8871, "step": 313 }, { "epoch": 0.07451352634076887, "grad_norm": 0.8201131820678711, "learning_rate": 3.7200000000000004e-06, "loss": 1.6833, "step": 314 }, { "epoch": 0.07475083056478406, "grad_norm": 0.6600767970085144, "learning_rate": 3.7e-06, "loss": 1.5297, "step": 315 }, { "epoch": 0.07498813478879925, "grad_norm": 0.6373748779296875, "learning_rate": 3.6800000000000003e-06, "loss": 1.5093, "step": 316 }, { "epoch": 0.07522543901281442, "grad_norm": 0.7496886849403381, "learning_rate": 3.66e-06, "loss": 2.4157, "step": 317 }, { "epoch": 0.07546274323682961, "grad_norm": 0.9333056211471558, "learning_rate": 3.6400000000000003e-06, "loss": 1.5617, "step": 318 }, { "epoch": 0.0757000474608448, "grad_norm": 1.0693997144699097, "learning_rate": 3.62e-06, "loss": 1.5375, "step": 319 }, { "epoch": 0.07593735168485999, "grad_norm": 0.5746883749961853, "learning_rate": 3.6000000000000003e-06, "loss": 1.4276, "step": 320 }, { "epoch": 0.07617465590887518, "grad_norm": 0.9793761968612671, "learning_rate": 3.58e-06, "loss": 0.9466, "step": 321 }, { "epoch": 0.07641196013289037, "grad_norm": 0.6006602048873901, "learning_rate": 3.5600000000000002e-06, "loss": 1.028, "step": 322 }, { "epoch": 0.07664926435690556, "grad_norm": 0.7533923983573914, "learning_rate": 3.54e-06, "loss": 1.9837, "step": 323 }, { "epoch": 0.07688656858092074, "grad_norm": 1.0659205913543701, "learning_rate": 3.52e-06, "loss": 1.4845, "step": 324 }, { "epoch": 0.07712387280493593, "grad_norm": 0.9382888078689575, "learning_rate": 3.5e-06, "loss": 1.8868, "step": 325 }, { "epoch": 0.07736117702895111, "grad_norm": 0.5263766050338745, "learning_rate": 3.48e-06, "loss": 1.6957, "step": 326 }, { "epoch": 0.0775984812529663, "grad_norm": 1.1845793724060059, "learning_rate": 3.46e-06, "loss": 1.7971, "step": 327 }, { "epoch": 0.07783578547698149, "grad_norm": 0.7983663082122803, "learning_rate": 3.44e-06, "loss": 1.7826, "step": 328 }, { "epoch": 0.07807308970099668, "grad_norm": 0.999782145023346, "learning_rate": 3.4200000000000007e-06, "loss": 1.4238, "step": 329 }, { "epoch": 0.07831039392501186, "grad_norm": 0.44168537855148315, "learning_rate": 3.4000000000000005e-06, "loss": 1.3762, "step": 330 }, { "epoch": 0.07854769814902705, "grad_norm": 3.451951026916504, "learning_rate": 3.3800000000000007e-06, "loss": 1.8961, "step": 331 }, { "epoch": 0.07878500237304224, "grad_norm": 1.2203079462051392, "learning_rate": 3.3600000000000004e-06, "loss": 2.4091, "step": 332 }, { "epoch": 0.07902230659705743, "grad_norm": 0.7909596562385559, "learning_rate": 3.3400000000000006e-06, "loss": 1.288, "step": 333 }, { "epoch": 0.07925961082107262, "grad_norm": 1.0289673805236816, "learning_rate": 3.3200000000000004e-06, "loss": 2.7401, "step": 334 }, { "epoch": 0.0794969150450878, "grad_norm": 1.550726056098938, "learning_rate": 3.3000000000000006e-06, "loss": 2.0937, "step": 335 }, { "epoch": 0.07973421926910298, "grad_norm": 0.9550947546958923, "learning_rate": 3.2800000000000004e-06, "loss": 1.3554, "step": 336 }, { "epoch": 0.07997152349311817, "grad_norm": 0.5482783913612366, "learning_rate": 3.2600000000000006e-06, "loss": 2.0287, "step": 337 }, { "epoch": 0.08020882771713336, "grad_norm": 1.071254014968872, "learning_rate": 3.2400000000000003e-06, "loss": 1.6622, "step": 338 }, { "epoch": 0.08044613194114855, "grad_norm": 1.0661407709121704, "learning_rate": 3.2200000000000005e-06, "loss": 0.9957, "step": 339 }, { "epoch": 0.08068343616516374, "grad_norm": 0.91053307056427, "learning_rate": 3.2000000000000003e-06, "loss": 0.8996, "step": 340 }, { "epoch": 0.08092074038917893, "grad_norm": 1.1073331832885742, "learning_rate": 3.1800000000000005e-06, "loss": 2.591, "step": 341 }, { "epoch": 0.08115804461319412, "grad_norm": 1.430469274520874, "learning_rate": 3.1600000000000002e-06, "loss": 2.6224, "step": 342 }, { "epoch": 0.08139534883720931, "grad_norm": 0.8856847882270813, "learning_rate": 3.1400000000000004e-06, "loss": 1.8773, "step": 343 }, { "epoch": 0.08163265306122448, "grad_norm": 0.7149975895881653, "learning_rate": 3.12e-06, "loss": 1.9744, "step": 344 }, { "epoch": 0.08186995728523967, "grad_norm": 1.156784176826477, "learning_rate": 3.1000000000000004e-06, "loss": 2.718, "step": 345 }, { "epoch": 0.08210726150925486, "grad_norm": 1.512473464012146, "learning_rate": 3.08e-06, "loss": 1.0725, "step": 346 }, { "epoch": 0.08234456573327005, "grad_norm": 0.5760719776153564, "learning_rate": 3.0600000000000003e-06, "loss": 1.3986, "step": 347 }, { "epoch": 0.08258186995728524, "grad_norm": 1.1456931829452515, "learning_rate": 3.04e-06, "loss": 2.6081, "step": 348 }, { "epoch": 0.08281917418130043, "grad_norm": 0.6848717927932739, "learning_rate": 3.0200000000000003e-06, "loss": 2.8127, "step": 349 }, { "epoch": 0.08305647840531562, "grad_norm": 1.2799925804138184, "learning_rate": 3e-06, "loss": 1.1195, "step": 350 }, { "epoch": 0.0832937826293308, "grad_norm": 1.1718430519104004, "learning_rate": 2.9800000000000003e-06, "loss": 1.6587, "step": 351 }, { "epoch": 0.083531086853346, "grad_norm": 0.5741757750511169, "learning_rate": 2.96e-06, "loss": 1.6265, "step": 352 }, { "epoch": 0.08376839107736117, "grad_norm": 0.8194566965103149, "learning_rate": 2.9400000000000002e-06, "loss": 1.2456, "step": 353 }, { "epoch": 0.08400569530137636, "grad_norm": 0.49410197138786316, "learning_rate": 2.92e-06, "loss": 1.5949, "step": 354 }, { "epoch": 0.08424299952539155, "grad_norm": 0.9407163858413696, "learning_rate": 2.9e-06, "loss": 1.1365, "step": 355 }, { "epoch": 0.08448030374940674, "grad_norm": 0.764671266078949, "learning_rate": 2.88e-06, "loss": 0.9231, "step": 356 }, { "epoch": 0.08471760797342193, "grad_norm": 0.6322979927062988, "learning_rate": 2.86e-06, "loss": 0.477, "step": 357 }, { "epoch": 0.08495491219743712, "grad_norm": 0.7397903800010681, "learning_rate": 2.84e-06, "loss": 1.2955, "step": 358 }, { "epoch": 0.0851922164214523, "grad_norm": 1.3564809560775757, "learning_rate": 2.82e-06, "loss": 1.0156, "step": 359 }, { "epoch": 0.0854295206454675, "grad_norm": 0.9099704027175903, "learning_rate": 2.8000000000000003e-06, "loss": 2.5199, "step": 360 }, { "epoch": 0.08566682486948268, "grad_norm": 0.596339225769043, "learning_rate": 2.7800000000000005e-06, "loss": 1.9148, "step": 361 }, { "epoch": 0.08590412909349786, "grad_norm": 0.8687242269515991, "learning_rate": 2.7600000000000003e-06, "loss": 1.6878, "step": 362 }, { "epoch": 0.08614143331751305, "grad_norm": 1.1088467836380005, "learning_rate": 2.7400000000000004e-06, "loss": 1.3883, "step": 363 }, { "epoch": 0.08637873754152824, "grad_norm": 0.26222410798072815, "learning_rate": 2.7200000000000002e-06, "loss": 1.6821, "step": 364 }, { "epoch": 0.08661604176554343, "grad_norm": 0.9348676800727844, "learning_rate": 2.7000000000000004e-06, "loss": 1.3761, "step": 365 }, { "epoch": 0.08685334598955861, "grad_norm": 0.6078274250030518, "learning_rate": 2.68e-06, "loss": 0.8237, "step": 366 }, { "epoch": 0.0870906502135738, "grad_norm": 2.2705135345458984, "learning_rate": 2.6600000000000004e-06, "loss": 2.1154, "step": 367 }, { "epoch": 0.08732795443758899, "grad_norm": 0.5769693851470947, "learning_rate": 2.64e-06, "loss": 0.7956, "step": 368 }, { "epoch": 0.08756525866160418, "grad_norm": 0.9362378120422363, "learning_rate": 2.6200000000000003e-06, "loss": 0.8264, "step": 369 }, { "epoch": 0.08780256288561937, "grad_norm": 1.3028643131256104, "learning_rate": 2.6e-06, "loss": 2.0418, "step": 370 }, { "epoch": 0.08803986710963455, "grad_norm": 1.020676612854004, "learning_rate": 2.5800000000000003e-06, "loss": 1.6203, "step": 371 }, { "epoch": 0.08827717133364973, "grad_norm": 0.5925426483154297, "learning_rate": 2.56e-06, "loss": 2.0088, "step": 372 }, { "epoch": 0.08851447555766492, "grad_norm": 0.9247467517852783, "learning_rate": 2.5400000000000002e-06, "loss": 1.4702, "step": 373 }, { "epoch": 0.08875177978168011, "grad_norm": 0.8355708718299866, "learning_rate": 2.52e-06, "loss": 1.511, "step": 374 }, { "epoch": 0.0889890840056953, "grad_norm": 0.8599538207054138, "learning_rate": 2.5e-06, "loss": 1.1829, "step": 375 }, { "epoch": 0.08922638822971049, "grad_norm": 0.7480601668357849, "learning_rate": 2.4800000000000004e-06, "loss": 0.5197, "step": 376 }, { "epoch": 0.08946369245372568, "grad_norm": 0.6589898467063904, "learning_rate": 2.46e-06, "loss": 1.8403, "step": 377 }, { "epoch": 0.08970099667774087, "grad_norm": 1.1234686374664307, "learning_rate": 2.4400000000000004e-06, "loss": 1.1402, "step": 378 }, { "epoch": 0.08993830090175606, "grad_norm": 0.4725819528102875, "learning_rate": 2.42e-06, "loss": 1.3948, "step": 379 }, { "epoch": 0.09017560512577123, "grad_norm": 0.579430878162384, "learning_rate": 2.4000000000000003e-06, "loss": 1.2087, "step": 380 }, { "epoch": 0.09041290934978642, "grad_norm": 0.9052660465240479, "learning_rate": 2.38e-06, "loss": 0.9077, "step": 381 }, { "epoch": 0.09065021357380161, "grad_norm": 0.5743672847747803, "learning_rate": 2.3600000000000003e-06, "loss": 2.4246, "step": 382 }, { "epoch": 0.0908875177978168, "grad_norm": 0.5894416570663452, "learning_rate": 2.3400000000000005e-06, "loss": 0.9541, "step": 383 }, { "epoch": 0.09112482202183199, "grad_norm": 0.5887079238891602, "learning_rate": 2.3200000000000002e-06, "loss": 1.5845, "step": 384 }, { "epoch": 0.09136212624584718, "grad_norm": 0.6027985215187073, "learning_rate": 2.3000000000000004e-06, "loss": 1.8314, "step": 385 }, { "epoch": 0.09159943046986237, "grad_norm": 1.0281093120574951, "learning_rate": 2.28e-06, "loss": 1.0648, "step": 386 }, { "epoch": 0.09183673469387756, "grad_norm": 1.0283626317977905, "learning_rate": 2.2600000000000004e-06, "loss": 2.3067, "step": 387 }, { "epoch": 0.09207403891789274, "grad_norm": 1.1482547521591187, "learning_rate": 2.24e-06, "loss": 1.8681, "step": 388 }, { "epoch": 0.09231134314190792, "grad_norm": 0.8530061841011047, "learning_rate": 2.2200000000000003e-06, "loss": 0.8857, "step": 389 }, { "epoch": 0.09254864736592311, "grad_norm": 0.836930513381958, "learning_rate": 2.2e-06, "loss": 1.8586, "step": 390 }, { "epoch": 0.0927859515899383, "grad_norm": 0.8930790424346924, "learning_rate": 2.1800000000000003e-06, "loss": 1.6977, "step": 391 }, { "epoch": 0.09302325581395349, "grad_norm": 0.4565262496471405, "learning_rate": 2.16e-06, "loss": 2.1539, "step": 392 }, { "epoch": 0.09326056003796868, "grad_norm": 1.635286569595337, "learning_rate": 2.1400000000000003e-06, "loss": 2.4099, "step": 393 }, { "epoch": 0.09349786426198387, "grad_norm": 1.1868668794631958, "learning_rate": 2.12e-06, "loss": 1.6283, "step": 394 }, { "epoch": 0.09373516848599905, "grad_norm": 1.4009878635406494, "learning_rate": 2.1000000000000002e-06, "loss": 1.4061, "step": 395 }, { "epoch": 0.09397247271001424, "grad_norm": 0.46099644899368286, "learning_rate": 2.08e-06, "loss": 0.4871, "step": 396 }, { "epoch": 0.09420977693402943, "grad_norm": 0.7012650370597839, "learning_rate": 2.06e-06, "loss": 1.5595, "step": 397 }, { "epoch": 0.09444708115804461, "grad_norm": 0.7766276001930237, "learning_rate": 2.04e-06, "loss": 1.0104, "step": 398 }, { "epoch": 0.0946843853820598, "grad_norm": 1.029155969619751, "learning_rate": 2.02e-06, "loss": 2.6892, "step": 399 }, { "epoch": 0.09492168960607499, "grad_norm": 1.6015249490737915, "learning_rate": 2.0000000000000003e-06, "loss": 1.7859, "step": 400 }, { "epoch": 0.09515899383009017, "grad_norm": 0.3838267922401428, "learning_rate": 1.98e-06, "loss": 1.5704, "step": 401 }, { "epoch": 0.09539629805410536, "grad_norm": 0.6478832364082336, "learning_rate": 1.9600000000000003e-06, "loss": 1.2718, "step": 402 }, { "epoch": 0.09563360227812055, "grad_norm": 0.9483569860458374, "learning_rate": 1.94e-06, "loss": 1.9627, "step": 403 }, { "epoch": 0.09587090650213574, "grad_norm": 0.5800157785415649, "learning_rate": 1.9200000000000003e-06, "loss": 0.9114, "step": 404 }, { "epoch": 0.09610821072615093, "grad_norm": 0.5951704978942871, "learning_rate": 1.9000000000000002e-06, "loss": 1.2684, "step": 405 }, { "epoch": 0.09634551495016612, "grad_norm": 0.7555820345878601, "learning_rate": 1.8800000000000002e-06, "loss": 1.7506, "step": 406 }, { "epoch": 0.0965828191741813, "grad_norm": 1.0840675830841064, "learning_rate": 1.8600000000000002e-06, "loss": 2.3729, "step": 407 }, { "epoch": 0.09682012339819648, "grad_norm": 1.0009734630584717, "learning_rate": 1.8400000000000002e-06, "loss": 1.0095, "step": 408 }, { "epoch": 0.09705742762221167, "grad_norm": 0.8436226844787598, "learning_rate": 1.8200000000000002e-06, "loss": 1.6425, "step": 409 }, { "epoch": 0.09729473184622686, "grad_norm": 0.6967753767967224, "learning_rate": 1.8000000000000001e-06, "loss": 2.2186, "step": 410 }, { "epoch": 0.09753203607024205, "grad_norm": 0.9439252614974976, "learning_rate": 1.7800000000000001e-06, "loss": 1.0762, "step": 411 }, { "epoch": 0.09776934029425724, "grad_norm": 1.6090588569641113, "learning_rate": 1.76e-06, "loss": 1.6685, "step": 412 }, { "epoch": 0.09800664451827243, "grad_norm": 0.6204804182052612, "learning_rate": 1.74e-06, "loss": 0.7148, "step": 413 }, { "epoch": 0.09824394874228762, "grad_norm": 0.9542770385742188, "learning_rate": 1.72e-06, "loss": 0.7676, "step": 414 }, { "epoch": 0.0984812529663028, "grad_norm": 1.0385842323303223, "learning_rate": 1.7000000000000002e-06, "loss": 1.6284, "step": 415 }, { "epoch": 0.09871855719031798, "grad_norm": 0.643661379814148, "learning_rate": 1.6800000000000002e-06, "loss": 0.9284, "step": 416 }, { "epoch": 0.09895586141433317, "grad_norm": 0.7566413283348083, "learning_rate": 1.6600000000000002e-06, "loss": 1.8893, "step": 417 }, { "epoch": 0.09919316563834836, "grad_norm": 0.805566132068634, "learning_rate": 1.6400000000000002e-06, "loss": 1.8842, "step": 418 }, { "epoch": 0.09943046986236355, "grad_norm": 0.503933310508728, "learning_rate": 1.6200000000000002e-06, "loss": 1.4513, "step": 419 }, { "epoch": 0.09966777408637874, "grad_norm": 2.3548262119293213, "learning_rate": 1.6000000000000001e-06, "loss": 1.381, "step": 420 }, { "epoch": 0.09990507831039393, "grad_norm": 1.1577256917953491, "learning_rate": 1.5800000000000001e-06, "loss": 2.0068, "step": 421 }, { "epoch": 0.10014238253440912, "grad_norm": 0.8693385124206543, "learning_rate": 1.56e-06, "loss": 1.38, "step": 422 }, { "epoch": 0.1003796867584243, "grad_norm": 0.27834704518318176, "learning_rate": 1.54e-06, "loss": 0.3649, "step": 423 }, { "epoch": 0.1006169909824395, "grad_norm": 0.6906237006187439, "learning_rate": 1.52e-06, "loss": 2.2498, "step": 424 }, { "epoch": 0.10085429520645467, "grad_norm": 1.5801548957824707, "learning_rate": 1.5e-06, "loss": 1.6734, "step": 425 }, { "epoch": 0.10109159943046986, "grad_norm": 0.6525102853775024, "learning_rate": 1.48e-06, "loss": 1.7765, "step": 426 }, { "epoch": 0.10132890365448505, "grad_norm": 0.6370388865470886, "learning_rate": 1.46e-06, "loss": 1.497, "step": 427 }, { "epoch": 0.10156620787850024, "grad_norm": 0.9169662594795227, "learning_rate": 1.44e-06, "loss": 1.7245, "step": 428 }, { "epoch": 0.10180351210251543, "grad_norm": 0.8349008560180664, "learning_rate": 1.42e-06, "loss": 1.2525, "step": 429 }, { "epoch": 0.10204081632653061, "grad_norm": 0.5627273917198181, "learning_rate": 1.4000000000000001e-06, "loss": 1.5604, "step": 430 }, { "epoch": 0.1022781205505458, "grad_norm": 0.6167902946472168, "learning_rate": 1.3800000000000001e-06, "loss": 0.4832, "step": 431 }, { "epoch": 0.10251542477456099, "grad_norm": 0.6913139224052429, "learning_rate": 1.3600000000000001e-06, "loss": 0.4277, "step": 432 }, { "epoch": 0.10275272899857618, "grad_norm": 0.9053479433059692, "learning_rate": 1.34e-06, "loss": 1.1996, "step": 433 }, { "epoch": 0.10299003322259136, "grad_norm": 0.5853822827339172, "learning_rate": 1.32e-06, "loss": 1.5833, "step": 434 }, { "epoch": 0.10322733744660655, "grad_norm": 0.6833449602127075, "learning_rate": 1.3e-06, "loss": 1.5597, "step": 435 }, { "epoch": 0.10346464167062173, "grad_norm": 0.48403409123420715, "learning_rate": 1.28e-06, "loss": 1.7934, "step": 436 }, { "epoch": 0.10370194589463692, "grad_norm": 0.8773030638694763, "learning_rate": 1.26e-06, "loss": 0.5695, "step": 437 }, { "epoch": 0.10393925011865211, "grad_norm": 0.6054906249046326, "learning_rate": 1.2400000000000002e-06, "loss": 2.3307, "step": 438 }, { "epoch": 0.1041765543426673, "grad_norm": 0.6956624388694763, "learning_rate": 1.2200000000000002e-06, "loss": 1.0714, "step": 439 }, { "epoch": 0.10441385856668249, "grad_norm": 1.519985556602478, "learning_rate": 1.2000000000000002e-06, "loss": 1.695, "step": 440 }, { "epoch": 0.10465116279069768, "grad_norm": 1.0551714897155762, "learning_rate": 1.1800000000000001e-06, "loss": 2.457, "step": 441 }, { "epoch": 0.10488846701471286, "grad_norm": 0.7493646740913391, "learning_rate": 1.1600000000000001e-06, "loss": 1.965, "step": 442 }, { "epoch": 0.10512577123872804, "grad_norm": 0.7913329601287842, "learning_rate": 1.14e-06, "loss": 1.5939, "step": 443 }, { "epoch": 0.10536307546274323, "grad_norm": 1.10833740234375, "learning_rate": 1.12e-06, "loss": 2.1043, "step": 444 }, { "epoch": 0.10560037968675842, "grad_norm": 0.8675681948661804, "learning_rate": 1.1e-06, "loss": 1.6333, "step": 445 }, { "epoch": 0.10583768391077361, "grad_norm": 0.8735470771789551, "learning_rate": 1.08e-06, "loss": 2.1604, "step": 446 }, { "epoch": 0.1060749881347888, "grad_norm": 0.9015608429908752, "learning_rate": 1.06e-06, "loss": 0.9548, "step": 447 }, { "epoch": 0.10631229235880399, "grad_norm": 0.7339662313461304, "learning_rate": 1.04e-06, "loss": 1.3065, "step": 448 }, { "epoch": 0.10654959658281918, "grad_norm": 0.5532211661338806, "learning_rate": 1.02e-06, "loss": 1.8098, "step": 449 }, { "epoch": 0.10678690080683437, "grad_norm": 0.8225467801094055, "learning_rate": 1.0000000000000002e-06, "loss": 2.2966, "step": 450 }, { "epoch": 0.10702420503084954, "grad_norm": 0.5000866651535034, "learning_rate": 9.800000000000001e-07, "loss": 0.2921, "step": 451 }, { "epoch": 0.10726150925486473, "grad_norm": 1.0391067266464233, "learning_rate": 9.600000000000001e-07, "loss": 2.2974, "step": 452 }, { "epoch": 0.10749881347887992, "grad_norm": 0.685451865196228, "learning_rate": 9.400000000000001e-07, "loss": 2.0784, "step": 453 }, { "epoch": 0.10773611770289511, "grad_norm": 0.5864785313606262, "learning_rate": 9.200000000000001e-07, "loss": 2.2062, "step": 454 }, { "epoch": 0.1079734219269103, "grad_norm": 0.6014403104782104, "learning_rate": 9.000000000000001e-07, "loss": 0.8348, "step": 455 }, { "epoch": 0.10821072615092549, "grad_norm": 1.6633024215698242, "learning_rate": 8.8e-07, "loss": 1.2606, "step": 456 }, { "epoch": 0.10844803037494068, "grad_norm": 0.7936999797821045, "learning_rate": 8.6e-07, "loss": 2.0804, "step": 457 }, { "epoch": 0.10868533459895587, "grad_norm": 0.22339628636837006, "learning_rate": 8.400000000000001e-07, "loss": 2.1541, "step": 458 }, { "epoch": 0.10892263882297105, "grad_norm": 0.6188535690307617, "learning_rate": 8.200000000000001e-07, "loss": 1.4218, "step": 459 }, { "epoch": 0.10915994304698623, "grad_norm": 0.8845232129096985, "learning_rate": 8.000000000000001e-07, "loss": 1.4453, "step": 460 }, { "epoch": 0.10939724727100142, "grad_norm": 0.5522229671478271, "learning_rate": 7.8e-07, "loss": 1.4562, "step": 461 }, { "epoch": 0.10963455149501661, "grad_norm": 0.49053800106048584, "learning_rate": 7.6e-07, "loss": 0.6361, "step": 462 }, { "epoch": 0.1098718557190318, "grad_norm": 0.5572338104248047, "learning_rate": 7.4e-07, "loss": 0.8394, "step": 463 }, { "epoch": 0.11010915994304699, "grad_norm": 0.9283513426780701, "learning_rate": 7.2e-07, "loss": 1.6316, "step": 464 }, { "epoch": 0.11034646416706217, "grad_norm": 0.8812064528465271, "learning_rate": 7.000000000000001e-07, "loss": 1.2302, "step": 465 }, { "epoch": 0.11058376839107736, "grad_norm": 0.7392125129699707, "learning_rate": 6.800000000000001e-07, "loss": 0.8159, "step": 466 }, { "epoch": 0.11082107261509255, "grad_norm": 0.7616608738899231, "learning_rate": 6.6e-07, "loss": 1.0404, "step": 467 }, { "epoch": 0.11105837683910774, "grad_norm": 0.7175336480140686, "learning_rate": 6.4e-07, "loss": 1.7509, "step": 468 }, { "epoch": 0.11129568106312292, "grad_norm": 0.7180752754211426, "learning_rate": 6.200000000000001e-07, "loss": 0.9795, "step": 469 }, { "epoch": 0.1115329852871381, "grad_norm": 0.875347912311554, "learning_rate": 6.000000000000001e-07, "loss": 1.8271, "step": 470 }, { "epoch": 0.1117702895111533, "grad_norm": 0.6006546020507812, "learning_rate": 5.800000000000001e-07, "loss": 2.0545, "step": 471 }, { "epoch": 0.11200759373516848, "grad_norm": 1.1124011278152466, "learning_rate": 5.6e-07, "loss": 2.2172, "step": 472 }, { "epoch": 0.11224489795918367, "grad_norm": 1.5857324600219727, "learning_rate": 5.4e-07, "loss": 1.2116, "step": 473 }, { "epoch": 0.11248220218319886, "grad_norm": 0.6902075409889221, "learning_rate": 5.2e-07, "loss": 2.0822, "step": 474 }, { "epoch": 0.11271950640721405, "grad_norm": 0.9990330934524536, "learning_rate": 5.000000000000001e-07, "loss": 1.9775, "step": 475 }, { "epoch": 0.11295681063122924, "grad_norm": 1.281016230583191, "learning_rate": 4.800000000000001e-07, "loss": 1.9021, "step": 476 }, { "epoch": 0.11319411485524443, "grad_norm": 0.3223126232624054, "learning_rate": 4.6000000000000004e-07, "loss": 1.8383, "step": 477 }, { "epoch": 0.1134314190792596, "grad_norm": 1.1316232681274414, "learning_rate": 4.4e-07, "loss": 1.6243, "step": 478 }, { "epoch": 0.1136687233032748, "grad_norm": 0.6887989640235901, "learning_rate": 4.2000000000000006e-07, "loss": 1.5909, "step": 479 }, { "epoch": 0.11390602752728998, "grad_norm": 0.8150919675827026, "learning_rate": 4.0000000000000003e-07, "loss": 2.1949, "step": 480 }, { "epoch": 0.11414333175130517, "grad_norm": 0.6823549866676331, "learning_rate": 3.8e-07, "loss": 2.095, "step": 481 }, { "epoch": 0.11438063597532036, "grad_norm": 1.052901268005371, "learning_rate": 3.6e-07, "loss": 1.7599, "step": 482 }, { "epoch": 0.11461794019933555, "grad_norm": 1.048052430152893, "learning_rate": 3.4000000000000003e-07, "loss": 2.1143, "step": 483 }, { "epoch": 0.11485524442335074, "grad_norm": 1.2748647928237915, "learning_rate": 3.2e-07, "loss": 1.8183, "step": 484 }, { "epoch": 0.11509254864736593, "grad_norm": 1.2471035718917847, "learning_rate": 3.0000000000000004e-07, "loss": 1.8442, "step": 485 }, { "epoch": 0.11532985287138112, "grad_norm": 0.46195486187934875, "learning_rate": 2.8e-07, "loss": 1.1617, "step": 486 }, { "epoch": 0.11556715709539629, "grad_norm": 0.6743305325508118, "learning_rate": 2.6e-07, "loss": 1.4748, "step": 487 }, { "epoch": 0.11580446131941148, "grad_norm": 1.0564024448394775, "learning_rate": 2.4000000000000003e-07, "loss": 1.5775, "step": 488 }, { "epoch": 0.11604176554342667, "grad_norm": 0.6965152025222778, "learning_rate": 2.2e-07, "loss": 2.5251, "step": 489 }, { "epoch": 0.11627906976744186, "grad_norm": 0.8700504899024963, "learning_rate": 2.0000000000000002e-07, "loss": 1.6123, "step": 490 }, { "epoch": 0.11651637399145705, "grad_norm": 0.7157378196716309, "learning_rate": 1.8e-07, "loss": 1.7864, "step": 491 }, { "epoch": 0.11675367821547224, "grad_norm": 0.9464967250823975, "learning_rate": 1.6e-07, "loss": 2.0064, "step": 492 }, { "epoch": 0.11699098243948743, "grad_norm": 0.734511137008667, "learning_rate": 1.4e-07, "loss": 0.8449, "step": 493 }, { "epoch": 0.11722828666350261, "grad_norm": 1.0827577114105225, "learning_rate": 1.2000000000000002e-07, "loss": 1.8062, "step": 494 }, { "epoch": 0.1174655908875178, "grad_norm": 1.1060395240783691, "learning_rate": 1.0000000000000001e-07, "loss": 1.9935, "step": 495 }, { "epoch": 0.11770289511153298, "grad_norm": 0.9821236729621887, "learning_rate": 8e-08, "loss": 1.1501, "step": 496 }, { "epoch": 0.11794019933554817, "grad_norm": 0.46705755591392517, "learning_rate": 6.000000000000001e-08, "loss": 2.2727, "step": 497 }, { "epoch": 0.11817750355956336, "grad_norm": 0.7528263926506042, "learning_rate": 4e-08, "loss": 2.2196, "step": 498 }, { "epoch": 0.11841480778357855, "grad_norm": 1.029765009880066, "learning_rate": 2e-08, "loss": 1.3983, "step": 499 }, { "epoch": 0.11865211200759374, "grad_norm": 0.9231687188148499, "learning_rate": 9.000000000000001e-11, "loss": 0.7083, "step": 500 }, { "epoch": 0.11865211200759374, "step": 500, "total_flos": 8.201567207424e+16, "train_loss": 1.6423830435276032, "train_runtime": 704.5977, "train_samples_per_second": 1.419, "train_steps_per_second": 0.71 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.201567207424e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }