diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 0.2650785446166992, "best_model_checkpoint": "./w2v-bert-2.0-hausa_naijavoices_100h/checkpoint-10000", - "epoch": 36.312849162011176, + "epoch": 39.10614525139665, "eval_steps": 1000, - "global_step": 13000, + "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -91137,6 +91137,7016 @@ "eval_steps_per_second": 0.628, "eval_wer": 0.34800703890952767, "step": 13000 + }, + { + "epoch": 36.315642458100555, + "grad_norm": 0.5103163719177246, + "learning_rate": 2.8493482309124767e-05, + "loss": 0.1163, + "step": 13001 + }, + { + "epoch": 36.31843575418994, + "grad_norm": 0.45563486218452454, + "learning_rate": 2.8493109869646184e-05, + "loss": 0.0957, + "step": 13002 + }, + { + "epoch": 36.32122905027933, + "grad_norm": 0.6634702682495117, + "learning_rate": 2.8492737430167597e-05, + "loss": 0.086, + "step": 13003 + }, + { + "epoch": 36.324022346368714, + "grad_norm": 0.3813391327857971, + "learning_rate": 2.8492364990689013e-05, + "loss": 0.0841, + "step": 13004 + }, + { + "epoch": 36.3268156424581, + "grad_norm": 0.4647751748561859, + "learning_rate": 2.849199255121043e-05, + "loss": 0.0837, + "step": 13005 + }, + { + "epoch": 36.32960893854749, + "grad_norm": 0.5191593170166016, + "learning_rate": 2.8491620111731843e-05, + "loss": 0.0903, + "step": 13006 + }, + { + "epoch": 36.332402234636874, + "grad_norm": 2.6562979221343994, + "learning_rate": 2.849124767225326e-05, + "loss": 0.0852, + "step": 13007 + }, + { + "epoch": 36.33519553072626, + "grad_norm": 1.221608281135559, + "learning_rate": 2.8490875232774676e-05, + "loss": 0.102, + "step": 13008 + }, + { + "epoch": 36.33798882681564, + "grad_norm": 1.1749480962753296, + "learning_rate": 2.8490502793296093e-05, + "loss": 0.0789, + "step": 13009 + }, + { + "epoch": 36.340782122905026, + "grad_norm": 0.8152073621749878, + "learning_rate": 2.8490130353817502e-05, + "loss": 0.082, + "step": 13010 + }, + { + "epoch": 36.34357541899441, + "grad_norm": 0.6769988536834717, + "learning_rate": 2.848975791433892e-05, + "loss": 0.0732, + "step": 13011 + }, + { + "epoch": 36.3463687150838, + "grad_norm": 0.7155337929725647, + "learning_rate": 2.8489385474860336e-05, + "loss": 0.0835, + "step": 13012 + }, + { + "epoch": 36.349162011173185, + "grad_norm": 0.9940541386604309, + "learning_rate": 2.8489013035381752e-05, + "loss": 0.1017, + "step": 13013 + }, + { + "epoch": 36.35195530726257, + "grad_norm": 0.3431513011455536, + "learning_rate": 2.848864059590317e-05, + "loss": 0.1231, + "step": 13014 + }, + { + "epoch": 36.35474860335196, + "grad_norm": 0.4082512855529785, + "learning_rate": 2.8488268156424582e-05, + "loss": 0.1119, + "step": 13015 + }, + { + "epoch": 36.357541899441344, + "grad_norm": 0.6857700347900391, + "learning_rate": 2.8487895716946e-05, + "loss": 0.1125, + "step": 13016 + }, + { + "epoch": 36.36033519553072, + "grad_norm": 0.4404042661190033, + "learning_rate": 2.848752327746741e-05, + "loss": 0.1183, + "step": 13017 + }, + { + "epoch": 36.36312849162011, + "grad_norm": 0.4675848186016083, + "learning_rate": 2.8487150837988828e-05, + "loss": 0.122, + "step": 13018 + }, + { + "epoch": 36.365921787709496, + "grad_norm": 0.7972007393836975, + "learning_rate": 2.848677839851024e-05, + "loss": 0.1244, + "step": 13019 + }, + { + "epoch": 36.36871508379888, + "grad_norm": 0.8320142030715942, + "learning_rate": 2.8486405959031658e-05, + "loss": 0.1171, + "step": 13020 + }, + { + "epoch": 36.37150837988827, + "grad_norm": 0.9292674660682678, + "learning_rate": 2.8486033519553074e-05, + "loss": 0.1069, + "step": 13021 + }, + { + "epoch": 36.374301675977655, + "grad_norm": 0.4998623728752136, + "learning_rate": 2.848566108007449e-05, + "loss": 0.1004, + "step": 13022 + }, + { + "epoch": 36.37709497206704, + "grad_norm": 1.3461363315582275, + "learning_rate": 2.8485288640595904e-05, + "loss": 0.0973, + "step": 13023 + }, + { + "epoch": 36.37988826815643, + "grad_norm": 0.49873998761177063, + "learning_rate": 2.8484916201117317e-05, + "loss": 0.1177, + "step": 13024 + }, + { + "epoch": 36.38268156424581, + "grad_norm": 0.5666747689247131, + "learning_rate": 2.8484543761638734e-05, + "loss": 0.1083, + "step": 13025 + }, + { + "epoch": 36.385474860335194, + "grad_norm": 1.3581769466400146, + "learning_rate": 2.848417132216015e-05, + "loss": 0.0955, + "step": 13026 + }, + { + "epoch": 36.38826815642458, + "grad_norm": 0.6231361627578735, + "learning_rate": 2.8483798882681567e-05, + "loss": 0.0989, + "step": 13027 + }, + { + "epoch": 36.391061452513966, + "grad_norm": 0.5649336576461792, + "learning_rate": 2.848342644320298e-05, + "loss": 0.1005, + "step": 13028 + }, + { + "epoch": 36.39385474860335, + "grad_norm": 0.5340343117713928, + "learning_rate": 2.8483054003724396e-05, + "loss": 0.0914, + "step": 13029 + }, + { + "epoch": 36.39664804469274, + "grad_norm": 0.6382849216461182, + "learning_rate": 2.848268156424581e-05, + "loss": 0.0874, + "step": 13030 + }, + { + "epoch": 36.399441340782126, + "grad_norm": 0.7184431552886963, + "learning_rate": 2.8482309124767226e-05, + "loss": 0.0749, + "step": 13031 + }, + { + "epoch": 36.40223463687151, + "grad_norm": 0.7611850500106812, + "learning_rate": 2.8481936685288643e-05, + "loss": 0.064, + "step": 13032 + }, + { + "epoch": 36.40502793296089, + "grad_norm": 0.8529970645904541, + "learning_rate": 2.8481564245810056e-05, + "loss": 0.084, + "step": 13033 + }, + { + "epoch": 36.40782122905028, + "grad_norm": 0.6734018325805664, + "learning_rate": 2.8481191806331472e-05, + "loss": 0.0931, + "step": 13034 + }, + { + "epoch": 36.410614525139664, + "grad_norm": 0.7495433688163757, + "learning_rate": 2.848081936685289e-05, + "loss": 0.0937, + "step": 13035 + }, + { + "epoch": 36.41340782122905, + "grad_norm": 0.7531351447105408, + "learning_rate": 2.8480446927374305e-05, + "loss": 0.0887, + "step": 13036 + }, + { + "epoch": 36.41620111731844, + "grad_norm": 0.9327375888824463, + "learning_rate": 2.8480074487895715e-05, + "loss": 0.0894, + "step": 13037 + }, + { + "epoch": 36.41899441340782, + "grad_norm": 2.5498523712158203, + "learning_rate": 2.847970204841713e-05, + "loss": 0.1179, + "step": 13038 + }, + { + "epoch": 36.42178770949721, + "grad_norm": 0.732311487197876, + "learning_rate": 2.8479329608938548e-05, + "loss": 0.1288, + "step": 13039 + }, + { + "epoch": 36.42458100558659, + "grad_norm": 0.3687518239021301, + "learning_rate": 2.8478957169459965e-05, + "loss": 0.1117, + "step": 13040 + }, + { + "epoch": 36.427374301675975, + "grad_norm": 0.41522008180618286, + "learning_rate": 2.847858472998138e-05, + "loss": 0.0986, + "step": 13041 + }, + { + "epoch": 36.43016759776536, + "grad_norm": 1.1092604398727417, + "learning_rate": 2.8478212290502794e-05, + "loss": 0.1044, + "step": 13042 + }, + { + "epoch": 36.43296089385475, + "grad_norm": 0.5997103452682495, + "learning_rate": 2.8477839851024208e-05, + "loss": 0.1477, + "step": 13043 + }, + { + "epoch": 36.435754189944134, + "grad_norm": 1.1083749532699585, + "learning_rate": 2.8477467411545624e-05, + "loss": 0.1203, + "step": 13044 + }, + { + "epoch": 36.43854748603352, + "grad_norm": 0.6279087662696838, + "learning_rate": 2.847709497206704e-05, + "loss": 0.1205, + "step": 13045 + }, + { + "epoch": 36.44134078212291, + "grad_norm": 0.35380157828330994, + "learning_rate": 2.8476722532588454e-05, + "loss": 0.0955, + "step": 13046 + }, + { + "epoch": 36.444134078212294, + "grad_norm": 0.44671377539634705, + "learning_rate": 2.847635009310987e-05, + "loss": 0.1007, + "step": 13047 + }, + { + "epoch": 36.44692737430167, + "grad_norm": 0.5153653621673584, + "learning_rate": 2.8475977653631287e-05, + "loss": 0.0959, + "step": 13048 + }, + { + "epoch": 36.44972067039106, + "grad_norm": 0.7488757967948914, + "learning_rate": 2.8475605214152703e-05, + "loss": 0.0933, + "step": 13049 + }, + { + "epoch": 36.452513966480446, + "grad_norm": 1.1935946941375732, + "learning_rate": 2.8475232774674116e-05, + "loss": 0.1124, + "step": 13050 + }, + { + "epoch": 36.45530726256983, + "grad_norm": 0.9721536636352539, + "learning_rate": 2.847486033519553e-05, + "loss": 0.0827, + "step": 13051 + }, + { + "epoch": 36.45810055865922, + "grad_norm": 0.5015149712562561, + "learning_rate": 2.8474487895716946e-05, + "loss": 0.0967, + "step": 13052 + }, + { + "epoch": 36.460893854748605, + "grad_norm": 0.9974826574325562, + "learning_rate": 2.8474115456238363e-05, + "loss": 0.0924, + "step": 13053 + }, + { + "epoch": 36.46368715083799, + "grad_norm": 0.8104403614997864, + "learning_rate": 2.847374301675978e-05, + "loss": 0.0931, + "step": 13054 + }, + { + "epoch": 36.46648044692738, + "grad_norm": 0.632412314414978, + "learning_rate": 2.8473370577281192e-05, + "loss": 0.0825, + "step": 13055 + }, + { + "epoch": 36.46927374301676, + "grad_norm": 0.8117548227310181, + "learning_rate": 2.847299813780261e-05, + "loss": 0.084, + "step": 13056 + }, + { + "epoch": 36.47206703910614, + "grad_norm": 0.728563129901886, + "learning_rate": 2.8472625698324022e-05, + "loss": 0.091, + "step": 13057 + }, + { + "epoch": 36.47486033519553, + "grad_norm": 0.6499229669570923, + "learning_rate": 2.847225325884544e-05, + "loss": 0.1043, + "step": 13058 + }, + { + "epoch": 36.477653631284916, + "grad_norm": 0.5659500360488892, + "learning_rate": 2.8471880819366852e-05, + "loss": 0.0817, + "step": 13059 + }, + { + "epoch": 36.4804469273743, + "grad_norm": 1.1683993339538574, + "learning_rate": 2.8471508379888268e-05, + "loss": 0.0788, + "step": 13060 + }, + { + "epoch": 36.48324022346369, + "grad_norm": 1.722424030303955, + "learning_rate": 2.8471135940409685e-05, + "loss": 0.0754, + "step": 13061 + }, + { + "epoch": 36.486033519553075, + "grad_norm": 1.2399303913116455, + "learning_rate": 2.84707635009311e-05, + "loss": 0.117, + "step": 13062 + }, + { + "epoch": 36.48882681564246, + "grad_norm": 1.3291460275650024, + "learning_rate": 2.8470391061452514e-05, + "loss": 0.1093, + "step": 13063 + }, + { + "epoch": 36.49162011173184, + "grad_norm": 0.41691839694976807, + "learning_rate": 2.8470018621973928e-05, + "loss": 0.1385, + "step": 13064 + }, + { + "epoch": 36.49441340782123, + "grad_norm": 0.8629980087280273, + "learning_rate": 2.8469646182495344e-05, + "loss": 0.1391, + "step": 13065 + }, + { + "epoch": 36.497206703910614, + "grad_norm": 0.5989841818809509, + "learning_rate": 2.846927374301676e-05, + "loss": 0.1405, + "step": 13066 + }, + { + "epoch": 36.5, + "grad_norm": 0.4392365515232086, + "learning_rate": 2.8468901303538177e-05, + "loss": 0.1256, + "step": 13067 + }, + { + "epoch": 36.502793296089386, + "grad_norm": 0.41069284081459045, + "learning_rate": 2.846852886405959e-05, + "loss": 0.1267, + "step": 13068 + }, + { + "epoch": 36.50558659217877, + "grad_norm": 1.3699334859848022, + "learning_rate": 2.8468156424581007e-05, + "loss": 0.1088, + "step": 13069 + }, + { + "epoch": 36.50837988826816, + "grad_norm": 0.3773321807384491, + "learning_rate": 2.846778398510242e-05, + "loss": 0.1207, + "step": 13070 + }, + { + "epoch": 36.51117318435754, + "grad_norm": 0.3941558599472046, + "learning_rate": 2.8467411545623837e-05, + "loss": 0.0972, + "step": 13071 + }, + { + "epoch": 36.513966480446925, + "grad_norm": 0.5023883581161499, + "learning_rate": 2.8467039106145253e-05, + "loss": 0.0949, + "step": 13072 + }, + { + "epoch": 36.51675977653631, + "grad_norm": 0.6170898079872131, + "learning_rate": 2.8466666666666666e-05, + "loss": 0.1054, + "step": 13073 + }, + { + "epoch": 36.5195530726257, + "grad_norm": 0.4747655689716339, + "learning_rate": 2.8466294227188083e-05, + "loss": 0.1049, + "step": 13074 + }, + { + "epoch": 36.522346368715084, + "grad_norm": 0.6342079043388367, + "learning_rate": 2.84659217877095e-05, + "loss": 0.1078, + "step": 13075 + }, + { + "epoch": 36.52513966480447, + "grad_norm": 0.9232836961746216, + "learning_rate": 2.8465549348230916e-05, + "loss": 0.0895, + "step": 13076 + }, + { + "epoch": 36.52793296089386, + "grad_norm": 0.6280178427696228, + "learning_rate": 2.8465176908752326e-05, + "loss": 0.1007, + "step": 13077 + }, + { + "epoch": 36.53072625698324, + "grad_norm": 1.9618819952011108, + "learning_rate": 2.8464804469273742e-05, + "loss": 0.0875, + "step": 13078 + }, + { + "epoch": 36.53351955307262, + "grad_norm": 0.5735899209976196, + "learning_rate": 2.846443202979516e-05, + "loss": 0.0871, + "step": 13079 + }, + { + "epoch": 36.53631284916201, + "grad_norm": 0.7038599848747253, + "learning_rate": 2.8464059590316575e-05, + "loss": 0.0882, + "step": 13080 + }, + { + "epoch": 36.539106145251395, + "grad_norm": 0.5991548895835876, + "learning_rate": 2.8463687150837992e-05, + "loss": 0.0794, + "step": 13081 + }, + { + "epoch": 36.54189944134078, + "grad_norm": 0.7051090002059937, + "learning_rate": 2.8463314711359405e-05, + "loss": 0.0823, + "step": 13082 + }, + { + "epoch": 36.54469273743017, + "grad_norm": 0.5633767247200012, + "learning_rate": 2.8462942271880818e-05, + "loss": 0.0807, + "step": 13083 + }, + { + "epoch": 36.547486033519554, + "grad_norm": 0.7850005626678467, + "learning_rate": 2.8462569832402235e-05, + "loss": 0.0999, + "step": 13084 + }, + { + "epoch": 36.55027932960894, + "grad_norm": 0.6875097751617432, + "learning_rate": 2.846219739292365e-05, + "loss": 0.0797, + "step": 13085 + }, + { + "epoch": 36.55307262569833, + "grad_norm": 0.8078372478485107, + "learning_rate": 2.8461824953445064e-05, + "loss": 0.0774, + "step": 13086 + }, + { + "epoch": 36.555865921787706, + "grad_norm": 0.9205446243286133, + "learning_rate": 2.846145251396648e-05, + "loss": 0.0913, + "step": 13087 + }, + { + "epoch": 36.55865921787709, + "grad_norm": 1.0804953575134277, + "learning_rate": 2.8461080074487897e-05, + "loss": 0.1215, + "step": 13088 + }, + { + "epoch": 36.56145251396648, + "grad_norm": 0.5169636607170105, + "learning_rate": 2.8460707635009314e-05, + "loss": 0.12, + "step": 13089 + }, + { + "epoch": 36.564245810055866, + "grad_norm": 0.7846399545669556, + "learning_rate": 2.8460335195530727e-05, + "loss": 0.1233, + "step": 13090 + }, + { + "epoch": 36.56703910614525, + "grad_norm": 0.6060928702354431, + "learning_rate": 2.845996275605214e-05, + "loss": 0.1051, + "step": 13091 + }, + { + "epoch": 36.56983240223464, + "grad_norm": 0.9782649278640747, + "learning_rate": 2.8459590316573557e-05, + "loss": 0.1298, + "step": 13092 + }, + { + "epoch": 36.572625698324025, + "grad_norm": 0.5207920074462891, + "learning_rate": 2.8459217877094973e-05, + "loss": 0.1315, + "step": 13093 + }, + { + "epoch": 36.57541899441341, + "grad_norm": 0.5926817059516907, + "learning_rate": 2.845884543761639e-05, + "loss": 0.1254, + "step": 13094 + }, + { + "epoch": 36.57821229050279, + "grad_norm": 0.707974374294281, + "learning_rate": 2.8458472998137803e-05, + "loss": 0.1204, + "step": 13095 + }, + { + "epoch": 36.58100558659218, + "grad_norm": 0.5093036890029907, + "learning_rate": 2.845810055865922e-05, + "loss": 0.1044, + "step": 13096 + }, + { + "epoch": 36.58379888268156, + "grad_norm": 0.47131848335266113, + "learning_rate": 2.8457728119180633e-05, + "loss": 0.1029, + "step": 13097 + }, + { + "epoch": 36.58659217877095, + "grad_norm": 0.7507746815681458, + "learning_rate": 2.845735567970205e-05, + "loss": 0.1113, + "step": 13098 + }, + { + "epoch": 36.589385474860336, + "grad_norm": 0.445652037858963, + "learning_rate": 2.8456983240223466e-05, + "loss": 0.0843, + "step": 13099 + }, + { + "epoch": 36.59217877094972, + "grad_norm": 0.6210559606552124, + "learning_rate": 2.845661080074488e-05, + "loss": 0.0881, + "step": 13100 + }, + { + "epoch": 36.59497206703911, + "grad_norm": 0.5198919773101807, + "learning_rate": 2.8456238361266295e-05, + "loss": 0.099, + "step": 13101 + }, + { + "epoch": 36.59776536312849, + "grad_norm": 0.7758316397666931, + "learning_rate": 2.8455865921787712e-05, + "loss": 0.1087, + "step": 13102 + }, + { + "epoch": 36.600558659217874, + "grad_norm": 0.36870887875556946, + "learning_rate": 2.8455493482309125e-05, + "loss": 0.0796, + "step": 13103 + }, + { + "epoch": 36.60335195530726, + "grad_norm": 4.285307884216309, + "learning_rate": 2.8455121042830538e-05, + "loss": 0.0923, + "step": 13104 + }, + { + "epoch": 36.60614525139665, + "grad_norm": 1.537385106086731, + "learning_rate": 2.8454748603351955e-05, + "loss": 0.1028, + "step": 13105 + }, + { + "epoch": 36.608938547486034, + "grad_norm": 0.9062882661819458, + "learning_rate": 2.845437616387337e-05, + "loss": 0.0798, + "step": 13106 + }, + { + "epoch": 36.61173184357542, + "grad_norm": 1.2549960613250732, + "learning_rate": 2.8454003724394788e-05, + "loss": 0.0865, + "step": 13107 + }, + { + "epoch": 36.614525139664806, + "grad_norm": 0.4961925148963928, + "learning_rate": 2.8453631284916204e-05, + "loss": 0.0696, + "step": 13108 + }, + { + "epoch": 36.61731843575419, + "grad_norm": 1.2841949462890625, + "learning_rate": 2.8453258845437618e-05, + "loss": 0.0937, + "step": 13109 + }, + { + "epoch": 36.62011173184357, + "grad_norm": 0.7004602551460266, + "learning_rate": 2.845288640595903e-05, + "loss": 0.0846, + "step": 13110 + }, + { + "epoch": 36.62290502793296, + "grad_norm": 1.0388610363006592, + "learning_rate": 2.8452513966480447e-05, + "loss": 0.0798, + "step": 13111 + }, + { + "epoch": 36.625698324022345, + "grad_norm": 0.8924124240875244, + "learning_rate": 2.8452141527001864e-05, + "loss": 0.0911, + "step": 13112 + }, + { + "epoch": 36.62849162011173, + "grad_norm": 0.9572955369949341, + "learning_rate": 2.8451769087523277e-05, + "loss": 0.1039, + "step": 13113 + }, + { + "epoch": 36.63128491620112, + "grad_norm": 0.5310201644897461, + "learning_rate": 2.8451396648044693e-05, + "loss": 0.1167, + "step": 13114 + }, + { + "epoch": 36.634078212290504, + "grad_norm": 0.4729738235473633, + "learning_rate": 2.845102420856611e-05, + "loss": 0.1421, + "step": 13115 + }, + { + "epoch": 36.63687150837989, + "grad_norm": 0.3305386006832123, + "learning_rate": 2.8450651769087526e-05, + "loss": 0.1175, + "step": 13116 + }, + { + "epoch": 36.63966480446928, + "grad_norm": 0.49460217356681824, + "learning_rate": 2.845027932960894e-05, + "loss": 0.1306, + "step": 13117 + }, + { + "epoch": 36.642458100558656, + "grad_norm": 0.3999815881252289, + "learning_rate": 2.8449906890130353e-05, + "loss": 0.1112, + "step": 13118 + }, + { + "epoch": 36.64525139664804, + "grad_norm": 0.6515106558799744, + "learning_rate": 2.844953445065177e-05, + "loss": 0.1082, + "step": 13119 + }, + { + "epoch": 36.64804469273743, + "grad_norm": 1.4159810543060303, + "learning_rate": 2.8449162011173186e-05, + "loss": 0.1204, + "step": 13120 + }, + { + "epoch": 36.650837988826815, + "grad_norm": 0.7723098993301392, + "learning_rate": 2.8448789571694602e-05, + "loss": 0.1074, + "step": 13121 + }, + { + "epoch": 36.6536312849162, + "grad_norm": 0.9828838109970093, + "learning_rate": 2.8448417132216016e-05, + "loss": 0.1013, + "step": 13122 + }, + { + "epoch": 36.65642458100559, + "grad_norm": 0.5460968613624573, + "learning_rate": 2.844804469273743e-05, + "loss": 0.1136, + "step": 13123 + }, + { + "epoch": 36.659217877094974, + "grad_norm": 0.4706367552280426, + "learning_rate": 2.8447672253258845e-05, + "loss": 0.1149, + "step": 13124 + }, + { + "epoch": 36.66201117318436, + "grad_norm": 0.38748401403427124, + "learning_rate": 2.8447299813780262e-05, + "loss": 0.0912, + "step": 13125 + }, + { + "epoch": 36.66480446927374, + "grad_norm": 0.45856836438179016, + "learning_rate": 2.8446927374301678e-05, + "loss": 0.0865, + "step": 13126 + }, + { + "epoch": 36.667597765363126, + "grad_norm": 0.7570058703422546, + "learning_rate": 2.844655493482309e-05, + "loss": 0.1041, + "step": 13127 + }, + { + "epoch": 36.67039106145251, + "grad_norm": 0.821748673915863, + "learning_rate": 2.8446182495344508e-05, + "loss": 0.1071, + "step": 13128 + }, + { + "epoch": 36.6731843575419, + "grad_norm": 0.5184983611106873, + "learning_rate": 2.8445810055865925e-05, + "loss": 0.0797, + "step": 13129 + }, + { + "epoch": 36.675977653631286, + "grad_norm": 0.3808233439922333, + "learning_rate": 2.8445437616387338e-05, + "loss": 0.0871, + "step": 13130 + }, + { + "epoch": 36.67877094972067, + "grad_norm": 0.6422951221466064, + "learning_rate": 2.844506517690875e-05, + "loss": 0.0887, + "step": 13131 + }, + { + "epoch": 36.68156424581006, + "grad_norm": 0.684080958366394, + "learning_rate": 2.8444692737430167e-05, + "loss": 0.0795, + "step": 13132 + }, + { + "epoch": 36.684357541899445, + "grad_norm": 0.7174074053764343, + "learning_rate": 2.8444320297951584e-05, + "loss": 0.0931, + "step": 13133 + }, + { + "epoch": 36.687150837988824, + "grad_norm": 0.7364905476570129, + "learning_rate": 2.8443947858473e-05, + "loss": 0.0802, + "step": 13134 + }, + { + "epoch": 36.68994413407821, + "grad_norm": 0.7372745871543884, + "learning_rate": 2.8443575418994417e-05, + "loss": 0.0972, + "step": 13135 + }, + { + "epoch": 36.6927374301676, + "grad_norm": 0.7332994341850281, + "learning_rate": 2.844320297951583e-05, + "loss": 0.0594, + "step": 13136 + }, + { + "epoch": 36.69553072625698, + "grad_norm": 7.079324245452881, + "learning_rate": 2.8442830540037243e-05, + "loss": 0.0908, + "step": 13137 + }, + { + "epoch": 36.69832402234637, + "grad_norm": 1.1419892311096191, + "learning_rate": 2.844245810055866e-05, + "loss": 0.1062, + "step": 13138 + }, + { + "epoch": 36.701117318435756, + "grad_norm": 0.4915393590927124, + "learning_rate": 2.8442085661080076e-05, + "loss": 0.1254, + "step": 13139 + }, + { + "epoch": 36.70391061452514, + "grad_norm": 0.355114221572876, + "learning_rate": 2.844171322160149e-05, + "loss": 0.1235, + "step": 13140 + }, + { + "epoch": 36.70670391061452, + "grad_norm": 1.367867350578308, + "learning_rate": 2.8441340782122906e-05, + "loss": 0.1279, + "step": 13141 + }, + { + "epoch": 36.70949720670391, + "grad_norm": 0.5479897856712341, + "learning_rate": 2.8440968342644323e-05, + "loss": 0.1014, + "step": 13142 + }, + { + "epoch": 36.712290502793294, + "grad_norm": 0.44179767370224, + "learning_rate": 2.8440595903165736e-05, + "loss": 0.1249, + "step": 13143 + }, + { + "epoch": 36.71508379888268, + "grad_norm": 0.4458467960357666, + "learning_rate": 2.8440223463687152e-05, + "loss": 0.0995, + "step": 13144 + }, + { + "epoch": 36.71787709497207, + "grad_norm": 1.1193277835845947, + "learning_rate": 2.8439851024208565e-05, + "loss": 0.1329, + "step": 13145 + }, + { + "epoch": 36.720670391061454, + "grad_norm": 0.6375119686126709, + "learning_rate": 2.8439478584729982e-05, + "loss": 0.0915, + "step": 13146 + }, + { + "epoch": 36.72346368715084, + "grad_norm": 0.4403560161590576, + "learning_rate": 2.84391061452514e-05, + "loss": 0.1027, + "step": 13147 + }, + { + "epoch": 36.726256983240226, + "grad_norm": 0.6501517295837402, + "learning_rate": 2.8438733705772815e-05, + "loss": 0.1, + "step": 13148 + }, + { + "epoch": 36.729050279329606, + "grad_norm": 0.4135424792766571, + "learning_rate": 2.8438361266294228e-05, + "loss": 0.1057, + "step": 13149 + }, + { + "epoch": 36.73184357541899, + "grad_norm": 0.8687627911567688, + "learning_rate": 2.843798882681564e-05, + "loss": 0.1108, + "step": 13150 + }, + { + "epoch": 36.73463687150838, + "grad_norm": 0.4155297577381134, + "learning_rate": 2.8437616387337058e-05, + "loss": 0.0879, + "step": 13151 + }, + { + "epoch": 36.737430167597765, + "grad_norm": 0.5047573447227478, + "learning_rate": 2.8437243947858474e-05, + "loss": 0.101, + "step": 13152 + }, + { + "epoch": 36.74022346368715, + "grad_norm": 0.6942926645278931, + "learning_rate": 2.8436871508379887e-05, + "loss": 0.0892, + "step": 13153 + }, + { + "epoch": 36.74301675977654, + "grad_norm": 1.0379856824874878, + "learning_rate": 2.8436499068901304e-05, + "loss": 0.0922, + "step": 13154 + }, + { + "epoch": 36.745810055865924, + "grad_norm": 0.4566536247730255, + "learning_rate": 2.843612662942272e-05, + "loss": 0.0717, + "step": 13155 + }, + { + "epoch": 36.74860335195531, + "grad_norm": 0.9123262763023376, + "learning_rate": 2.8435754189944137e-05, + "loss": 0.0781, + "step": 13156 + }, + { + "epoch": 36.75139664804469, + "grad_norm": 0.4724482297897339, + "learning_rate": 2.843538175046555e-05, + "loss": 0.0946, + "step": 13157 + }, + { + "epoch": 36.754189944134076, + "grad_norm": 0.7829778790473938, + "learning_rate": 2.8435009310986963e-05, + "loss": 0.0967, + "step": 13158 + }, + { + "epoch": 36.75698324022346, + "grad_norm": 0.9873928427696228, + "learning_rate": 2.843463687150838e-05, + "loss": 0.0709, + "step": 13159 + }, + { + "epoch": 36.75977653631285, + "grad_norm": 0.6474883556365967, + "learning_rate": 2.8434264432029796e-05, + "loss": 0.0921, + "step": 13160 + }, + { + "epoch": 36.762569832402235, + "grad_norm": 0.6852738857269287, + "learning_rate": 2.8433891992551213e-05, + "loss": 0.0933, + "step": 13161 + }, + { + "epoch": 36.76536312849162, + "grad_norm": 0.5250356793403625, + "learning_rate": 2.8433519553072626e-05, + "loss": 0.0797, + "step": 13162 + }, + { + "epoch": 36.76815642458101, + "grad_norm": 1.3000695705413818, + "learning_rate": 2.843314711359404e-05, + "loss": 0.101, + "step": 13163 + }, + { + "epoch": 36.770949720670394, + "grad_norm": 0.4818575084209442, + "learning_rate": 2.8432774674115456e-05, + "loss": 0.132, + "step": 13164 + }, + { + "epoch": 36.773743016759774, + "grad_norm": 0.3751843273639679, + "learning_rate": 2.8432402234636872e-05, + "loss": 0.1106, + "step": 13165 + }, + { + "epoch": 36.77653631284916, + "grad_norm": 0.5293797254562378, + "learning_rate": 2.843202979515829e-05, + "loss": 0.1372, + "step": 13166 + }, + { + "epoch": 36.779329608938546, + "grad_norm": 0.47639548778533936, + "learning_rate": 2.8431657355679702e-05, + "loss": 0.1118, + "step": 13167 + }, + { + "epoch": 36.78212290502793, + "grad_norm": 0.4971713423728943, + "learning_rate": 2.843128491620112e-05, + "loss": 0.1132, + "step": 13168 + }, + { + "epoch": 36.78491620111732, + "grad_norm": 0.6892354488372803, + "learning_rate": 2.8430912476722535e-05, + "loss": 0.1185, + "step": 13169 + }, + { + "epoch": 36.787709497206706, + "grad_norm": 0.3397558629512787, + "learning_rate": 2.8430540037243948e-05, + "loss": 0.0944, + "step": 13170 + }, + { + "epoch": 36.79050279329609, + "grad_norm": 0.4257424473762512, + "learning_rate": 2.843016759776536e-05, + "loss": 0.1035, + "step": 13171 + }, + { + "epoch": 36.79329608938548, + "grad_norm": 0.412949800491333, + "learning_rate": 2.8429795158286778e-05, + "loss": 0.1024, + "step": 13172 + }, + { + "epoch": 36.79608938547486, + "grad_norm": 0.7454365491867065, + "learning_rate": 2.8429422718808194e-05, + "loss": 0.1367, + "step": 13173 + }, + { + "epoch": 36.798882681564244, + "grad_norm": 1.1098469495773315, + "learning_rate": 2.842905027932961e-05, + "loss": 0.1223, + "step": 13174 + }, + { + "epoch": 36.80167597765363, + "grad_norm": 0.5613962411880493, + "learning_rate": 2.8428677839851028e-05, + "loss": 0.1224, + "step": 13175 + }, + { + "epoch": 36.80446927374302, + "grad_norm": 0.46567198634147644, + "learning_rate": 2.842830540037244e-05, + "loss": 0.1028, + "step": 13176 + }, + { + "epoch": 36.8072625698324, + "grad_norm": 0.5464855432510376, + "learning_rate": 2.8427932960893854e-05, + "loss": 0.0952, + "step": 13177 + }, + { + "epoch": 36.81005586592179, + "grad_norm": 0.5916465520858765, + "learning_rate": 2.842756052141527e-05, + "loss": 0.0911, + "step": 13178 + }, + { + "epoch": 36.812849162011176, + "grad_norm": 0.6826441884040833, + "learning_rate": 2.8427188081936687e-05, + "loss": 0.1084, + "step": 13179 + }, + { + "epoch": 36.815642458100555, + "grad_norm": 0.7281197309494019, + "learning_rate": 2.84268156424581e-05, + "loss": 0.1074, + "step": 13180 + }, + { + "epoch": 36.81843575418994, + "grad_norm": 0.4663384258747101, + "learning_rate": 2.8426443202979517e-05, + "loss": 0.0977, + "step": 13181 + }, + { + "epoch": 36.82122905027933, + "grad_norm": 0.8397631645202637, + "learning_rate": 2.8426070763500933e-05, + "loss": 0.0837, + "step": 13182 + }, + { + "epoch": 36.824022346368714, + "grad_norm": 0.8874335885047913, + "learning_rate": 2.8425698324022346e-05, + "loss": 0.0988, + "step": 13183 + }, + { + "epoch": 36.8268156424581, + "grad_norm": 0.44335997104644775, + "learning_rate": 2.8425325884543763e-05, + "loss": 0.0917, + "step": 13184 + }, + { + "epoch": 36.82960893854749, + "grad_norm": 1.6380071640014648, + "learning_rate": 2.8424953445065176e-05, + "loss": 0.1009, + "step": 13185 + }, + { + "epoch": 36.832402234636874, + "grad_norm": 0.7539939284324646, + "learning_rate": 2.8424581005586593e-05, + "loss": 0.0974, + "step": 13186 + }, + { + "epoch": 36.83519553072626, + "grad_norm": 1.3203480243682861, + "learning_rate": 2.842420856610801e-05, + "loss": 0.109, + "step": 13187 + }, + { + "epoch": 36.83798882681564, + "grad_norm": 1.7456378936767578, + "learning_rate": 2.8423836126629426e-05, + "loss": 0.1004, + "step": 13188 + }, + { + "epoch": 36.840782122905026, + "grad_norm": 0.521722674369812, + "learning_rate": 2.842346368715084e-05, + "loss": 0.1203, + "step": 13189 + }, + { + "epoch": 36.84357541899441, + "grad_norm": 0.7139871120452881, + "learning_rate": 2.8423091247672252e-05, + "loss": 0.1281, + "step": 13190 + }, + { + "epoch": 36.8463687150838, + "grad_norm": 0.6199519038200378, + "learning_rate": 2.842271880819367e-05, + "loss": 0.1215, + "step": 13191 + }, + { + "epoch": 36.849162011173185, + "grad_norm": 0.37283897399902344, + "learning_rate": 2.8422346368715085e-05, + "loss": 0.1176, + "step": 13192 + }, + { + "epoch": 36.85195530726257, + "grad_norm": 0.3728958070278168, + "learning_rate": 2.84219739292365e-05, + "loss": 0.0929, + "step": 13193 + }, + { + "epoch": 36.85474860335196, + "grad_norm": 0.4494488537311554, + "learning_rate": 2.8421601489757915e-05, + "loss": 0.1108, + "step": 13194 + }, + { + "epoch": 36.857541899441344, + "grad_norm": 0.48986729979515076, + "learning_rate": 2.842122905027933e-05, + "loss": 0.1142, + "step": 13195 + }, + { + "epoch": 36.86033519553072, + "grad_norm": 0.43761172890663147, + "learning_rate": 2.8420856610800744e-05, + "loss": 0.1285, + "step": 13196 + }, + { + "epoch": 36.86312849162011, + "grad_norm": 0.674289345741272, + "learning_rate": 2.842048417132216e-05, + "loss": 0.1029, + "step": 13197 + }, + { + "epoch": 36.865921787709496, + "grad_norm": 0.5453643798828125, + "learning_rate": 2.8420111731843574e-05, + "loss": 0.1072, + "step": 13198 + }, + { + "epoch": 36.86871508379888, + "grad_norm": 0.4174542725086212, + "learning_rate": 2.841973929236499e-05, + "loss": 0.1048, + "step": 13199 + }, + { + "epoch": 36.87150837988827, + "grad_norm": 0.39227601885795593, + "learning_rate": 2.8419366852886407e-05, + "loss": 0.1043, + "step": 13200 + }, + { + "epoch": 36.874301675977655, + "grad_norm": 0.4821287989616394, + "learning_rate": 2.8418994413407824e-05, + "loss": 0.1172, + "step": 13201 + }, + { + "epoch": 36.87709497206704, + "grad_norm": 0.3642721474170685, + "learning_rate": 2.841862197392924e-05, + "loss": 0.0936, + "step": 13202 + }, + { + "epoch": 36.87988826815643, + "grad_norm": 0.5393512845039368, + "learning_rate": 2.841824953445065e-05, + "loss": 0.0887, + "step": 13203 + }, + { + "epoch": 36.88268156424581, + "grad_norm": 1.2354297637939453, + "learning_rate": 2.8417877094972066e-05, + "loss": 0.1027, + "step": 13204 + }, + { + "epoch": 36.885474860335194, + "grad_norm": 0.46873167157173157, + "learning_rate": 2.8417504655493483e-05, + "loss": 0.0888, + "step": 13205 + }, + { + "epoch": 36.88826815642458, + "grad_norm": 0.5780599117279053, + "learning_rate": 2.84171322160149e-05, + "loss": 0.1048, + "step": 13206 + }, + { + "epoch": 36.891061452513966, + "grad_norm": 0.7402818202972412, + "learning_rate": 2.8416759776536313e-05, + "loss": 0.1128, + "step": 13207 + }, + { + "epoch": 36.89385474860335, + "grad_norm": 0.5356085300445557, + "learning_rate": 2.841638733705773e-05, + "loss": 0.0932, + "step": 13208 + }, + { + "epoch": 36.89664804469274, + "grad_norm": 0.8292642831802368, + "learning_rate": 2.8416014897579146e-05, + "loss": 0.0829, + "step": 13209 + }, + { + "epoch": 36.899441340782126, + "grad_norm": 1.1126779317855835, + "learning_rate": 2.841564245810056e-05, + "loss": 0.097, + "step": 13210 + }, + { + "epoch": 36.90223463687151, + "grad_norm": 0.5330525636672974, + "learning_rate": 2.8415270018621975e-05, + "loss": 0.0742, + "step": 13211 + }, + { + "epoch": 36.90502793296089, + "grad_norm": 0.6607159376144409, + "learning_rate": 2.841489757914339e-05, + "loss": 0.0895, + "step": 13212 + }, + { + "epoch": 36.90782122905028, + "grad_norm": 1.2516320943832397, + "learning_rate": 2.8414525139664805e-05, + "loss": 0.1351, + "step": 13213 + }, + { + "epoch": 36.910614525139664, + "grad_norm": 0.37802550196647644, + "learning_rate": 2.841415270018622e-05, + "loss": 0.1214, + "step": 13214 + }, + { + "epoch": 36.91340782122905, + "grad_norm": 0.4019375443458557, + "learning_rate": 2.8413780260707638e-05, + "loss": 0.1308, + "step": 13215 + }, + { + "epoch": 36.91620111731844, + "grad_norm": 0.4334688186645508, + "learning_rate": 2.8413407821229048e-05, + "loss": 0.1254, + "step": 13216 + }, + { + "epoch": 36.91899441340782, + "grad_norm": 0.4634140133857727, + "learning_rate": 2.8413035381750464e-05, + "loss": 0.1257, + "step": 13217 + }, + { + "epoch": 36.92178770949721, + "grad_norm": 0.709736168384552, + "learning_rate": 2.841266294227188e-05, + "loss": 0.1236, + "step": 13218 + }, + { + "epoch": 36.92458100558659, + "grad_norm": 3.1148626804351807, + "learning_rate": 2.8412290502793298e-05, + "loss": 0.1265, + "step": 13219 + }, + { + "epoch": 36.927374301675975, + "grad_norm": 0.3892812430858612, + "learning_rate": 2.8411918063314714e-05, + "loss": 0.1122, + "step": 13220 + }, + { + "epoch": 36.93016759776536, + "grad_norm": 0.499705970287323, + "learning_rate": 2.8411545623836127e-05, + "loss": 0.0959, + "step": 13221 + }, + { + "epoch": 36.93296089385475, + "grad_norm": 0.6239756941795349, + "learning_rate": 2.8411173184357544e-05, + "loss": 0.1202, + "step": 13222 + }, + { + "epoch": 36.935754189944134, + "grad_norm": 0.6576144695281982, + "learning_rate": 2.8410800744878957e-05, + "loss": 0.0878, + "step": 13223 + }, + { + "epoch": 36.93854748603352, + "grad_norm": 0.7083747982978821, + "learning_rate": 2.8410428305400373e-05, + "loss": 0.0938, + "step": 13224 + }, + { + "epoch": 36.94134078212291, + "grad_norm": 1.140073299407959, + "learning_rate": 2.8410055865921787e-05, + "loss": 0.103, + "step": 13225 + }, + { + "epoch": 36.944134078212294, + "grad_norm": 0.5083725452423096, + "learning_rate": 2.8409683426443203e-05, + "loss": 0.0934, + "step": 13226 + }, + { + "epoch": 36.94692737430167, + "grad_norm": 0.7583737373352051, + "learning_rate": 2.840931098696462e-05, + "loss": 0.0941, + "step": 13227 + }, + { + "epoch": 36.94972067039106, + "grad_norm": 0.5554139018058777, + "learning_rate": 2.8408938547486036e-05, + "loss": 0.084, + "step": 13228 + }, + { + "epoch": 36.952513966480446, + "grad_norm": 0.7402909994125366, + "learning_rate": 2.8408566108007453e-05, + "loss": 0.0973, + "step": 13229 + }, + { + "epoch": 36.95530726256983, + "grad_norm": 0.7091984152793884, + "learning_rate": 2.8408193668528862e-05, + "loss": 0.0993, + "step": 13230 + }, + { + "epoch": 36.95810055865922, + "grad_norm": 0.7565969228744507, + "learning_rate": 2.840782122905028e-05, + "loss": 0.1019, + "step": 13231 + }, + { + "epoch": 36.960893854748605, + "grad_norm": 0.49960198998451233, + "learning_rate": 2.8407448789571696e-05, + "loss": 0.0865, + "step": 13232 + }, + { + "epoch": 36.96368715083799, + "grad_norm": 0.7727753520011902, + "learning_rate": 2.8407076350093112e-05, + "loss": 0.0817, + "step": 13233 + }, + { + "epoch": 36.96648044692738, + "grad_norm": 0.8217834234237671, + "learning_rate": 2.8406703910614525e-05, + "loss": 0.0905, + "step": 13234 + }, + { + "epoch": 36.96927374301676, + "grad_norm": 0.6883734464645386, + "learning_rate": 2.8406331471135942e-05, + "loss": 0.0864, + "step": 13235 + }, + { + "epoch": 36.97206703910614, + "grad_norm": 0.5650118589401245, + "learning_rate": 2.8405959031657355e-05, + "loss": 0.0866, + "step": 13236 + }, + { + "epoch": 36.97486033519553, + "grad_norm": 1.497057318687439, + "learning_rate": 2.840558659217877e-05, + "loss": 0.0874, + "step": 13237 + }, + { + "epoch": 36.977653631284916, + "grad_norm": 0.9132478833198547, + "learning_rate": 2.8405214152700188e-05, + "loss": 0.1133, + "step": 13238 + }, + { + "epoch": 36.9804469273743, + "grad_norm": 1.1537809371948242, + "learning_rate": 2.84048417132216e-05, + "loss": 0.133, + "step": 13239 + }, + { + "epoch": 36.98324022346369, + "grad_norm": 0.7350137829780579, + "learning_rate": 2.8404469273743018e-05, + "loss": 0.1315, + "step": 13240 + }, + { + "epoch": 36.986033519553075, + "grad_norm": 0.9472057223320007, + "learning_rate": 2.8404096834264434e-05, + "loss": 0.1178, + "step": 13241 + }, + { + "epoch": 36.98882681564246, + "grad_norm": 0.42533066868782043, + "learning_rate": 2.840372439478585e-05, + "loss": 0.0875, + "step": 13242 + }, + { + "epoch": 36.99162011173184, + "grad_norm": 0.6571608185768127, + "learning_rate": 2.840335195530726e-05, + "loss": 0.1159, + "step": 13243 + }, + { + "epoch": 36.99441340782123, + "grad_norm": 1.0682183504104614, + "learning_rate": 2.8402979515828677e-05, + "loss": 0.1039, + "step": 13244 + }, + { + "epoch": 36.997206703910614, + "grad_norm": 1.3828575611114502, + "learning_rate": 2.8402607076350094e-05, + "loss": 0.0998, + "step": 13245 + }, + { + "epoch": 37.0, + "grad_norm": 0.8961092233657837, + "learning_rate": 2.840223463687151e-05, + "loss": 0.0814, + "step": 13246 + }, + { + "epoch": 37.002793296089386, + "grad_norm": 0.74761563539505, + "learning_rate": 2.8401862197392923e-05, + "loss": 0.1385, + "step": 13247 + }, + { + "epoch": 37.00558659217877, + "grad_norm": 0.37932631373405457, + "learning_rate": 2.840148975791434e-05, + "loss": 0.1189, + "step": 13248 + }, + { + "epoch": 37.00837988826816, + "grad_norm": 0.43780866265296936, + "learning_rate": 2.8401117318435756e-05, + "loss": 0.1135, + "step": 13249 + }, + { + "epoch": 37.01117318435754, + "grad_norm": 0.3957724869251251, + "learning_rate": 2.840074487895717e-05, + "loss": 0.1077, + "step": 13250 + }, + { + "epoch": 37.013966480446925, + "grad_norm": 0.9262292981147766, + "learning_rate": 2.8400372439478586e-05, + "loss": 0.1378, + "step": 13251 + }, + { + "epoch": 37.01675977653631, + "grad_norm": 0.46259424090385437, + "learning_rate": 2.84e-05, + "loss": 0.1103, + "step": 13252 + }, + { + "epoch": 37.0195530726257, + "grad_norm": 0.5878164768218994, + "learning_rate": 2.8399627560521416e-05, + "loss": 0.1142, + "step": 13253 + }, + { + "epoch": 37.022346368715084, + "grad_norm": 0.6430202722549438, + "learning_rate": 2.8399255121042832e-05, + "loss": 0.1042, + "step": 13254 + }, + { + "epoch": 37.02513966480447, + "grad_norm": 0.4991042912006378, + "learning_rate": 2.839888268156425e-05, + "loss": 0.0864, + "step": 13255 + }, + { + "epoch": 37.02793296089386, + "grad_norm": 0.45447537302970886, + "learning_rate": 2.839851024208566e-05, + "loss": 0.123, + "step": 13256 + }, + { + "epoch": 37.03072625698324, + "grad_norm": 0.37202754616737366, + "learning_rate": 2.8398137802607075e-05, + "loss": 0.0912, + "step": 13257 + }, + { + "epoch": 37.03351955307262, + "grad_norm": 0.5839473605155945, + "learning_rate": 2.839776536312849e-05, + "loss": 0.0897, + "step": 13258 + }, + { + "epoch": 37.03631284916201, + "grad_norm": 0.3872879445552826, + "learning_rate": 2.8397392923649908e-05, + "loss": 0.0834, + "step": 13259 + }, + { + "epoch": 37.039106145251395, + "grad_norm": 0.910811185836792, + "learning_rate": 2.8397020484171325e-05, + "loss": 0.0976, + "step": 13260 + }, + { + "epoch": 37.04189944134078, + "grad_norm": 0.5208591222763062, + "learning_rate": 2.8396648044692738e-05, + "loss": 0.0856, + "step": 13261 + }, + { + "epoch": 37.04469273743017, + "grad_norm": 0.35912981629371643, + "learning_rate": 2.8396275605214154e-05, + "loss": 0.0799, + "step": 13262 + }, + { + "epoch": 37.047486033519554, + "grad_norm": 0.6543576717376709, + "learning_rate": 2.8395903165735567e-05, + "loss": 0.0988, + "step": 13263 + }, + { + "epoch": 37.05027932960894, + "grad_norm": 1.3767013549804688, + "learning_rate": 2.8395530726256984e-05, + "loss": 0.0887, + "step": 13264 + }, + { + "epoch": 37.05307262569833, + "grad_norm": 0.424280047416687, + "learning_rate": 2.8395158286778397e-05, + "loss": 0.0784, + "step": 13265 + }, + { + "epoch": 37.055865921787706, + "grad_norm": 0.49617505073547363, + "learning_rate": 2.8394785847299814e-05, + "loss": 0.0757, + "step": 13266 + }, + { + "epoch": 37.05865921787709, + "grad_norm": 0.46867772936820984, + "learning_rate": 2.839441340782123e-05, + "loss": 0.0662, + "step": 13267 + }, + { + "epoch": 37.06145251396648, + "grad_norm": 0.684806764125824, + "learning_rate": 2.8394040968342647e-05, + "loss": 0.0799, + "step": 13268 + }, + { + "epoch": 37.064245810055866, + "grad_norm": 0.6075620651245117, + "learning_rate": 2.8393668528864063e-05, + "loss": 0.0686, + "step": 13269 + }, + { + "epoch": 37.06703910614525, + "grad_norm": 1.639358401298523, + "learning_rate": 2.8393296089385473e-05, + "loss": 0.0766, + "step": 13270 + }, + { + "epoch": 37.06983240223464, + "grad_norm": 0.9062066674232483, + "learning_rate": 2.839292364990689e-05, + "loss": 0.0985, + "step": 13271 + }, + { + "epoch": 37.072625698324025, + "grad_norm": 0.5305954217910767, + "learning_rate": 2.8392551210428306e-05, + "loss": 0.143, + "step": 13272 + }, + { + "epoch": 37.07541899441341, + "grad_norm": 0.38107144832611084, + "learning_rate": 2.8392178770949723e-05, + "loss": 0.1104, + "step": 13273 + }, + { + "epoch": 37.07821229050279, + "grad_norm": 0.5925142168998718, + "learning_rate": 2.8391806331471136e-05, + "loss": 0.12, + "step": 13274 + }, + { + "epoch": 37.08100558659218, + "grad_norm": 0.45266959071159363, + "learning_rate": 2.8391433891992552e-05, + "loss": 0.1267, + "step": 13275 + }, + { + "epoch": 37.08379888268156, + "grad_norm": 0.48352015018463135, + "learning_rate": 2.8391061452513965e-05, + "loss": 0.0999, + "step": 13276 + }, + { + "epoch": 37.08659217877095, + "grad_norm": 0.388950377702713, + "learning_rate": 2.8390689013035382e-05, + "loss": 0.1228, + "step": 13277 + }, + { + "epoch": 37.089385474860336, + "grad_norm": 0.5266644358634949, + "learning_rate": 2.83903165735568e-05, + "loss": 0.1099, + "step": 13278 + }, + { + "epoch": 37.09217877094972, + "grad_norm": 0.5096754431724548, + "learning_rate": 2.8389944134078212e-05, + "loss": 0.1007, + "step": 13279 + }, + { + "epoch": 37.09497206703911, + "grad_norm": 0.6119211912155151, + "learning_rate": 2.8389571694599628e-05, + "loss": 0.1055, + "step": 13280 + }, + { + "epoch": 37.097765363128495, + "grad_norm": 0.44630804657936096, + "learning_rate": 2.8389199255121045e-05, + "loss": 0.1079, + "step": 13281 + }, + { + "epoch": 37.100558659217874, + "grad_norm": 0.4275634288787842, + "learning_rate": 2.838882681564246e-05, + "loss": 0.1094, + "step": 13282 + }, + { + "epoch": 37.10335195530726, + "grad_norm": 0.4561696946620941, + "learning_rate": 2.838845437616387e-05, + "loss": 0.0932, + "step": 13283 + }, + { + "epoch": 37.10614525139665, + "grad_norm": 0.5482856631278992, + "learning_rate": 2.8388081936685288e-05, + "loss": 0.0911, + "step": 13284 + }, + { + "epoch": 37.108938547486034, + "grad_norm": 0.9299963116645813, + "learning_rate": 2.8387709497206704e-05, + "loss": 0.0902, + "step": 13285 + }, + { + "epoch": 37.11173184357542, + "grad_norm": 0.4009720981121063, + "learning_rate": 2.838733705772812e-05, + "loss": 0.0824, + "step": 13286 + }, + { + "epoch": 37.114525139664806, + "grad_norm": 0.9100480079650879, + "learning_rate": 2.8386964618249537e-05, + "loss": 0.0713, + "step": 13287 + }, + { + "epoch": 37.11731843575419, + "grad_norm": 0.6844253540039062, + "learning_rate": 2.838659217877095e-05, + "loss": 0.0746, + "step": 13288 + }, + { + "epoch": 37.12011173184357, + "grad_norm": 0.5541043877601624, + "learning_rate": 2.8386219739292367e-05, + "loss": 0.075, + "step": 13289 + }, + { + "epoch": 37.12290502793296, + "grad_norm": 2.5420081615448, + "learning_rate": 2.838584729981378e-05, + "loss": 0.0753, + "step": 13290 + }, + { + "epoch": 37.125698324022345, + "grad_norm": 0.4890208840370178, + "learning_rate": 2.8385474860335197e-05, + "loss": 0.074, + "step": 13291 + }, + { + "epoch": 37.12849162011173, + "grad_norm": 0.865363359451294, + "learning_rate": 2.838510242085661e-05, + "loss": 0.082, + "step": 13292 + }, + { + "epoch": 37.13128491620112, + "grad_norm": 1.2129333019256592, + "learning_rate": 2.8384729981378026e-05, + "loss": 0.1015, + "step": 13293 + }, + { + "epoch": 37.134078212290504, + "grad_norm": 1.9394524097442627, + "learning_rate": 2.8384357541899443e-05, + "loss": 0.0688, + "step": 13294 + }, + { + "epoch": 37.13687150837989, + "grad_norm": 1.8177286386489868, + "learning_rate": 2.838398510242086e-05, + "loss": 0.0943, + "step": 13295 + }, + { + "epoch": 37.13966480446928, + "grad_norm": 0.7867361307144165, + "learning_rate": 2.8383612662942272e-05, + "loss": 0.1062, + "step": 13296 + }, + { + "epoch": 37.142458100558656, + "grad_norm": 0.5121132731437683, + "learning_rate": 2.8383240223463686e-05, + "loss": 0.1339, + "step": 13297 + }, + { + "epoch": 37.14525139664804, + "grad_norm": 0.46477118134498596, + "learning_rate": 2.8382867783985102e-05, + "loss": 0.1315, + "step": 13298 + }, + { + "epoch": 37.14804469273743, + "grad_norm": 0.4613207280635834, + "learning_rate": 2.838249534450652e-05, + "loss": 0.1268, + "step": 13299 + }, + { + "epoch": 37.150837988826815, + "grad_norm": 1.3940175771713257, + "learning_rate": 2.8382122905027935e-05, + "loss": 0.1207, + "step": 13300 + }, + { + "epoch": 37.1536312849162, + "grad_norm": 0.4325193762779236, + "learning_rate": 2.838175046554935e-05, + "loss": 0.0965, + "step": 13301 + }, + { + "epoch": 37.15642458100559, + "grad_norm": 0.415004163980484, + "learning_rate": 2.8381378026070765e-05, + "loss": 0.1054, + "step": 13302 + }, + { + "epoch": 37.159217877094974, + "grad_norm": 0.3888610005378723, + "learning_rate": 2.8381005586592178e-05, + "loss": 0.1094, + "step": 13303 + }, + { + "epoch": 37.16201117318436, + "grad_norm": 0.5196507573127747, + "learning_rate": 2.8380633147113595e-05, + "loss": 0.112, + "step": 13304 + }, + { + "epoch": 37.16480446927374, + "grad_norm": 0.49852314591407776, + "learning_rate": 2.838026070763501e-05, + "loss": 0.0795, + "step": 13305 + }, + { + "epoch": 37.167597765363126, + "grad_norm": 0.7088412046432495, + "learning_rate": 2.8379888268156424e-05, + "loss": 0.0973, + "step": 13306 + }, + { + "epoch": 37.17039106145251, + "grad_norm": 0.3519684076309204, + "learning_rate": 2.837951582867784e-05, + "loss": 0.0835, + "step": 13307 + }, + { + "epoch": 37.1731843575419, + "grad_norm": 0.6783422827720642, + "learning_rate": 2.8379143389199257e-05, + "loss": 0.0806, + "step": 13308 + }, + { + "epoch": 37.175977653631286, + "grad_norm": 0.7427796125411987, + "learning_rate": 2.8378770949720674e-05, + "loss": 0.0869, + "step": 13309 + }, + { + "epoch": 37.17877094972067, + "grad_norm": 0.6161222457885742, + "learning_rate": 2.8378398510242084e-05, + "loss": 0.0739, + "step": 13310 + }, + { + "epoch": 37.18156424581006, + "grad_norm": 0.5002378225326538, + "learning_rate": 2.83780260707635e-05, + "loss": 0.0853, + "step": 13311 + }, + { + "epoch": 37.184357541899445, + "grad_norm": 0.4288215637207031, + "learning_rate": 2.8377653631284917e-05, + "loss": 0.0827, + "step": 13312 + }, + { + "epoch": 37.187150837988824, + "grad_norm": 0.5347750782966614, + "learning_rate": 2.8377281191806333e-05, + "loss": 0.0902, + "step": 13313 + }, + { + "epoch": 37.18994413407821, + "grad_norm": 1.5278944969177246, + "learning_rate": 2.837690875232775e-05, + "loss": 0.0627, + "step": 13314 + }, + { + "epoch": 37.1927374301676, + "grad_norm": 0.489658385515213, + "learning_rate": 2.8376536312849163e-05, + "loss": 0.0789, + "step": 13315 + }, + { + "epoch": 37.19553072625698, + "grad_norm": 0.5824589133262634, + "learning_rate": 2.8376163873370576e-05, + "loss": 0.0842, + "step": 13316 + }, + { + "epoch": 37.19832402234637, + "grad_norm": 0.7069962620735168, + "learning_rate": 2.8375791433891993e-05, + "loss": 0.0673, + "step": 13317 + }, + { + "epoch": 37.201117318435756, + "grad_norm": 0.6756222248077393, + "learning_rate": 2.837541899441341e-05, + "loss": 0.0781, + "step": 13318 + }, + { + "epoch": 37.20391061452514, + "grad_norm": 0.5991140007972717, + "learning_rate": 2.8375046554934822e-05, + "loss": 0.0648, + "step": 13319 + }, + { + "epoch": 37.20670391061452, + "grad_norm": 1.3049763441085815, + "learning_rate": 2.837467411545624e-05, + "loss": 0.0709, + "step": 13320 + }, + { + "epoch": 37.20949720670391, + "grad_norm": 1.0473449230194092, + "learning_rate": 2.8374301675977655e-05, + "loss": 0.0869, + "step": 13321 + }, + { + "epoch": 37.212290502793294, + "grad_norm": 0.6897624135017395, + "learning_rate": 2.8373929236499072e-05, + "loss": 0.1259, + "step": 13322 + }, + { + "epoch": 37.21508379888268, + "grad_norm": 0.5800973176956177, + "learning_rate": 2.8373556797020485e-05, + "loss": 0.1242, + "step": 13323 + }, + { + "epoch": 37.21787709497207, + "grad_norm": 0.3458525538444519, + "learning_rate": 2.8373184357541898e-05, + "loss": 0.1159, + "step": 13324 + }, + { + "epoch": 37.220670391061454, + "grad_norm": 0.36441630125045776, + "learning_rate": 2.8372811918063315e-05, + "loss": 0.1088, + "step": 13325 + }, + { + "epoch": 37.22346368715084, + "grad_norm": 0.48520901799201965, + "learning_rate": 2.837243947858473e-05, + "loss": 0.111, + "step": 13326 + }, + { + "epoch": 37.226256983240226, + "grad_norm": 0.5218999981880188, + "learning_rate": 2.8372067039106148e-05, + "loss": 0.0923, + "step": 13327 + }, + { + "epoch": 37.229050279329606, + "grad_norm": 0.5178213715553284, + "learning_rate": 2.837169459962756e-05, + "loss": 0.1068, + "step": 13328 + }, + { + "epoch": 37.23184357541899, + "grad_norm": 0.4273589849472046, + "learning_rate": 2.8371322160148978e-05, + "loss": 0.086, + "step": 13329 + }, + { + "epoch": 37.23463687150838, + "grad_norm": 0.32568174600601196, + "learning_rate": 2.837094972067039e-05, + "loss": 0.0961, + "step": 13330 + }, + { + "epoch": 37.237430167597765, + "grad_norm": 0.43901607394218445, + "learning_rate": 2.8370577281191807e-05, + "loss": 0.0957, + "step": 13331 + }, + { + "epoch": 37.24022346368715, + "grad_norm": 0.4905693233013153, + "learning_rate": 2.8370204841713224e-05, + "loss": 0.1086, + "step": 13332 + }, + { + "epoch": 37.24301675977654, + "grad_norm": 0.6996617317199707, + "learning_rate": 2.8369832402234637e-05, + "loss": 0.0883, + "step": 13333 + }, + { + "epoch": 37.245810055865924, + "grad_norm": 0.424746572971344, + "learning_rate": 2.8369459962756053e-05, + "loss": 0.1, + "step": 13334 + }, + { + "epoch": 37.24860335195531, + "grad_norm": 0.6246015429496765, + "learning_rate": 2.836908752327747e-05, + "loss": 0.0867, + "step": 13335 + }, + { + "epoch": 37.25139664804469, + "grad_norm": 0.7625470161437988, + "learning_rate": 2.8368715083798883e-05, + "loss": 0.091, + "step": 13336 + }, + { + "epoch": 37.254189944134076, + "grad_norm": 0.7978317141532898, + "learning_rate": 2.8368342644320296e-05, + "loss": 0.0841, + "step": 13337 + }, + { + "epoch": 37.25698324022346, + "grad_norm": 0.6554155945777893, + "learning_rate": 2.8367970204841713e-05, + "loss": 0.092, + "step": 13338 + }, + { + "epoch": 37.25977653631285, + "grad_norm": 0.9054871797561646, + "learning_rate": 2.836759776536313e-05, + "loss": 0.1006, + "step": 13339 + }, + { + "epoch": 37.262569832402235, + "grad_norm": 0.7611244916915894, + "learning_rate": 2.8367225325884546e-05, + "loss": 0.0789, + "step": 13340 + }, + { + "epoch": 37.26536312849162, + "grad_norm": 0.748970091342926, + "learning_rate": 2.836685288640596e-05, + "loss": 0.0634, + "step": 13341 + }, + { + "epoch": 37.26815642458101, + "grad_norm": 0.8926631808280945, + "learning_rate": 2.8366480446927376e-05, + "loss": 0.0833, + "step": 13342 + }, + { + "epoch": 37.270949720670394, + "grad_norm": 0.6337838768959045, + "learning_rate": 2.836610800744879e-05, + "loss": 0.0817, + "step": 13343 + }, + { + "epoch": 37.273743016759774, + "grad_norm": 0.4261005222797394, + "learning_rate": 2.8365735567970205e-05, + "loss": 0.0568, + "step": 13344 + }, + { + "epoch": 37.27653631284916, + "grad_norm": 0.7921914458274841, + "learning_rate": 2.8365363128491622e-05, + "loss": 0.0658, + "step": 13345 + }, + { + "epoch": 37.279329608938546, + "grad_norm": 1.3355543613433838, + "learning_rate": 2.8364990689013035e-05, + "loss": 0.1009, + "step": 13346 + }, + { + "epoch": 37.28212290502793, + "grad_norm": 0.6163201332092285, + "learning_rate": 2.836461824953445e-05, + "loss": 0.1288, + "step": 13347 + }, + { + "epoch": 37.28491620111732, + "grad_norm": 0.60809725522995, + "learning_rate": 2.8364245810055868e-05, + "loss": 0.1338, + "step": 13348 + }, + { + "epoch": 37.287709497206706, + "grad_norm": 0.4188610315322876, + "learning_rate": 2.8363873370577284e-05, + "loss": 0.1319, + "step": 13349 + }, + { + "epoch": 37.29050279329609, + "grad_norm": 0.6678484678268433, + "learning_rate": 2.8363500931098694e-05, + "loss": 0.1344, + "step": 13350 + }, + { + "epoch": 37.29329608938548, + "grad_norm": 0.3436428904533386, + "learning_rate": 2.836312849162011e-05, + "loss": 0.1105, + "step": 13351 + }, + { + "epoch": 37.29608938547486, + "grad_norm": 0.4123501479625702, + "learning_rate": 2.8362756052141527e-05, + "loss": 0.0983, + "step": 13352 + }, + { + "epoch": 37.298882681564244, + "grad_norm": 0.6006595492362976, + "learning_rate": 2.8362383612662944e-05, + "loss": 0.1112, + "step": 13353 + }, + { + "epoch": 37.30167597765363, + "grad_norm": 0.46118563413619995, + "learning_rate": 2.836201117318436e-05, + "loss": 0.0929, + "step": 13354 + }, + { + "epoch": 37.30446927374302, + "grad_norm": 0.3846932053565979, + "learning_rate": 2.8361638733705774e-05, + "loss": 0.1011, + "step": 13355 + }, + { + "epoch": 37.3072625698324, + "grad_norm": 0.4042539596557617, + "learning_rate": 2.8361266294227187e-05, + "loss": 0.0863, + "step": 13356 + }, + { + "epoch": 37.31005586592179, + "grad_norm": 0.6703089475631714, + "learning_rate": 2.8360893854748603e-05, + "loss": 0.089, + "step": 13357 + }, + { + "epoch": 37.312849162011176, + "grad_norm": 0.8210147619247437, + "learning_rate": 2.836052141527002e-05, + "loss": 0.0928, + "step": 13358 + }, + { + "epoch": 37.315642458100555, + "grad_norm": 0.48430782556533813, + "learning_rate": 2.8360148975791433e-05, + "loss": 0.089, + "step": 13359 + }, + { + "epoch": 37.31843575418994, + "grad_norm": 0.5108228921890259, + "learning_rate": 2.835977653631285e-05, + "loss": 0.1011, + "step": 13360 + }, + { + "epoch": 37.32122905027933, + "grad_norm": 0.4103139340877533, + "learning_rate": 2.8359404096834266e-05, + "loss": 0.0889, + "step": 13361 + }, + { + "epoch": 37.324022346368714, + "grad_norm": 0.5112071633338928, + "learning_rate": 2.8359031657355683e-05, + "loss": 0.0867, + "step": 13362 + }, + { + "epoch": 37.3268156424581, + "grad_norm": 0.6700937747955322, + "learning_rate": 2.8358659217877096e-05, + "loss": 0.081, + "step": 13363 + }, + { + "epoch": 37.32960893854749, + "grad_norm": 1.1024699211120605, + "learning_rate": 2.835828677839851e-05, + "loss": 0.093, + "step": 13364 + }, + { + "epoch": 37.332402234636874, + "grad_norm": 0.5729321837425232, + "learning_rate": 2.8357914338919925e-05, + "loss": 0.072, + "step": 13365 + }, + { + "epoch": 37.33519553072626, + "grad_norm": 0.5340880155563354, + "learning_rate": 2.8357541899441342e-05, + "loss": 0.0751, + "step": 13366 + }, + { + "epoch": 37.33798882681564, + "grad_norm": 0.6916103363037109, + "learning_rate": 2.835716945996276e-05, + "loss": 0.0762, + "step": 13367 + }, + { + "epoch": 37.340782122905026, + "grad_norm": 0.693001925945282, + "learning_rate": 2.835679702048417e-05, + "loss": 0.0707, + "step": 13368 + }, + { + "epoch": 37.34357541899441, + "grad_norm": 0.8475944995880127, + "learning_rate": 2.8356424581005588e-05, + "loss": 0.0819, + "step": 13369 + }, + { + "epoch": 37.3463687150838, + "grad_norm": 1.7849116325378418, + "learning_rate": 2.8356052141527e-05, + "loss": 0.0717, + "step": 13370 + }, + { + "epoch": 37.349162011173185, + "grad_norm": 1.116263747215271, + "learning_rate": 2.8355679702048418e-05, + "loss": 0.1079, + "step": 13371 + }, + { + "epoch": 37.35195530726257, + "grad_norm": 0.4863429069519043, + "learning_rate": 2.8355307262569834e-05, + "loss": 0.1154, + "step": 13372 + }, + { + "epoch": 37.35474860335196, + "grad_norm": 0.4517359435558319, + "learning_rate": 2.8354934823091247e-05, + "loss": 0.0943, + "step": 13373 + }, + { + "epoch": 37.357541899441344, + "grad_norm": 0.36375024914741516, + "learning_rate": 2.8354562383612664e-05, + "loss": 0.1108, + "step": 13374 + }, + { + "epoch": 37.36033519553072, + "grad_norm": 0.685482919216156, + "learning_rate": 2.835418994413408e-05, + "loss": 0.0988, + "step": 13375 + }, + { + "epoch": 37.36312849162011, + "grad_norm": 0.4224201738834381, + "learning_rate": 2.8353817504655494e-05, + "loss": 0.1146, + "step": 13376 + }, + { + "epoch": 37.365921787709496, + "grad_norm": 0.5666427612304688, + "learning_rate": 2.8353445065176907e-05, + "loss": 0.1056, + "step": 13377 + }, + { + "epoch": 37.36871508379888, + "grad_norm": 0.4976658225059509, + "learning_rate": 2.8353072625698323e-05, + "loss": 0.1166, + "step": 13378 + }, + { + "epoch": 37.37150837988827, + "grad_norm": 1.321623682975769, + "learning_rate": 2.835270018621974e-05, + "loss": 0.0955, + "step": 13379 + }, + { + "epoch": 37.374301675977655, + "grad_norm": 0.47414878010749817, + "learning_rate": 2.8352327746741156e-05, + "loss": 0.0955, + "step": 13380 + }, + { + "epoch": 37.37709497206704, + "grad_norm": 0.6215054392814636, + "learning_rate": 2.8351955307262573e-05, + "loss": 0.11, + "step": 13381 + }, + { + "epoch": 37.37988826815643, + "grad_norm": 0.4070114493370056, + "learning_rate": 2.8351582867783986e-05, + "loss": 0.1018, + "step": 13382 + }, + { + "epoch": 37.38268156424581, + "grad_norm": 0.38315749168395996, + "learning_rate": 2.83512104283054e-05, + "loss": 0.0971, + "step": 13383 + }, + { + "epoch": 37.385474860335194, + "grad_norm": 0.522167444229126, + "learning_rate": 2.8350837988826816e-05, + "loss": 0.0904, + "step": 13384 + }, + { + "epoch": 37.38826815642458, + "grad_norm": 0.3863961696624756, + "learning_rate": 2.8350465549348232e-05, + "loss": 0.0726, + "step": 13385 + }, + { + "epoch": 37.391061452513966, + "grad_norm": 1.890020489692688, + "learning_rate": 2.8350093109869645e-05, + "loss": 0.0855, + "step": 13386 + }, + { + "epoch": 37.39385474860335, + "grad_norm": 0.49436914920806885, + "learning_rate": 2.8349720670391062e-05, + "loss": 0.0852, + "step": 13387 + }, + { + "epoch": 37.39664804469274, + "grad_norm": 0.6537649035453796, + "learning_rate": 2.834934823091248e-05, + "loss": 0.068, + "step": 13388 + }, + { + "epoch": 37.399441340782126, + "grad_norm": 0.5225786566734314, + "learning_rate": 2.8348975791433895e-05, + "loss": 0.0778, + "step": 13389 + }, + { + "epoch": 37.40223463687151, + "grad_norm": 0.6574114561080933, + "learning_rate": 2.8348603351955308e-05, + "loss": 0.068, + "step": 13390 + }, + { + "epoch": 37.40502793296089, + "grad_norm": 0.6743513941764832, + "learning_rate": 2.834823091247672e-05, + "loss": 0.0831, + "step": 13391 + }, + { + "epoch": 37.40782122905028, + "grad_norm": 0.46237507462501526, + "learning_rate": 2.8347858472998138e-05, + "loss": 0.0859, + "step": 13392 + }, + { + "epoch": 37.410614525139664, + "grad_norm": 0.529401957988739, + "learning_rate": 2.8347486033519554e-05, + "loss": 0.0712, + "step": 13393 + }, + { + "epoch": 37.41340782122905, + "grad_norm": 0.7206148505210876, + "learning_rate": 2.834711359404097e-05, + "loss": 0.088, + "step": 13394 + }, + { + "epoch": 37.41620111731844, + "grad_norm": 0.673270583152771, + "learning_rate": 2.8346741154562384e-05, + "loss": 0.0656, + "step": 13395 + }, + { + "epoch": 37.41899441340782, + "grad_norm": 3.8371360301971436, + "learning_rate": 2.8346368715083797e-05, + "loss": 0.1028, + "step": 13396 + }, + { + "epoch": 37.42178770949721, + "grad_norm": 1.7300827503204346, + "learning_rate": 2.8345996275605214e-05, + "loss": 0.1199, + "step": 13397 + }, + { + "epoch": 37.42458100558659, + "grad_norm": 0.36664679646492004, + "learning_rate": 2.834562383612663e-05, + "loss": 0.1263, + "step": 13398 + }, + { + "epoch": 37.427374301675975, + "grad_norm": 0.3068758547306061, + "learning_rate": 2.8345251396648047e-05, + "loss": 0.0956, + "step": 13399 + }, + { + "epoch": 37.43016759776536, + "grad_norm": 0.503147304058075, + "learning_rate": 2.834487895716946e-05, + "loss": 0.1094, + "step": 13400 + }, + { + "epoch": 37.43296089385475, + "grad_norm": 0.7151728868484497, + "learning_rate": 2.8344506517690877e-05, + "loss": 0.1028, + "step": 13401 + }, + { + "epoch": 37.435754189944134, + "grad_norm": 0.7435734868049622, + "learning_rate": 2.8344134078212293e-05, + "loss": 0.1255, + "step": 13402 + }, + { + "epoch": 37.43854748603352, + "grad_norm": 0.472522109746933, + "learning_rate": 2.8343761638733706e-05, + "loss": 0.1107, + "step": 13403 + }, + { + "epoch": 37.44134078212291, + "grad_norm": 0.37457749247550964, + "learning_rate": 2.834338919925512e-05, + "loss": 0.0905, + "step": 13404 + }, + { + "epoch": 37.444134078212294, + "grad_norm": 0.4437308609485626, + "learning_rate": 2.8343016759776536e-05, + "loss": 0.0931, + "step": 13405 + }, + { + "epoch": 37.44692737430167, + "grad_norm": 0.4241126775741577, + "learning_rate": 2.8342644320297952e-05, + "loss": 0.0853, + "step": 13406 + }, + { + "epoch": 37.44972067039106, + "grad_norm": 0.7503301501274109, + "learning_rate": 2.834227188081937e-05, + "loss": 0.1058, + "step": 13407 + }, + { + "epoch": 37.452513966480446, + "grad_norm": 0.43189507722854614, + "learning_rate": 2.8341899441340786e-05, + "loss": 0.0872, + "step": 13408 + }, + { + "epoch": 37.45530726256983, + "grad_norm": 0.5886490941047668, + "learning_rate": 2.83415270018622e-05, + "loss": 0.0919, + "step": 13409 + }, + { + "epoch": 37.45810055865922, + "grad_norm": 0.6694846153259277, + "learning_rate": 2.8341154562383612e-05, + "loss": 0.1075, + "step": 13410 + }, + { + "epoch": 37.460893854748605, + "grad_norm": 0.5880787968635559, + "learning_rate": 2.834078212290503e-05, + "loss": 0.0863, + "step": 13411 + }, + { + "epoch": 37.46368715083799, + "grad_norm": 0.6287997364997864, + "learning_rate": 2.8340409683426445e-05, + "loss": 0.0732, + "step": 13412 + }, + { + "epoch": 37.46648044692738, + "grad_norm": 0.43379098176956177, + "learning_rate": 2.8340037243947858e-05, + "loss": 0.0837, + "step": 13413 + }, + { + "epoch": 37.46927374301676, + "grad_norm": 0.432059645652771, + "learning_rate": 2.8339664804469275e-05, + "loss": 0.0781, + "step": 13414 + }, + { + "epoch": 37.47206703910614, + "grad_norm": 0.5733720660209656, + "learning_rate": 2.833929236499069e-05, + "loss": 0.0967, + "step": 13415 + }, + { + "epoch": 37.47486033519553, + "grad_norm": 0.5086501240730286, + "learning_rate": 2.8338919925512104e-05, + "loss": 0.079, + "step": 13416 + }, + { + "epoch": 37.477653631284916, + "grad_norm": 1.2448936700820923, + "learning_rate": 2.833854748603352e-05, + "loss": 0.0589, + "step": 13417 + }, + { + "epoch": 37.4804469273743, + "grad_norm": 0.6683579683303833, + "learning_rate": 2.8338175046554934e-05, + "loss": 0.0611, + "step": 13418 + }, + { + "epoch": 37.48324022346369, + "grad_norm": 0.5866195559501648, + "learning_rate": 2.833780260707635e-05, + "loss": 0.0747, + "step": 13419 + }, + { + "epoch": 37.486033519553075, + "grad_norm": 1.6199088096618652, + "learning_rate": 2.8337430167597767e-05, + "loss": 0.0792, + "step": 13420 + }, + { + "epoch": 37.48882681564246, + "grad_norm": 1.4266573190689087, + "learning_rate": 2.8337057728119184e-05, + "loss": 0.0934, + "step": 13421 + }, + { + "epoch": 37.49162011173184, + "grad_norm": 1.9309886693954468, + "learning_rate": 2.8336685288640597e-05, + "loss": 0.1278, + "step": 13422 + }, + { + "epoch": 37.49441340782123, + "grad_norm": 0.704800009727478, + "learning_rate": 2.833631284916201e-05, + "loss": 0.1105, + "step": 13423 + }, + { + "epoch": 37.497206703910614, + "grad_norm": 0.5257163047790527, + "learning_rate": 2.8335940409683426e-05, + "loss": 0.122, + "step": 13424 + }, + { + "epoch": 37.5, + "grad_norm": 0.5658948421478271, + "learning_rate": 2.8335567970204843e-05, + "loss": 0.102, + "step": 13425 + }, + { + "epoch": 37.502793296089386, + "grad_norm": 1.2854669094085693, + "learning_rate": 2.833519553072626e-05, + "loss": 0.1175, + "step": 13426 + }, + { + "epoch": 37.50558659217877, + "grad_norm": 0.4650685787200928, + "learning_rate": 2.8334823091247673e-05, + "loss": 0.1125, + "step": 13427 + }, + { + "epoch": 37.50837988826816, + "grad_norm": 0.41859397292137146, + "learning_rate": 2.833445065176909e-05, + "loss": 0.1228, + "step": 13428 + }, + { + "epoch": 37.51117318435754, + "grad_norm": 0.8538311719894409, + "learning_rate": 2.8334078212290506e-05, + "loss": 0.0994, + "step": 13429 + }, + { + "epoch": 37.513966480446925, + "grad_norm": 0.7266781330108643, + "learning_rate": 2.833370577281192e-05, + "loss": 0.0919, + "step": 13430 + }, + { + "epoch": 37.51675977653631, + "grad_norm": 0.46352332830429077, + "learning_rate": 2.8333333333333332e-05, + "loss": 0.0963, + "step": 13431 + }, + { + "epoch": 37.5195530726257, + "grad_norm": 0.6529415249824524, + "learning_rate": 2.833296089385475e-05, + "loss": 0.0961, + "step": 13432 + }, + { + "epoch": 37.522346368715084, + "grad_norm": 0.618046224117279, + "learning_rate": 2.8332588454376165e-05, + "loss": 0.0979, + "step": 13433 + }, + { + "epoch": 37.52513966480447, + "grad_norm": 0.5989570021629333, + "learning_rate": 2.833221601489758e-05, + "loss": 0.096, + "step": 13434 + }, + { + "epoch": 37.52793296089386, + "grad_norm": 0.5621271729469299, + "learning_rate": 2.8331843575418995e-05, + "loss": 0.086, + "step": 13435 + }, + { + "epoch": 37.53072625698324, + "grad_norm": 0.4390662908554077, + "learning_rate": 2.8331471135940408e-05, + "loss": 0.0627, + "step": 13436 + }, + { + "epoch": 37.53351955307262, + "grad_norm": 0.6179614663124084, + "learning_rate": 2.8331098696461824e-05, + "loss": 0.0813, + "step": 13437 + }, + { + "epoch": 37.53631284916201, + "grad_norm": 0.4908084571361542, + "learning_rate": 2.833072625698324e-05, + "loss": 0.0697, + "step": 13438 + }, + { + "epoch": 37.539106145251395, + "grad_norm": 0.4476589262485504, + "learning_rate": 2.8330353817504657e-05, + "loss": 0.0832, + "step": 13439 + }, + { + "epoch": 37.54189944134078, + "grad_norm": 0.47580498456954956, + "learning_rate": 2.832998137802607e-05, + "loss": 0.0648, + "step": 13440 + }, + { + "epoch": 37.54469273743017, + "grad_norm": 0.5279530882835388, + "learning_rate": 2.8329608938547487e-05, + "loss": 0.081, + "step": 13441 + }, + { + "epoch": 37.547486033519554, + "grad_norm": 0.5034967064857483, + "learning_rate": 2.8329236499068904e-05, + "loss": 0.0704, + "step": 13442 + }, + { + "epoch": 37.55027932960894, + "grad_norm": 0.7681204080581665, + "learning_rate": 2.8328864059590317e-05, + "loss": 0.0901, + "step": 13443 + }, + { + "epoch": 37.55307262569833, + "grad_norm": 1.0770174264907837, + "learning_rate": 2.832849162011173e-05, + "loss": 0.0817, + "step": 13444 + }, + { + "epoch": 37.555865921787706, + "grad_norm": 0.6479112505912781, + "learning_rate": 2.8328119180633147e-05, + "loss": 0.0739, + "step": 13445 + }, + { + "epoch": 37.55865921787709, + "grad_norm": 2.0961380004882812, + "learning_rate": 2.8327746741154563e-05, + "loss": 0.1462, + "step": 13446 + }, + { + "epoch": 37.56145251396648, + "grad_norm": 0.4542543888092041, + "learning_rate": 2.832737430167598e-05, + "loss": 0.1317, + "step": 13447 + }, + { + "epoch": 37.564245810055866, + "grad_norm": 0.42304617166519165, + "learning_rate": 2.8327001862197396e-05, + "loss": 0.1034, + "step": 13448 + }, + { + "epoch": 37.56703910614525, + "grad_norm": 2.2587661743164062, + "learning_rate": 2.832662942271881e-05, + "loss": 0.1152, + "step": 13449 + }, + { + "epoch": 37.56983240223464, + "grad_norm": 0.6085516214370728, + "learning_rate": 2.8326256983240222e-05, + "loss": 0.1089, + "step": 13450 + }, + { + "epoch": 37.572625698324025, + "grad_norm": 0.7562211751937866, + "learning_rate": 2.832588454376164e-05, + "loss": 0.139, + "step": 13451 + }, + { + "epoch": 37.57541899441341, + "grad_norm": 0.7050812244415283, + "learning_rate": 2.8325512104283056e-05, + "loss": 0.1198, + "step": 13452 + }, + { + "epoch": 37.57821229050279, + "grad_norm": 2.613920211791992, + "learning_rate": 2.832513966480447e-05, + "loss": 0.0951, + "step": 13453 + }, + { + "epoch": 37.58100558659218, + "grad_norm": 0.5703511238098145, + "learning_rate": 2.8324767225325885e-05, + "loss": 0.1243, + "step": 13454 + }, + { + "epoch": 37.58379888268156, + "grad_norm": 0.6454934477806091, + "learning_rate": 2.8324394785847302e-05, + "loss": 0.1023, + "step": 13455 + }, + { + "epoch": 37.58659217877095, + "grad_norm": 0.364760160446167, + "learning_rate": 2.8324022346368715e-05, + "loss": 0.1019, + "step": 13456 + }, + { + "epoch": 37.589385474860336, + "grad_norm": 0.5709313154220581, + "learning_rate": 2.832364990689013e-05, + "loss": 0.098, + "step": 13457 + }, + { + "epoch": 37.59217877094972, + "grad_norm": 0.5588530898094177, + "learning_rate": 2.8323277467411545e-05, + "loss": 0.0932, + "step": 13458 + }, + { + "epoch": 37.59497206703911, + "grad_norm": 0.7134435176849365, + "learning_rate": 2.832290502793296e-05, + "loss": 0.0909, + "step": 13459 + }, + { + "epoch": 37.59776536312849, + "grad_norm": 0.5087964534759521, + "learning_rate": 2.8322532588454378e-05, + "loss": 0.0866, + "step": 13460 + }, + { + "epoch": 37.600558659217874, + "grad_norm": 2.167469024658203, + "learning_rate": 2.8322160148975794e-05, + "loss": 0.0873, + "step": 13461 + }, + { + "epoch": 37.60335195530726, + "grad_norm": 0.5431972146034241, + "learning_rate": 2.8321787709497207e-05, + "loss": 0.0714, + "step": 13462 + }, + { + "epoch": 37.60614525139665, + "grad_norm": 0.6218196153640747, + "learning_rate": 2.832141527001862e-05, + "loss": 0.0896, + "step": 13463 + }, + { + "epoch": 37.608938547486034, + "grad_norm": 0.6366216540336609, + "learning_rate": 2.8321042830540037e-05, + "loss": 0.0856, + "step": 13464 + }, + { + "epoch": 37.61173184357542, + "grad_norm": 0.6926112771034241, + "learning_rate": 2.8320670391061454e-05, + "loss": 0.083, + "step": 13465 + }, + { + "epoch": 37.614525139664806, + "grad_norm": 1.5821130275726318, + "learning_rate": 2.832029795158287e-05, + "loss": 0.0878, + "step": 13466 + }, + { + "epoch": 37.61731843575419, + "grad_norm": 0.8636652827262878, + "learning_rate": 2.8319925512104283e-05, + "loss": 0.0942, + "step": 13467 + }, + { + "epoch": 37.62011173184357, + "grad_norm": 0.61641526222229, + "learning_rate": 2.83195530726257e-05, + "loss": 0.0624, + "step": 13468 + }, + { + "epoch": 37.62290502793296, + "grad_norm": 0.6826140880584717, + "learning_rate": 2.8319180633147116e-05, + "loss": 0.0932, + "step": 13469 + }, + { + "epoch": 37.625698324022345, + "grad_norm": 1.7921136617660522, + "learning_rate": 2.831880819366853e-05, + "loss": 0.0746, + "step": 13470 + }, + { + "epoch": 37.62849162011173, + "grad_norm": 1.4691777229309082, + "learning_rate": 2.8318435754189943e-05, + "loss": 0.0733, + "step": 13471 + }, + { + "epoch": 37.63128491620112, + "grad_norm": 2.1894166469573975, + "learning_rate": 2.831806331471136e-05, + "loss": 0.1993, + "step": 13472 + }, + { + "epoch": 37.634078212290504, + "grad_norm": 0.7527551054954529, + "learning_rate": 2.8317690875232776e-05, + "loss": 0.1275, + "step": 13473 + }, + { + "epoch": 37.63687150837989, + "grad_norm": 0.49783098697662354, + "learning_rate": 2.8317318435754192e-05, + "loss": 0.1285, + "step": 13474 + }, + { + "epoch": 37.63966480446928, + "grad_norm": 0.5186768174171448, + "learning_rate": 2.831694599627561e-05, + "loss": 0.1161, + "step": 13475 + }, + { + "epoch": 37.642458100558656, + "grad_norm": 0.4457460641860962, + "learning_rate": 2.831657355679702e-05, + "loss": 0.1206, + "step": 13476 + }, + { + "epoch": 37.64525139664804, + "grad_norm": 0.4691607356071472, + "learning_rate": 2.8316201117318435e-05, + "loss": 0.1025, + "step": 13477 + }, + { + "epoch": 37.64804469273743, + "grad_norm": 0.4640813171863556, + "learning_rate": 2.831582867783985e-05, + "loss": 0.0891, + "step": 13478 + }, + { + "epoch": 37.650837988826815, + "grad_norm": 0.5140835642814636, + "learning_rate": 2.8315456238361268e-05, + "loss": 0.0927, + "step": 13479 + }, + { + "epoch": 37.6536312849162, + "grad_norm": 0.53755122423172, + "learning_rate": 2.831508379888268e-05, + "loss": 0.1013, + "step": 13480 + }, + { + "epoch": 37.65642458100559, + "grad_norm": 0.5389484167098999, + "learning_rate": 2.8314711359404098e-05, + "loss": 0.0797, + "step": 13481 + }, + { + "epoch": 37.659217877094974, + "grad_norm": 0.45718449354171753, + "learning_rate": 2.8314338919925514e-05, + "loss": 0.0992, + "step": 13482 + }, + { + "epoch": 37.66201117318436, + "grad_norm": 0.4195377230644226, + "learning_rate": 2.8313966480446927e-05, + "loss": 0.0825, + "step": 13483 + }, + { + "epoch": 37.66480446927374, + "grad_norm": 0.6377631425857544, + "learning_rate": 2.8313594040968344e-05, + "loss": 0.088, + "step": 13484 + }, + { + "epoch": 37.667597765363126, + "grad_norm": 0.6691291332244873, + "learning_rate": 2.8313221601489757e-05, + "loss": 0.121, + "step": 13485 + }, + { + "epoch": 37.67039106145251, + "grad_norm": 0.5260879993438721, + "learning_rate": 2.8312849162011174e-05, + "loss": 0.0961, + "step": 13486 + }, + { + "epoch": 37.6731843575419, + "grad_norm": 0.5460242033004761, + "learning_rate": 2.831247672253259e-05, + "loss": 0.0994, + "step": 13487 + }, + { + "epoch": 37.675977653631286, + "grad_norm": 2.0270771980285645, + "learning_rate": 2.8312104283054007e-05, + "loss": 0.0847, + "step": 13488 + }, + { + "epoch": 37.67877094972067, + "grad_norm": 0.4517759084701538, + "learning_rate": 2.831173184357542e-05, + "loss": 0.0899, + "step": 13489 + }, + { + "epoch": 37.68156424581006, + "grad_norm": 0.40351763367652893, + "learning_rate": 2.8311359404096833e-05, + "loss": 0.0745, + "step": 13490 + }, + { + "epoch": 37.684357541899445, + "grad_norm": 0.5966194272041321, + "learning_rate": 2.831098696461825e-05, + "loss": 0.0867, + "step": 13491 + }, + { + "epoch": 37.687150837988824, + "grad_norm": 0.5589469075202942, + "learning_rate": 2.8310614525139666e-05, + "loss": 0.0737, + "step": 13492 + }, + { + "epoch": 37.68994413407821, + "grad_norm": 1.3126776218414307, + "learning_rate": 2.8310242085661083e-05, + "loss": 0.0851, + "step": 13493 + }, + { + "epoch": 37.6927374301676, + "grad_norm": 1.3814119100570679, + "learning_rate": 2.8309869646182496e-05, + "loss": 0.0877, + "step": 13494 + }, + { + "epoch": 37.69553072625698, + "grad_norm": 1.1588068008422852, + "learning_rate": 2.8309497206703912e-05, + "loss": 0.0776, + "step": 13495 + }, + { + "epoch": 37.69832402234637, + "grad_norm": 0.6683679223060608, + "learning_rate": 2.8309124767225325e-05, + "loss": 0.0942, + "step": 13496 + }, + { + "epoch": 37.701117318435756, + "grad_norm": 0.5763810873031616, + "learning_rate": 2.8308752327746742e-05, + "loss": 0.1245, + "step": 13497 + }, + { + "epoch": 37.70391061452514, + "grad_norm": 0.3733951449394226, + "learning_rate": 2.8308379888268155e-05, + "loss": 0.1116, + "step": 13498 + }, + { + "epoch": 37.70670391061452, + "grad_norm": 0.5441080331802368, + "learning_rate": 2.830800744878957e-05, + "loss": 0.1028, + "step": 13499 + }, + { + "epoch": 37.70949720670391, + "grad_norm": 0.721950888633728, + "learning_rate": 2.8307635009310988e-05, + "loss": 0.1203, + "step": 13500 + }, + { + "epoch": 37.712290502793294, + "grad_norm": 1.3386579751968384, + "learning_rate": 2.8307262569832405e-05, + "loss": 0.1179, + "step": 13501 + }, + { + "epoch": 37.71508379888268, + "grad_norm": 0.4122866988182068, + "learning_rate": 2.830689013035382e-05, + "loss": 0.0993, + "step": 13502 + }, + { + "epoch": 37.71787709497207, + "grad_norm": 0.8496657609939575, + "learning_rate": 2.830651769087523e-05, + "loss": 0.1042, + "step": 13503 + }, + { + "epoch": 37.720670391061454, + "grad_norm": 0.42184126377105713, + "learning_rate": 2.8306145251396648e-05, + "loss": 0.1155, + "step": 13504 + }, + { + "epoch": 37.72346368715084, + "grad_norm": 0.4997349977493286, + "learning_rate": 2.8305772811918064e-05, + "loss": 0.0942, + "step": 13505 + }, + { + "epoch": 37.726256983240226, + "grad_norm": 0.8263510465621948, + "learning_rate": 2.830540037243948e-05, + "loss": 0.0947, + "step": 13506 + }, + { + "epoch": 37.729050279329606, + "grad_norm": 0.4532104730606079, + "learning_rate": 2.8305027932960894e-05, + "loss": 0.093, + "step": 13507 + }, + { + "epoch": 37.73184357541899, + "grad_norm": 0.6759839653968811, + "learning_rate": 2.830465549348231e-05, + "loss": 0.101, + "step": 13508 + }, + { + "epoch": 37.73463687150838, + "grad_norm": 0.47681787610054016, + "learning_rate": 2.8304283054003727e-05, + "loss": 0.0886, + "step": 13509 + }, + { + "epoch": 37.737430167597765, + "grad_norm": 0.7991171479225159, + "learning_rate": 2.830391061452514e-05, + "loss": 0.0772, + "step": 13510 + }, + { + "epoch": 37.74022346368715, + "grad_norm": 0.5240347981452942, + "learning_rate": 2.8303538175046557e-05, + "loss": 0.0871, + "step": 13511 + }, + { + "epoch": 37.74301675977654, + "grad_norm": 1.0193067789077759, + "learning_rate": 2.830316573556797e-05, + "loss": 0.0934, + "step": 13512 + }, + { + "epoch": 37.745810055865924, + "grad_norm": 0.5378578305244446, + "learning_rate": 2.8302793296089386e-05, + "loss": 0.074, + "step": 13513 + }, + { + "epoch": 37.74860335195531, + "grad_norm": 0.5372198820114136, + "learning_rate": 2.8302420856610803e-05, + "loss": 0.0699, + "step": 13514 + }, + { + "epoch": 37.75139664804469, + "grad_norm": 1.436480164527893, + "learning_rate": 2.830204841713222e-05, + "loss": 0.084, + "step": 13515 + }, + { + "epoch": 37.754189944134076, + "grad_norm": 0.9659476280212402, + "learning_rate": 2.830167597765363e-05, + "loss": 0.0992, + "step": 13516 + }, + { + "epoch": 37.75698324022346, + "grad_norm": 0.6202499270439148, + "learning_rate": 2.8301303538175046e-05, + "loss": 0.089, + "step": 13517 + }, + { + "epoch": 37.75977653631285, + "grad_norm": 0.5778876543045044, + "learning_rate": 2.8300931098696462e-05, + "loss": 0.0689, + "step": 13518 + }, + { + "epoch": 37.762569832402235, + "grad_norm": 0.6211645007133484, + "learning_rate": 2.830055865921788e-05, + "loss": 0.0678, + "step": 13519 + }, + { + "epoch": 37.76536312849162, + "grad_norm": 1.2204548120498657, + "learning_rate": 2.8300186219739295e-05, + "loss": 0.0869, + "step": 13520 + }, + { + "epoch": 37.76815642458101, + "grad_norm": 1.3301259279251099, + "learning_rate": 2.829981378026071e-05, + "loss": 0.1025, + "step": 13521 + }, + { + "epoch": 37.770949720670394, + "grad_norm": 0.6454889178276062, + "learning_rate": 2.8299441340782125e-05, + "loss": 0.1185, + "step": 13522 + }, + { + "epoch": 37.773743016759774, + "grad_norm": 0.678033709526062, + "learning_rate": 2.8299068901303538e-05, + "loss": 0.1458, + "step": 13523 + }, + { + "epoch": 37.77653631284916, + "grad_norm": 1.3174045085906982, + "learning_rate": 2.8298696461824955e-05, + "loss": 0.1204, + "step": 13524 + }, + { + "epoch": 37.779329608938546, + "grad_norm": 0.5161803960800171, + "learning_rate": 2.8298324022346368e-05, + "loss": 0.0941, + "step": 13525 + }, + { + "epoch": 37.78212290502793, + "grad_norm": 3.0800626277923584, + "learning_rate": 2.8297951582867784e-05, + "loss": 0.1096, + "step": 13526 + }, + { + "epoch": 37.78491620111732, + "grad_norm": 0.6972345113754272, + "learning_rate": 2.82975791433892e-05, + "loss": 0.1123, + "step": 13527 + }, + { + "epoch": 37.787709497206706, + "grad_norm": 1.100494384765625, + "learning_rate": 2.8297206703910617e-05, + "loss": 0.1191, + "step": 13528 + }, + { + "epoch": 37.79050279329609, + "grad_norm": 0.4266567826271057, + "learning_rate": 2.829683426443203e-05, + "loss": 0.1074, + "step": 13529 + }, + { + "epoch": 37.79329608938548, + "grad_norm": 1.1610013246536255, + "learning_rate": 2.8296461824953444e-05, + "loss": 0.1241, + "step": 13530 + }, + { + "epoch": 37.79608938547486, + "grad_norm": 0.9091783165931702, + "learning_rate": 2.829608938547486e-05, + "loss": 0.097, + "step": 13531 + }, + { + "epoch": 37.798882681564244, + "grad_norm": 0.601504385471344, + "learning_rate": 2.8295716945996277e-05, + "loss": 0.1166, + "step": 13532 + }, + { + "epoch": 37.80167597765363, + "grad_norm": 0.9661040306091309, + "learning_rate": 2.8295344506517693e-05, + "loss": 0.098, + "step": 13533 + }, + { + "epoch": 37.80446927374302, + "grad_norm": 0.47214701771736145, + "learning_rate": 2.8294972067039106e-05, + "loss": 0.0988, + "step": 13534 + }, + { + "epoch": 37.8072625698324, + "grad_norm": 0.49803289771080017, + "learning_rate": 2.8294599627560523e-05, + "loss": 0.0838, + "step": 13535 + }, + { + "epoch": 37.81005586592179, + "grad_norm": 0.5302631258964539, + "learning_rate": 2.8294227188081936e-05, + "loss": 0.0729, + "step": 13536 + }, + { + "epoch": 37.812849162011176, + "grad_norm": 0.8785151839256287, + "learning_rate": 2.8293854748603353e-05, + "loss": 0.0772, + "step": 13537 + }, + { + "epoch": 37.815642458100555, + "grad_norm": 0.7076191306114197, + "learning_rate": 2.8293482309124766e-05, + "loss": 0.0923, + "step": 13538 + }, + { + "epoch": 37.81843575418994, + "grad_norm": 0.60430508852005, + "learning_rate": 2.8293109869646182e-05, + "loss": 0.0891, + "step": 13539 + }, + { + "epoch": 37.82122905027933, + "grad_norm": 1.816399335861206, + "learning_rate": 2.82927374301676e-05, + "loss": 0.0803, + "step": 13540 + }, + { + "epoch": 37.824022346368714, + "grad_norm": 0.838579535484314, + "learning_rate": 2.8292364990689015e-05, + "loss": 0.1021, + "step": 13541 + }, + { + "epoch": 37.8268156424581, + "grad_norm": 0.48321452736854553, + "learning_rate": 2.8291992551210432e-05, + "loss": 0.0738, + "step": 13542 + }, + { + "epoch": 37.82960893854749, + "grad_norm": 2.8692708015441895, + "learning_rate": 2.829162011173184e-05, + "loss": 0.0862, + "step": 13543 + }, + { + "epoch": 37.832402234636874, + "grad_norm": 0.587476909160614, + "learning_rate": 2.8291247672253258e-05, + "loss": 0.0766, + "step": 13544 + }, + { + "epoch": 37.83519553072626, + "grad_norm": 1.1105986833572388, + "learning_rate": 2.8290875232774675e-05, + "loss": 0.072, + "step": 13545 + }, + { + "epoch": 37.83798882681564, + "grad_norm": 2.053555488586426, + "learning_rate": 2.829050279329609e-05, + "loss": 0.115, + "step": 13546 + }, + { + "epoch": 37.840782122905026, + "grad_norm": 0.5315178632736206, + "learning_rate": 2.8290130353817504e-05, + "loss": 0.1307, + "step": 13547 + }, + { + "epoch": 37.84357541899441, + "grad_norm": 0.47529634833335876, + "learning_rate": 2.828975791433892e-05, + "loss": 0.1158, + "step": 13548 + }, + { + "epoch": 37.8463687150838, + "grad_norm": 0.3512713313102722, + "learning_rate": 2.8289385474860337e-05, + "loss": 0.1268, + "step": 13549 + }, + { + "epoch": 37.849162011173185, + "grad_norm": 0.44737356901168823, + "learning_rate": 2.828901303538175e-05, + "loss": 0.1351, + "step": 13550 + }, + { + "epoch": 37.85195530726257, + "grad_norm": 0.6788381338119507, + "learning_rate": 2.8288640595903167e-05, + "loss": 0.1074, + "step": 13551 + }, + { + "epoch": 37.85474860335196, + "grad_norm": 0.9020258188247681, + "learning_rate": 2.828826815642458e-05, + "loss": 0.1038, + "step": 13552 + }, + { + "epoch": 37.857541899441344, + "grad_norm": 0.4978260397911072, + "learning_rate": 2.8287895716945997e-05, + "loss": 0.1096, + "step": 13553 + }, + { + "epoch": 37.86033519553072, + "grad_norm": 0.9137853980064392, + "learning_rate": 2.8287523277467413e-05, + "loss": 0.0982, + "step": 13554 + }, + { + "epoch": 37.86312849162011, + "grad_norm": 0.5086469650268555, + "learning_rate": 2.828715083798883e-05, + "loss": 0.1034, + "step": 13555 + }, + { + "epoch": 37.865921787709496, + "grad_norm": 0.7379629015922546, + "learning_rate": 2.828677839851024e-05, + "loss": 0.1085, + "step": 13556 + }, + { + "epoch": 37.86871508379888, + "grad_norm": 1.3744231462478638, + "learning_rate": 2.8286405959031656e-05, + "loss": 0.0843, + "step": 13557 + }, + { + "epoch": 37.87150837988827, + "grad_norm": 0.49118366837501526, + "learning_rate": 2.8286033519553073e-05, + "loss": 0.1005, + "step": 13558 + }, + { + "epoch": 37.874301675977655, + "grad_norm": 0.5433399081230164, + "learning_rate": 2.828566108007449e-05, + "loss": 0.0903, + "step": 13559 + }, + { + "epoch": 37.87709497206704, + "grad_norm": 0.4455433487892151, + "learning_rate": 2.8285288640595906e-05, + "loss": 0.0804, + "step": 13560 + }, + { + "epoch": 37.87988826815643, + "grad_norm": 0.52919602394104, + "learning_rate": 2.828491620111732e-05, + "loss": 0.087, + "step": 13561 + }, + { + "epoch": 37.88268156424581, + "grad_norm": 0.5937462449073792, + "learning_rate": 2.8284543761638735e-05, + "loss": 0.0739, + "step": 13562 + }, + { + "epoch": 37.885474860335194, + "grad_norm": 0.5406915545463562, + "learning_rate": 2.828417132216015e-05, + "loss": 0.0722, + "step": 13563 + }, + { + "epoch": 37.88826815642458, + "grad_norm": 0.6284484267234802, + "learning_rate": 2.8283798882681565e-05, + "loss": 0.0586, + "step": 13564 + }, + { + "epoch": 37.891061452513966, + "grad_norm": 1.0538344383239746, + "learning_rate": 2.828342644320298e-05, + "loss": 0.1021, + "step": 13565 + }, + { + "epoch": 37.89385474860335, + "grad_norm": 0.4975854754447937, + "learning_rate": 2.8283054003724395e-05, + "loss": 0.08, + "step": 13566 + }, + { + "epoch": 37.89664804469274, + "grad_norm": 0.5422156453132629, + "learning_rate": 2.828268156424581e-05, + "loss": 0.0784, + "step": 13567 + }, + { + "epoch": 37.899441340782126, + "grad_norm": 2.4349567890167236, + "learning_rate": 2.8282309124767228e-05, + "loss": 0.0901, + "step": 13568 + }, + { + "epoch": 37.90223463687151, + "grad_norm": 6.845480918884277, + "learning_rate": 2.8281936685288644e-05, + "loss": 0.0814, + "step": 13569 + }, + { + "epoch": 37.90502793296089, + "grad_norm": 1.056230902671814, + "learning_rate": 2.8281564245810054e-05, + "loss": 0.1114, + "step": 13570 + }, + { + "epoch": 37.90782122905028, + "grad_norm": 1.1407275199890137, + "learning_rate": 2.828119180633147e-05, + "loss": 0.1104, + "step": 13571 + }, + { + "epoch": 37.910614525139664, + "grad_norm": 1.0816618204116821, + "learning_rate": 2.8280819366852887e-05, + "loss": 0.1578, + "step": 13572 + }, + { + "epoch": 37.91340782122905, + "grad_norm": 0.6514893770217896, + "learning_rate": 2.8280446927374304e-05, + "loss": 0.1263, + "step": 13573 + }, + { + "epoch": 37.91620111731844, + "grad_norm": 0.3552793562412262, + "learning_rate": 2.8280074487895717e-05, + "loss": 0.1194, + "step": 13574 + }, + { + "epoch": 37.91899441340782, + "grad_norm": 0.46572446823120117, + "learning_rate": 2.8279702048417134e-05, + "loss": 0.1245, + "step": 13575 + }, + { + "epoch": 37.92178770949721, + "grad_norm": 0.40307238698005676, + "learning_rate": 2.8279329608938547e-05, + "loss": 0.1126, + "step": 13576 + }, + { + "epoch": 37.92458100558659, + "grad_norm": 0.931151270866394, + "learning_rate": 2.8278957169459963e-05, + "loss": 0.1095, + "step": 13577 + }, + { + "epoch": 37.927374301675975, + "grad_norm": 0.5486778616905212, + "learning_rate": 2.827858472998138e-05, + "loss": 0.1035, + "step": 13578 + }, + { + "epoch": 37.93016759776536, + "grad_norm": 0.2694908380508423, + "learning_rate": 2.8278212290502793e-05, + "loss": 0.0905, + "step": 13579 + }, + { + "epoch": 37.93296089385475, + "grad_norm": 0.449146568775177, + "learning_rate": 2.827783985102421e-05, + "loss": 0.1, + "step": 13580 + }, + { + "epoch": 37.935754189944134, + "grad_norm": 0.754401445388794, + "learning_rate": 2.8277467411545626e-05, + "loss": 0.1009, + "step": 13581 + }, + { + "epoch": 37.93854748603352, + "grad_norm": 0.639654815196991, + "learning_rate": 2.8277094972067042e-05, + "loss": 0.0942, + "step": 13582 + }, + { + "epoch": 37.94134078212291, + "grad_norm": 0.9236147403717041, + "learning_rate": 2.8276722532588452e-05, + "loss": 0.0889, + "step": 13583 + }, + { + "epoch": 37.944134078212294, + "grad_norm": 1.026513695716858, + "learning_rate": 2.827635009310987e-05, + "loss": 0.0894, + "step": 13584 + }, + { + "epoch": 37.94692737430167, + "grad_norm": 0.43466776609420776, + "learning_rate": 2.8275977653631285e-05, + "loss": 0.0806, + "step": 13585 + }, + { + "epoch": 37.94972067039106, + "grad_norm": 0.4979175925254822, + "learning_rate": 2.8275605214152702e-05, + "loss": 0.0805, + "step": 13586 + }, + { + "epoch": 37.952513966480446, + "grad_norm": 0.5099448561668396, + "learning_rate": 2.827523277467412e-05, + "loss": 0.0756, + "step": 13587 + }, + { + "epoch": 37.95530726256983, + "grad_norm": 0.818507969379425, + "learning_rate": 2.827486033519553e-05, + "loss": 0.0806, + "step": 13588 + }, + { + "epoch": 37.95810055865922, + "grad_norm": 0.5272806882858276, + "learning_rate": 2.8274487895716948e-05, + "loss": 0.0785, + "step": 13589 + }, + { + "epoch": 37.960893854748605, + "grad_norm": 0.7429022192955017, + "learning_rate": 2.827411545623836e-05, + "loss": 0.0851, + "step": 13590 + }, + { + "epoch": 37.96368715083799, + "grad_norm": 1.2446386814117432, + "learning_rate": 2.8273743016759778e-05, + "loss": 0.0571, + "step": 13591 + }, + { + "epoch": 37.96648044692738, + "grad_norm": 0.5699859857559204, + "learning_rate": 2.827337057728119e-05, + "loss": 0.0644, + "step": 13592 + }, + { + "epoch": 37.96927374301676, + "grad_norm": 0.9957475662231445, + "learning_rate": 2.8272998137802607e-05, + "loss": 0.0865, + "step": 13593 + }, + { + "epoch": 37.97206703910614, + "grad_norm": 0.5944711565971375, + "learning_rate": 2.8272625698324024e-05, + "loss": 0.0766, + "step": 13594 + }, + { + "epoch": 37.97486033519553, + "grad_norm": 0.7629641890525818, + "learning_rate": 2.827225325884544e-05, + "loss": 0.0716, + "step": 13595 + }, + { + "epoch": 37.977653631284916, + "grad_norm": 1.5692315101623535, + "learning_rate": 2.8271880819366854e-05, + "loss": 0.0926, + "step": 13596 + }, + { + "epoch": 37.9804469273743, + "grad_norm": 0.5939427018165588, + "learning_rate": 2.8271508379888267e-05, + "loss": 0.1423, + "step": 13597 + }, + { + "epoch": 37.98324022346369, + "grad_norm": 0.5194096565246582, + "learning_rate": 2.8271135940409683e-05, + "loss": 0.1203, + "step": 13598 + }, + { + "epoch": 37.986033519553075, + "grad_norm": 0.6489980220794678, + "learning_rate": 2.82707635009311e-05, + "loss": 0.099, + "step": 13599 + }, + { + "epoch": 37.98882681564246, + "grad_norm": 1.8672524690628052, + "learning_rate": 2.8270391061452516e-05, + "loss": 0.1208, + "step": 13600 + }, + { + "epoch": 37.99162011173184, + "grad_norm": 0.8116322755813599, + "learning_rate": 2.827001862197393e-05, + "loss": 0.0923, + "step": 13601 + }, + { + "epoch": 37.99441340782123, + "grad_norm": 0.7780147194862366, + "learning_rate": 2.8269646182495346e-05, + "loss": 0.0892, + "step": 13602 + }, + { + "epoch": 37.997206703910614, + "grad_norm": 1.1590795516967773, + "learning_rate": 2.826927374301676e-05, + "loss": 0.0803, + "step": 13603 + }, + { + "epoch": 38.0, + "grad_norm": 0.8958903551101685, + "learning_rate": 2.8268901303538176e-05, + "loss": 0.0854, + "step": 13604 + }, + { + "epoch": 38.002793296089386, + "grad_norm": 0.7198182344436646, + "learning_rate": 2.8268528864059592e-05, + "loss": 0.1597, + "step": 13605 + }, + { + "epoch": 38.00558659217877, + "grad_norm": 0.893574059009552, + "learning_rate": 2.8268156424581005e-05, + "loss": 0.1118, + "step": 13606 + }, + { + "epoch": 38.00837988826816, + "grad_norm": 0.37087881565093994, + "learning_rate": 2.8267783985102422e-05, + "loss": 0.1334, + "step": 13607 + }, + { + "epoch": 38.01117318435754, + "grad_norm": 0.3942200839519501, + "learning_rate": 2.826741154562384e-05, + "loss": 0.1025, + "step": 13608 + }, + { + "epoch": 38.013966480446925, + "grad_norm": 0.6616092920303345, + "learning_rate": 2.8267039106145255e-05, + "loss": 0.1067, + "step": 13609 + }, + { + "epoch": 38.01675977653631, + "grad_norm": 1.5576893091201782, + "learning_rate": 2.8266666666666665e-05, + "loss": 0.1003, + "step": 13610 + }, + { + "epoch": 38.0195530726257, + "grad_norm": 0.5423663258552551, + "learning_rate": 2.826629422718808e-05, + "loss": 0.0944, + "step": 13611 + }, + { + "epoch": 38.022346368715084, + "grad_norm": 0.7358835339546204, + "learning_rate": 2.8265921787709498e-05, + "loss": 0.1091, + "step": 13612 + }, + { + "epoch": 38.02513966480447, + "grad_norm": 1.841209053993225, + "learning_rate": 2.8265549348230914e-05, + "loss": 0.1033, + "step": 13613 + }, + { + "epoch": 38.02793296089386, + "grad_norm": 0.5449988842010498, + "learning_rate": 2.826517690875233e-05, + "loss": 0.1002, + "step": 13614 + }, + { + "epoch": 38.03072625698324, + "grad_norm": 1.0527795553207397, + "learning_rate": 2.8264804469273744e-05, + "loss": 0.0782, + "step": 13615 + }, + { + "epoch": 38.03351955307262, + "grad_norm": 0.8708153367042542, + "learning_rate": 2.8264432029795157e-05, + "loss": 0.0923, + "step": 13616 + }, + { + "epoch": 38.03631284916201, + "grad_norm": 0.5480820536613464, + "learning_rate": 2.8264059590316574e-05, + "loss": 0.0742, + "step": 13617 + }, + { + "epoch": 38.039106145251395, + "grad_norm": 0.9960919618606567, + "learning_rate": 2.826368715083799e-05, + "loss": 0.0832, + "step": 13618 + }, + { + "epoch": 38.04189944134078, + "grad_norm": 0.5023010969161987, + "learning_rate": 2.8263314711359403e-05, + "loss": 0.0854, + "step": 13619 + }, + { + "epoch": 38.04469273743017, + "grad_norm": 0.4977894127368927, + "learning_rate": 2.826294227188082e-05, + "loss": 0.08, + "step": 13620 + }, + { + "epoch": 38.047486033519554, + "grad_norm": 0.5609042048454285, + "learning_rate": 2.8262569832402237e-05, + "loss": 0.063, + "step": 13621 + }, + { + "epoch": 38.05027932960894, + "grad_norm": 1.672320008277893, + "learning_rate": 2.8262197392923653e-05, + "loss": 0.0659, + "step": 13622 + }, + { + "epoch": 38.05307262569833, + "grad_norm": 0.5528787970542908, + "learning_rate": 2.8261824953445066e-05, + "loss": 0.0887, + "step": 13623 + }, + { + "epoch": 38.055865921787706, + "grad_norm": 1.5771676301956177, + "learning_rate": 2.826145251396648e-05, + "loss": 0.0774, + "step": 13624 + }, + { + "epoch": 38.05865921787709, + "grad_norm": 0.7770093083381653, + "learning_rate": 2.8261080074487896e-05, + "loss": 0.0618, + "step": 13625 + }, + { + "epoch": 38.06145251396648, + "grad_norm": 0.7451122403144836, + "learning_rate": 2.8260707635009312e-05, + "loss": 0.0577, + "step": 13626 + }, + { + "epoch": 38.064245810055866, + "grad_norm": 0.6932182312011719, + "learning_rate": 2.826033519553073e-05, + "loss": 0.0737, + "step": 13627 + }, + { + "epoch": 38.06703910614525, + "grad_norm": 1.5139777660369873, + "learning_rate": 2.8259962756052142e-05, + "loss": 0.0694, + "step": 13628 + }, + { + "epoch": 38.06983240223464, + "grad_norm": 1.4735448360443115, + "learning_rate": 2.8259590316573555e-05, + "loss": 0.0989, + "step": 13629 + }, + { + "epoch": 38.072625698324025, + "grad_norm": 1.1085277795791626, + "learning_rate": 2.8259217877094972e-05, + "loss": 0.1209, + "step": 13630 + }, + { + "epoch": 38.07541899441341, + "grad_norm": 0.3986848294734955, + "learning_rate": 2.825884543761639e-05, + "loss": 0.1104, + "step": 13631 + }, + { + "epoch": 38.07821229050279, + "grad_norm": 0.44649389386177063, + "learning_rate": 2.82584729981378e-05, + "loss": 0.1067, + "step": 13632 + }, + { + "epoch": 38.08100558659218, + "grad_norm": 0.689566433429718, + "learning_rate": 2.8258100558659218e-05, + "loss": 0.1, + "step": 13633 + }, + { + "epoch": 38.08379888268156, + "grad_norm": 1.3156629800796509, + "learning_rate": 2.8257728119180635e-05, + "loss": 0.117, + "step": 13634 + }, + { + "epoch": 38.08659217877095, + "grad_norm": 0.5990111827850342, + "learning_rate": 2.825735567970205e-05, + "loss": 0.0831, + "step": 13635 + }, + { + "epoch": 38.089385474860336, + "grad_norm": 0.5798774361610413, + "learning_rate": 2.8256983240223464e-05, + "loss": 0.0903, + "step": 13636 + }, + { + "epoch": 38.09217877094972, + "grad_norm": 0.5133525729179382, + "learning_rate": 2.8256610800744877e-05, + "loss": 0.0883, + "step": 13637 + }, + { + "epoch": 38.09497206703911, + "grad_norm": 0.4218067526817322, + "learning_rate": 2.8256238361266294e-05, + "loss": 0.0781, + "step": 13638 + }, + { + "epoch": 38.097765363128495, + "grad_norm": 0.4693346917629242, + "learning_rate": 2.825586592178771e-05, + "loss": 0.0987, + "step": 13639 + }, + { + "epoch": 38.100558659217874, + "grad_norm": 0.5241656303405762, + "learning_rate": 2.8255493482309127e-05, + "loss": 0.0797, + "step": 13640 + }, + { + "epoch": 38.10335195530726, + "grad_norm": 1.5795722007751465, + "learning_rate": 2.825512104283054e-05, + "loss": 0.0955, + "step": 13641 + }, + { + "epoch": 38.10614525139665, + "grad_norm": 1.8565741777420044, + "learning_rate": 2.8254748603351957e-05, + "loss": 0.0908, + "step": 13642 + }, + { + "epoch": 38.108938547486034, + "grad_norm": 0.6266420483589172, + "learning_rate": 2.825437616387337e-05, + "loss": 0.0917, + "step": 13643 + }, + { + "epoch": 38.11173184357542, + "grad_norm": 0.49125924706459045, + "learning_rate": 2.8254003724394786e-05, + "loss": 0.0922, + "step": 13644 + }, + { + "epoch": 38.114525139664806, + "grad_norm": 1.1133333444595337, + "learning_rate": 2.8253631284916203e-05, + "loss": 0.0682, + "step": 13645 + }, + { + "epoch": 38.11731843575419, + "grad_norm": 0.5136995911598206, + "learning_rate": 2.8253258845437616e-05, + "loss": 0.0777, + "step": 13646 + }, + { + "epoch": 38.12011173184357, + "grad_norm": 0.6037970185279846, + "learning_rate": 2.8252886405959033e-05, + "loss": 0.0809, + "step": 13647 + }, + { + "epoch": 38.12290502793296, + "grad_norm": 0.4830288290977478, + "learning_rate": 2.825251396648045e-05, + "loss": 0.0784, + "step": 13648 + }, + { + "epoch": 38.125698324022345, + "grad_norm": 2.425668478012085, + "learning_rate": 2.8252141527001862e-05, + "loss": 0.0558, + "step": 13649 + }, + { + "epoch": 38.12849162011173, + "grad_norm": 0.6688857674598694, + "learning_rate": 2.8251769087523275e-05, + "loss": 0.078, + "step": 13650 + }, + { + "epoch": 38.13128491620112, + "grad_norm": 1.0375055074691772, + "learning_rate": 2.8251396648044692e-05, + "loss": 0.0747, + "step": 13651 + }, + { + "epoch": 38.134078212290504, + "grad_norm": 0.7346288561820984, + "learning_rate": 2.825102420856611e-05, + "loss": 0.0718, + "step": 13652 + }, + { + "epoch": 38.13687150837989, + "grad_norm": 0.8996039628982544, + "learning_rate": 2.8250651769087525e-05, + "loss": 0.0792, + "step": 13653 + }, + { + "epoch": 38.13966480446928, + "grad_norm": 0.8932623863220215, + "learning_rate": 2.825027932960894e-05, + "loss": 0.0857, + "step": 13654 + }, + { + "epoch": 38.142458100558656, + "grad_norm": 0.378711462020874, + "learning_rate": 2.8249906890130355e-05, + "loss": 0.1138, + "step": 13655 + }, + { + "epoch": 38.14525139664804, + "grad_norm": 0.49095889925956726, + "learning_rate": 2.8249534450651768e-05, + "loss": 0.1177, + "step": 13656 + }, + { + "epoch": 38.14804469273743, + "grad_norm": 0.3813093304634094, + "learning_rate": 2.8249162011173184e-05, + "loss": 0.1115, + "step": 13657 + }, + { + "epoch": 38.150837988826815, + "grad_norm": 1.2772905826568604, + "learning_rate": 2.82487895716946e-05, + "loss": 0.1208, + "step": 13658 + }, + { + "epoch": 38.1536312849162, + "grad_norm": 0.4416792392730713, + "learning_rate": 2.8248417132216014e-05, + "loss": 0.0938, + "step": 13659 + }, + { + "epoch": 38.15642458100559, + "grad_norm": 0.6753383278846741, + "learning_rate": 2.824804469273743e-05, + "loss": 0.105, + "step": 13660 + }, + { + "epoch": 38.159217877094974, + "grad_norm": 0.6458865404129028, + "learning_rate": 2.8247672253258847e-05, + "loss": 0.1056, + "step": 13661 + }, + { + "epoch": 38.16201117318436, + "grad_norm": 0.559935986995697, + "learning_rate": 2.8247299813780264e-05, + "loss": 0.101, + "step": 13662 + }, + { + "epoch": 38.16480446927374, + "grad_norm": 0.5054547786712646, + "learning_rate": 2.8246927374301677e-05, + "loss": 0.0761, + "step": 13663 + }, + { + "epoch": 38.167597765363126, + "grad_norm": 0.45961886644363403, + "learning_rate": 2.824655493482309e-05, + "loss": 0.0965, + "step": 13664 + }, + { + "epoch": 38.17039106145251, + "grad_norm": 0.4697854518890381, + "learning_rate": 2.8246182495344507e-05, + "loss": 0.0898, + "step": 13665 + }, + { + "epoch": 38.1731843575419, + "grad_norm": 0.577907145023346, + "learning_rate": 2.8245810055865923e-05, + "loss": 0.0934, + "step": 13666 + }, + { + "epoch": 38.175977653631286, + "grad_norm": 0.4840467870235443, + "learning_rate": 2.824543761638734e-05, + "loss": 0.0776, + "step": 13667 + }, + { + "epoch": 38.17877094972067, + "grad_norm": 0.8989623785018921, + "learning_rate": 2.8245065176908753e-05, + "loss": 0.0881, + "step": 13668 + }, + { + "epoch": 38.18156424581006, + "grad_norm": 0.5477580428123474, + "learning_rate": 2.8244692737430166e-05, + "loss": 0.0836, + "step": 13669 + }, + { + "epoch": 38.184357541899445, + "grad_norm": 0.7397028803825378, + "learning_rate": 2.8244320297951582e-05, + "loss": 0.068, + "step": 13670 + }, + { + "epoch": 38.187150837988824, + "grad_norm": 0.5579710602760315, + "learning_rate": 2.8243947858473e-05, + "loss": 0.0811, + "step": 13671 + }, + { + "epoch": 38.18994413407821, + "grad_norm": 0.40642082691192627, + "learning_rate": 2.8243575418994415e-05, + "loss": 0.0666, + "step": 13672 + }, + { + "epoch": 38.1927374301676, + "grad_norm": 0.541372537612915, + "learning_rate": 2.824320297951583e-05, + "loss": 0.062, + "step": 13673 + }, + { + "epoch": 38.19553072625698, + "grad_norm": 0.4557761251926422, + "learning_rate": 2.8242830540037245e-05, + "loss": 0.0749, + "step": 13674 + }, + { + "epoch": 38.19832402234637, + "grad_norm": 2.782489538192749, + "learning_rate": 2.8242458100558662e-05, + "loss": 0.0729, + "step": 13675 + }, + { + "epoch": 38.201117318435756, + "grad_norm": 0.8448912501335144, + "learning_rate": 2.8242085661080075e-05, + "loss": 0.075, + "step": 13676 + }, + { + "epoch": 38.20391061452514, + "grad_norm": 0.615928053855896, + "learning_rate": 2.8241713221601488e-05, + "loss": 0.057, + "step": 13677 + }, + { + "epoch": 38.20670391061452, + "grad_norm": 0.7912046313285828, + "learning_rate": 2.8241340782122905e-05, + "loss": 0.0759, + "step": 13678 + }, + { + "epoch": 38.20949720670391, + "grad_norm": 1.0558451414108276, + "learning_rate": 2.824096834264432e-05, + "loss": 0.0853, + "step": 13679 + }, + { + "epoch": 38.212290502793294, + "grad_norm": 0.37920260429382324, + "learning_rate": 2.8240595903165738e-05, + "loss": 0.1139, + "step": 13680 + }, + { + "epoch": 38.21508379888268, + "grad_norm": 0.694470226764679, + "learning_rate": 2.8240223463687154e-05, + "loss": 0.1109, + "step": 13681 + }, + { + "epoch": 38.21787709497207, + "grad_norm": 0.4372653663158417, + "learning_rate": 2.8239851024208567e-05, + "loss": 0.1277, + "step": 13682 + }, + { + "epoch": 38.220670391061454, + "grad_norm": 0.5218306183815002, + "learning_rate": 2.823947858472998e-05, + "loss": 0.1122, + "step": 13683 + }, + { + "epoch": 38.22346368715084, + "grad_norm": 0.39648476243019104, + "learning_rate": 2.8239106145251397e-05, + "loss": 0.1024, + "step": 13684 + }, + { + "epoch": 38.226256983240226, + "grad_norm": 0.49528464674949646, + "learning_rate": 2.8238733705772814e-05, + "loss": 0.107, + "step": 13685 + }, + { + "epoch": 38.229050279329606, + "grad_norm": 0.4020237326622009, + "learning_rate": 2.8238361266294227e-05, + "loss": 0.09, + "step": 13686 + }, + { + "epoch": 38.23184357541899, + "grad_norm": 0.9500442743301392, + "learning_rate": 2.8237988826815643e-05, + "loss": 0.0947, + "step": 13687 + }, + { + "epoch": 38.23463687150838, + "grad_norm": 1.2254416942596436, + "learning_rate": 2.823761638733706e-05, + "loss": 0.1017, + "step": 13688 + }, + { + "epoch": 38.237430167597765, + "grad_norm": 0.573433518409729, + "learning_rate": 2.8237243947858473e-05, + "loss": 0.0968, + "step": 13689 + }, + { + "epoch": 38.24022346368715, + "grad_norm": 0.4361385405063629, + "learning_rate": 2.823687150837989e-05, + "loss": 0.0667, + "step": 13690 + }, + { + "epoch": 38.24301675977654, + "grad_norm": 0.9446371793746948, + "learning_rate": 2.8236499068901303e-05, + "loss": 0.0832, + "step": 13691 + }, + { + "epoch": 38.245810055865924, + "grad_norm": 0.9464927911758423, + "learning_rate": 2.823612662942272e-05, + "loss": 0.0936, + "step": 13692 + }, + { + "epoch": 38.24860335195531, + "grad_norm": 0.7366499900817871, + "learning_rate": 2.8235754189944136e-05, + "loss": 0.0939, + "step": 13693 + }, + { + "epoch": 38.25139664804469, + "grad_norm": 0.6689229011535645, + "learning_rate": 2.8235381750465552e-05, + "loss": 0.0875, + "step": 13694 + }, + { + "epoch": 38.254189944134076, + "grad_norm": 0.6350197792053223, + "learning_rate": 2.8235009310986965e-05, + "loss": 0.0712, + "step": 13695 + }, + { + "epoch": 38.25698324022346, + "grad_norm": 0.38538724184036255, + "learning_rate": 2.823463687150838e-05, + "loss": 0.0752, + "step": 13696 + }, + { + "epoch": 38.25977653631285, + "grad_norm": 1.7887818813323975, + "learning_rate": 2.8234264432029795e-05, + "loss": 0.0693, + "step": 13697 + }, + { + "epoch": 38.262569832402235, + "grad_norm": 1.0628087520599365, + "learning_rate": 2.823389199255121e-05, + "loss": 0.0703, + "step": 13698 + }, + { + "epoch": 38.26536312849162, + "grad_norm": 0.6115090847015381, + "learning_rate": 2.8233519553072628e-05, + "loss": 0.0882, + "step": 13699 + }, + { + "epoch": 38.26815642458101, + "grad_norm": 0.5248382091522217, + "learning_rate": 2.823314711359404e-05, + "loss": 0.0687, + "step": 13700 + }, + { + "epoch": 38.270949720670394, + "grad_norm": 0.67684406042099, + "learning_rate": 2.8232774674115458e-05, + "loss": 0.0637, + "step": 13701 + }, + { + "epoch": 38.273743016759774, + "grad_norm": 1.2188458442687988, + "learning_rate": 2.8232402234636874e-05, + "loss": 0.066, + "step": 13702 + }, + { + "epoch": 38.27653631284916, + "grad_norm": 0.8558314442634583, + "learning_rate": 2.8232029795158287e-05, + "loss": 0.0658, + "step": 13703 + }, + { + "epoch": 38.279329608938546, + "grad_norm": 0.9805386066436768, + "learning_rate": 2.82316573556797e-05, + "loss": 0.0978, + "step": 13704 + }, + { + "epoch": 38.28212290502793, + "grad_norm": 0.578081488609314, + "learning_rate": 2.8231284916201117e-05, + "loss": 0.1263, + "step": 13705 + }, + { + "epoch": 38.28491620111732, + "grad_norm": 0.49524828791618347, + "learning_rate": 2.8230912476722534e-05, + "loss": 0.0998, + "step": 13706 + }, + { + "epoch": 38.287709497206706, + "grad_norm": 0.8349334001541138, + "learning_rate": 2.823054003724395e-05, + "loss": 0.1322, + "step": 13707 + }, + { + "epoch": 38.29050279329609, + "grad_norm": 0.9149228930473328, + "learning_rate": 2.8230167597765367e-05, + "loss": 0.1065, + "step": 13708 + }, + { + "epoch": 38.29329608938548, + "grad_norm": 1.0362051725387573, + "learning_rate": 2.8229795158286776e-05, + "loss": 0.1195, + "step": 13709 + }, + { + "epoch": 38.29608938547486, + "grad_norm": 0.8216106295585632, + "learning_rate": 2.8229422718808193e-05, + "loss": 0.1023, + "step": 13710 + }, + { + "epoch": 38.298882681564244, + "grad_norm": 0.5101133584976196, + "learning_rate": 2.822905027932961e-05, + "loss": 0.1119, + "step": 13711 + }, + { + "epoch": 38.30167597765363, + "grad_norm": 1.2717500925064087, + "learning_rate": 2.8228677839851026e-05, + "loss": 0.1024, + "step": 13712 + }, + { + "epoch": 38.30446927374302, + "grad_norm": 0.4609566032886505, + "learning_rate": 2.822830540037244e-05, + "loss": 0.0885, + "step": 13713 + }, + { + "epoch": 38.3072625698324, + "grad_norm": 0.8391607999801636, + "learning_rate": 2.8227932960893856e-05, + "loss": 0.0964, + "step": 13714 + }, + { + "epoch": 38.31005586592179, + "grad_norm": 1.4443435668945312, + "learning_rate": 2.8227560521415272e-05, + "loss": 0.081, + "step": 13715 + }, + { + "epoch": 38.312849162011176, + "grad_norm": 0.8448764085769653, + "learning_rate": 2.8227188081936685e-05, + "loss": 0.1022, + "step": 13716 + }, + { + "epoch": 38.315642458100555, + "grad_norm": 0.4799998104572296, + "learning_rate": 2.8226815642458102e-05, + "loss": 0.0798, + "step": 13717 + }, + { + "epoch": 38.31843575418994, + "grad_norm": 0.45563554763793945, + "learning_rate": 2.8226443202979515e-05, + "loss": 0.0934, + "step": 13718 + }, + { + "epoch": 38.32122905027933, + "grad_norm": 0.5778693556785583, + "learning_rate": 2.822607076350093e-05, + "loss": 0.0816, + "step": 13719 + }, + { + "epoch": 38.324022346368714, + "grad_norm": 0.4454435110092163, + "learning_rate": 2.8225698324022348e-05, + "loss": 0.0824, + "step": 13720 + }, + { + "epoch": 38.3268156424581, + "grad_norm": 0.5092951655387878, + "learning_rate": 2.8225325884543765e-05, + "loss": 0.0774, + "step": 13721 + }, + { + "epoch": 38.32960893854749, + "grad_norm": 2.418661117553711, + "learning_rate": 2.8224953445065178e-05, + "loss": 0.0668, + "step": 13722 + }, + { + "epoch": 38.332402234636874, + "grad_norm": 0.5884087681770325, + "learning_rate": 2.822458100558659e-05, + "loss": 0.0876, + "step": 13723 + }, + { + "epoch": 38.33519553072626, + "grad_norm": 0.46387264132499695, + "learning_rate": 2.8224208566108008e-05, + "loss": 0.0781, + "step": 13724 + }, + { + "epoch": 38.33798882681564, + "grad_norm": 0.5995131134986877, + "learning_rate": 2.8223836126629424e-05, + "loss": 0.0644, + "step": 13725 + }, + { + "epoch": 38.340782122905026, + "grad_norm": 0.6904237866401672, + "learning_rate": 2.8223463687150837e-05, + "loss": 0.0881, + "step": 13726 + }, + { + "epoch": 38.34357541899441, + "grad_norm": 1.140262484550476, + "learning_rate": 2.8223091247672254e-05, + "loss": 0.0847, + "step": 13727 + }, + { + "epoch": 38.3463687150838, + "grad_norm": 0.8256446123123169, + "learning_rate": 2.822271880819367e-05, + "loss": 0.0815, + "step": 13728 + }, + { + "epoch": 38.349162011173185, + "grad_norm": 1.4070671796798706, + "learning_rate": 2.8222346368715083e-05, + "loss": 0.092, + "step": 13729 + }, + { + "epoch": 38.35195530726257, + "grad_norm": 0.3839598298072815, + "learning_rate": 2.82219739292365e-05, + "loss": 0.1188, + "step": 13730 + }, + { + "epoch": 38.35474860335196, + "grad_norm": 0.4576537609100342, + "learning_rate": 2.8221601489757913e-05, + "loss": 0.1068, + "step": 13731 + }, + { + "epoch": 38.357541899441344, + "grad_norm": 0.454610675573349, + "learning_rate": 2.822122905027933e-05, + "loss": 0.117, + "step": 13732 + }, + { + "epoch": 38.36033519553072, + "grad_norm": 0.3502713143825531, + "learning_rate": 2.8220856610800746e-05, + "loss": 0.1086, + "step": 13733 + }, + { + "epoch": 38.36312849162011, + "grad_norm": 0.4660640060901642, + "learning_rate": 2.8220484171322163e-05, + "loss": 0.101, + "step": 13734 + }, + { + "epoch": 38.365921787709496, + "grad_norm": 0.5614672303199768, + "learning_rate": 2.8220111731843576e-05, + "loss": 0.1131, + "step": 13735 + }, + { + "epoch": 38.36871508379888, + "grad_norm": 0.5471348166465759, + "learning_rate": 2.821973929236499e-05, + "loss": 0.1126, + "step": 13736 + }, + { + "epoch": 38.37150837988827, + "grad_norm": 0.4048161208629608, + "learning_rate": 2.8219366852886406e-05, + "loss": 0.1018, + "step": 13737 + }, + { + "epoch": 38.374301675977655, + "grad_norm": 0.4353809952735901, + "learning_rate": 2.8218994413407822e-05, + "loss": 0.0969, + "step": 13738 + }, + { + "epoch": 38.37709497206704, + "grad_norm": 0.6568073034286499, + "learning_rate": 2.821862197392924e-05, + "loss": 0.0982, + "step": 13739 + }, + { + "epoch": 38.37988826815643, + "grad_norm": 0.6703651547431946, + "learning_rate": 2.8218249534450652e-05, + "loss": 0.0996, + "step": 13740 + }, + { + "epoch": 38.38268156424581, + "grad_norm": 0.5202404260635376, + "learning_rate": 2.821787709497207e-05, + "loss": 0.077, + "step": 13741 + }, + { + "epoch": 38.385474860335194, + "grad_norm": 1.2995223999023438, + "learning_rate": 2.8217504655493485e-05, + "loss": 0.0766, + "step": 13742 + }, + { + "epoch": 38.38826815642458, + "grad_norm": 0.3537974953651428, + "learning_rate": 2.8217132216014898e-05, + "loss": 0.0695, + "step": 13743 + }, + { + "epoch": 38.391061452513966, + "grad_norm": 0.6345534324645996, + "learning_rate": 2.821675977653631e-05, + "loss": 0.0658, + "step": 13744 + }, + { + "epoch": 38.39385474860335, + "grad_norm": 0.5241303443908691, + "learning_rate": 2.8216387337057728e-05, + "loss": 0.0929, + "step": 13745 + }, + { + "epoch": 38.39664804469274, + "grad_norm": 0.687155544757843, + "learning_rate": 2.8216014897579144e-05, + "loss": 0.0966, + "step": 13746 + }, + { + "epoch": 38.399441340782126, + "grad_norm": 0.9251924753189087, + "learning_rate": 2.821564245810056e-05, + "loss": 0.0783, + "step": 13747 + }, + { + "epoch": 38.40223463687151, + "grad_norm": 0.5184628963470459, + "learning_rate": 2.8215270018621977e-05, + "loss": 0.0647, + "step": 13748 + }, + { + "epoch": 38.40502793296089, + "grad_norm": 0.8536320328712463, + "learning_rate": 2.8214897579143387e-05, + "loss": 0.0717, + "step": 13749 + }, + { + "epoch": 38.40782122905028, + "grad_norm": 0.48444679379463196, + "learning_rate": 2.8214525139664804e-05, + "loss": 0.0751, + "step": 13750 + }, + { + "epoch": 38.410614525139664, + "grad_norm": 0.9786247611045837, + "learning_rate": 2.821415270018622e-05, + "loss": 0.0753, + "step": 13751 + }, + { + "epoch": 38.41340782122905, + "grad_norm": 0.5685248374938965, + "learning_rate": 2.8213780260707637e-05, + "loss": 0.0634, + "step": 13752 + }, + { + "epoch": 38.41620111731844, + "grad_norm": 0.9955393671989441, + "learning_rate": 2.821340782122905e-05, + "loss": 0.0654, + "step": 13753 + }, + { + "epoch": 38.41899441340782, + "grad_norm": 0.8188115954399109, + "learning_rate": 2.8213035381750466e-05, + "loss": 0.0809, + "step": 13754 + }, + { + "epoch": 38.42178770949721, + "grad_norm": 0.38793066143989563, + "learning_rate": 2.8212662942271883e-05, + "loss": 0.1298, + "step": 13755 + }, + { + "epoch": 38.42458100558659, + "grad_norm": 0.7562322020530701, + "learning_rate": 2.8212290502793296e-05, + "loss": 0.1211, + "step": 13756 + }, + { + "epoch": 38.427374301675975, + "grad_norm": 0.9052409529685974, + "learning_rate": 2.8211918063314713e-05, + "loss": 0.0966, + "step": 13757 + }, + { + "epoch": 38.43016759776536, + "grad_norm": 0.6667492389678955, + "learning_rate": 2.8211545623836126e-05, + "loss": 0.1246, + "step": 13758 + }, + { + "epoch": 38.43296089385475, + "grad_norm": 0.618563711643219, + "learning_rate": 2.8211173184357542e-05, + "loss": 0.1239, + "step": 13759 + }, + { + "epoch": 38.435754189944134, + "grad_norm": 0.48272815346717834, + "learning_rate": 2.821080074487896e-05, + "loss": 0.0999, + "step": 13760 + }, + { + "epoch": 38.43854748603352, + "grad_norm": 0.4821487367153168, + "learning_rate": 2.8210428305400375e-05, + "loss": 0.1096, + "step": 13761 + }, + { + "epoch": 38.44134078212291, + "grad_norm": 0.41666385531425476, + "learning_rate": 2.821005586592179e-05, + "loss": 0.0921, + "step": 13762 + }, + { + "epoch": 38.444134078212294, + "grad_norm": 0.5778366327285767, + "learning_rate": 2.82096834264432e-05, + "loss": 0.1046, + "step": 13763 + }, + { + "epoch": 38.44692737430167, + "grad_norm": 0.4795728027820587, + "learning_rate": 2.8209310986964618e-05, + "loss": 0.0929, + "step": 13764 + }, + { + "epoch": 38.44972067039106, + "grad_norm": 0.627427875995636, + "learning_rate": 2.8208938547486035e-05, + "loss": 0.0938, + "step": 13765 + }, + { + "epoch": 38.452513966480446, + "grad_norm": 1.396582841873169, + "learning_rate": 2.820856610800745e-05, + "loss": 0.0865, + "step": 13766 + }, + { + "epoch": 38.45530726256983, + "grad_norm": 0.6990799307823181, + "learning_rate": 2.8208193668528864e-05, + "loss": 0.0727, + "step": 13767 + }, + { + "epoch": 38.45810055865922, + "grad_norm": 0.41566789150238037, + "learning_rate": 2.820782122905028e-05, + "loss": 0.0746, + "step": 13768 + }, + { + "epoch": 38.460893854748605, + "grad_norm": 1.025338888168335, + "learning_rate": 2.8207448789571694e-05, + "loss": 0.0679, + "step": 13769 + }, + { + "epoch": 38.46368715083799, + "grad_norm": 0.9662831425666809, + "learning_rate": 2.820707635009311e-05, + "loss": 0.084, + "step": 13770 + }, + { + "epoch": 38.46648044692738, + "grad_norm": 0.534064769744873, + "learning_rate": 2.8206703910614524e-05, + "loss": 0.067, + "step": 13771 + }, + { + "epoch": 38.46927374301676, + "grad_norm": 0.6542330384254456, + "learning_rate": 2.820633147113594e-05, + "loss": 0.0653, + "step": 13772 + }, + { + "epoch": 38.47206703910614, + "grad_norm": 0.7994427680969238, + "learning_rate": 2.8205959031657357e-05, + "loss": 0.0705, + "step": 13773 + }, + { + "epoch": 38.47486033519553, + "grad_norm": 0.8987044095993042, + "learning_rate": 2.8205586592178773e-05, + "loss": 0.0683, + "step": 13774 + }, + { + "epoch": 38.477653631284916, + "grad_norm": 1.035936951637268, + "learning_rate": 2.820521415270019e-05, + "loss": 0.0574, + "step": 13775 + }, + { + "epoch": 38.4804469273743, + "grad_norm": 0.48982343077659607, + "learning_rate": 2.82048417132216e-05, + "loss": 0.0704, + "step": 13776 + }, + { + "epoch": 38.48324022346369, + "grad_norm": 1.3583160638809204, + "learning_rate": 2.8204469273743016e-05, + "loss": 0.0734, + "step": 13777 + }, + { + "epoch": 38.486033519553075, + "grad_norm": 2.6069440841674805, + "learning_rate": 2.8204096834264433e-05, + "loss": 0.0759, + "step": 13778 + }, + { + "epoch": 38.48882681564246, + "grad_norm": 0.9855948686599731, + "learning_rate": 2.820372439478585e-05, + "loss": 0.0954, + "step": 13779 + }, + { + "epoch": 38.49162011173184, + "grad_norm": 0.4216324985027313, + "learning_rate": 2.8203351955307262e-05, + "loss": 0.1209, + "step": 13780 + }, + { + "epoch": 38.49441340782123, + "grad_norm": 0.5342094898223877, + "learning_rate": 2.820297951582868e-05, + "loss": 0.1108, + "step": 13781 + }, + { + "epoch": 38.497206703910614, + "grad_norm": 0.45024871826171875, + "learning_rate": 2.8202607076350095e-05, + "loss": 0.109, + "step": 13782 + }, + { + "epoch": 38.5, + "grad_norm": 0.807569682598114, + "learning_rate": 2.820223463687151e-05, + "loss": 0.1157, + "step": 13783 + }, + { + "epoch": 38.502793296089386, + "grad_norm": 0.4081205129623413, + "learning_rate": 2.8201862197392925e-05, + "loss": 0.1132, + "step": 13784 + }, + { + "epoch": 38.50558659217877, + "grad_norm": 0.5807958245277405, + "learning_rate": 2.8201489757914338e-05, + "loss": 0.0973, + "step": 13785 + }, + { + "epoch": 38.50837988826816, + "grad_norm": 0.4793674647808075, + "learning_rate": 2.8201117318435755e-05, + "loss": 0.0997, + "step": 13786 + }, + { + "epoch": 38.51117318435754, + "grad_norm": 0.5050674676895142, + "learning_rate": 2.820074487895717e-05, + "loss": 0.0974, + "step": 13787 + }, + { + "epoch": 38.513966480446925, + "grad_norm": 0.39423036575317383, + "learning_rate": 2.8200372439478588e-05, + "loss": 0.0904, + "step": 13788 + }, + { + "epoch": 38.51675977653631, + "grad_norm": 1.1291871070861816, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.104, + "step": 13789 + }, + { + "epoch": 38.5195530726257, + "grad_norm": 0.47780680656433105, + "learning_rate": 2.8199627560521414e-05, + "loss": 0.0994, + "step": 13790 + }, + { + "epoch": 38.522346368715084, + "grad_norm": 1.103088140487671, + "learning_rate": 2.819925512104283e-05, + "loss": 0.0869, + "step": 13791 + }, + { + "epoch": 38.52513966480447, + "grad_norm": 0.7918819189071655, + "learning_rate": 2.8198882681564247e-05, + "loss": 0.0948, + "step": 13792 + }, + { + "epoch": 38.52793296089386, + "grad_norm": 1.0708072185516357, + "learning_rate": 2.8198510242085664e-05, + "loss": 0.0765, + "step": 13793 + }, + { + "epoch": 38.53072625698324, + "grad_norm": 0.7510428428649902, + "learning_rate": 2.8198137802607077e-05, + "loss": 0.0889, + "step": 13794 + }, + { + "epoch": 38.53351955307262, + "grad_norm": 0.503798246383667, + "learning_rate": 2.8197765363128493e-05, + "loss": 0.0804, + "step": 13795 + }, + { + "epoch": 38.53631284916201, + "grad_norm": 0.4742206633090973, + "learning_rate": 2.8197392923649907e-05, + "loss": 0.081, + "step": 13796 + }, + { + "epoch": 38.539106145251395, + "grad_norm": 1.0509443283081055, + "learning_rate": 2.8197020484171323e-05, + "loss": 0.076, + "step": 13797 + }, + { + "epoch": 38.54189944134078, + "grad_norm": 1.1594491004943848, + "learning_rate": 2.8196648044692736e-05, + "loss": 0.0838, + "step": 13798 + }, + { + "epoch": 38.54469273743017, + "grad_norm": 0.6252346634864807, + "learning_rate": 2.8196275605214153e-05, + "loss": 0.0907, + "step": 13799 + }, + { + "epoch": 38.547486033519554, + "grad_norm": 8.080602645874023, + "learning_rate": 2.819590316573557e-05, + "loss": 0.0713, + "step": 13800 + }, + { + "epoch": 38.55027932960894, + "grad_norm": 0.5671552419662476, + "learning_rate": 2.8195530726256986e-05, + "loss": 0.0737, + "step": 13801 + }, + { + "epoch": 38.55307262569833, + "grad_norm": 0.8001420497894287, + "learning_rate": 2.8195158286778402e-05, + "loss": 0.0651, + "step": 13802 + }, + { + "epoch": 38.555865921787706, + "grad_norm": 0.5933485627174377, + "learning_rate": 2.8194785847299812e-05, + "loss": 0.0728, + "step": 13803 + }, + { + "epoch": 38.55865921787709, + "grad_norm": 1.5367540121078491, + "learning_rate": 2.819441340782123e-05, + "loss": 0.0816, + "step": 13804 + }, + { + "epoch": 38.56145251396648, + "grad_norm": 0.5064272880554199, + "learning_rate": 2.8194040968342645e-05, + "loss": 0.1266, + "step": 13805 + }, + { + "epoch": 38.564245810055866, + "grad_norm": 0.4853374660015106, + "learning_rate": 2.8193668528864062e-05, + "loss": 0.134, + "step": 13806 + }, + { + "epoch": 38.56703910614525, + "grad_norm": 0.6735977530479431, + "learning_rate": 2.8193296089385475e-05, + "loss": 0.1075, + "step": 13807 + }, + { + "epoch": 38.56983240223464, + "grad_norm": 0.6326329708099365, + "learning_rate": 2.819292364990689e-05, + "loss": 0.1032, + "step": 13808 + }, + { + "epoch": 38.572625698324025, + "grad_norm": 0.4763261675834656, + "learning_rate": 2.8192551210428305e-05, + "loss": 0.1095, + "step": 13809 + }, + { + "epoch": 38.57541899441341, + "grad_norm": 0.5325655341148376, + "learning_rate": 2.819217877094972e-05, + "loss": 0.0995, + "step": 13810 + }, + { + "epoch": 38.57821229050279, + "grad_norm": 0.8144444823265076, + "learning_rate": 2.8191806331471138e-05, + "loss": 0.1076, + "step": 13811 + }, + { + "epoch": 38.58100558659218, + "grad_norm": 1.1911715269088745, + "learning_rate": 2.819143389199255e-05, + "loss": 0.1014, + "step": 13812 + }, + { + "epoch": 38.58379888268156, + "grad_norm": 0.39264824986457825, + "learning_rate": 2.8191061452513967e-05, + "loss": 0.0828, + "step": 13813 + }, + { + "epoch": 38.58659217877095, + "grad_norm": 0.7393624782562256, + "learning_rate": 2.8190689013035384e-05, + "loss": 0.1007, + "step": 13814 + }, + { + "epoch": 38.589385474860336, + "grad_norm": 0.6280959248542786, + "learning_rate": 2.81903165735568e-05, + "loss": 0.0924, + "step": 13815 + }, + { + "epoch": 38.59217877094972, + "grad_norm": 0.6185868978500366, + "learning_rate": 2.818994413407821e-05, + "loss": 0.0936, + "step": 13816 + }, + { + "epoch": 38.59497206703911, + "grad_norm": 0.678554356098175, + "learning_rate": 2.8189571694599627e-05, + "loss": 0.0812, + "step": 13817 + }, + { + "epoch": 38.59776536312849, + "grad_norm": 0.6086649894714355, + "learning_rate": 2.8189199255121043e-05, + "loss": 0.0808, + "step": 13818 + }, + { + "epoch": 38.600558659217874, + "grad_norm": 0.5977902412414551, + "learning_rate": 2.818882681564246e-05, + "loss": 0.0703, + "step": 13819 + }, + { + "epoch": 38.60335195530726, + "grad_norm": 0.729066014289856, + "learning_rate": 2.8188454376163873e-05, + "loss": 0.0817, + "step": 13820 + }, + { + "epoch": 38.60614525139665, + "grad_norm": 0.4781682789325714, + "learning_rate": 2.818808193668529e-05, + "loss": 0.0712, + "step": 13821 + }, + { + "epoch": 38.608938547486034, + "grad_norm": 0.64860999584198, + "learning_rate": 2.8187709497206706e-05, + "loss": 0.0797, + "step": 13822 + }, + { + "epoch": 38.61173184357542, + "grad_norm": 1.1487290859222412, + "learning_rate": 2.818733705772812e-05, + "loss": 0.0681, + "step": 13823 + }, + { + "epoch": 38.614525139664806, + "grad_norm": 0.7879559993743896, + "learning_rate": 2.8186964618249536e-05, + "loss": 0.0826, + "step": 13824 + }, + { + "epoch": 38.61731843575419, + "grad_norm": 0.5300081372261047, + "learning_rate": 2.818659217877095e-05, + "loss": 0.0761, + "step": 13825 + }, + { + "epoch": 38.62011173184357, + "grad_norm": 0.8959120512008667, + "learning_rate": 2.8186219739292365e-05, + "loss": 0.0746, + "step": 13826 + }, + { + "epoch": 38.62290502793296, + "grad_norm": 1.5305004119873047, + "learning_rate": 2.8185847299813782e-05, + "loss": 0.0786, + "step": 13827 + }, + { + "epoch": 38.625698324022345, + "grad_norm": 0.9396858215332031, + "learning_rate": 2.81854748603352e-05, + "loss": 0.0861, + "step": 13828 + }, + { + "epoch": 38.62849162011173, + "grad_norm": 6.650175094604492, + "learning_rate": 2.8185102420856608e-05, + "loss": 0.0934, + "step": 13829 + }, + { + "epoch": 38.63128491620112, + "grad_norm": 0.46983224153518677, + "learning_rate": 2.8184729981378025e-05, + "loss": 0.1453, + "step": 13830 + }, + { + "epoch": 38.634078212290504, + "grad_norm": 0.42784979939460754, + "learning_rate": 2.818435754189944e-05, + "loss": 0.1195, + "step": 13831 + }, + { + "epoch": 38.63687150837989, + "grad_norm": 0.42154332995414734, + "learning_rate": 2.8183985102420858e-05, + "loss": 0.1004, + "step": 13832 + }, + { + "epoch": 38.63966480446928, + "grad_norm": 0.3716309070587158, + "learning_rate": 2.8183612662942274e-05, + "loss": 0.1004, + "step": 13833 + }, + { + "epoch": 38.642458100558656, + "grad_norm": 0.4281313419342041, + "learning_rate": 2.8183240223463688e-05, + "loss": 0.1035, + "step": 13834 + }, + { + "epoch": 38.64525139664804, + "grad_norm": 0.7026533484458923, + "learning_rate": 2.8182867783985104e-05, + "loss": 0.1159, + "step": 13835 + }, + { + "epoch": 38.64804469273743, + "grad_norm": 0.47632020711898804, + "learning_rate": 2.8182495344506517e-05, + "loss": 0.1036, + "step": 13836 + }, + { + "epoch": 38.650837988826815, + "grad_norm": 0.3790722191333771, + "learning_rate": 2.8182122905027934e-05, + "loss": 0.0978, + "step": 13837 + }, + { + "epoch": 38.6536312849162, + "grad_norm": 1.067626714706421, + "learning_rate": 2.8181750465549347e-05, + "loss": 0.0944, + "step": 13838 + }, + { + "epoch": 38.65642458100559, + "grad_norm": 0.43713054060935974, + "learning_rate": 2.8181378026070763e-05, + "loss": 0.0967, + "step": 13839 + }, + { + "epoch": 38.659217877094974, + "grad_norm": 0.4869667887687683, + "learning_rate": 2.818100558659218e-05, + "loss": 0.1297, + "step": 13840 + }, + { + "epoch": 38.66201117318436, + "grad_norm": 0.6247203946113586, + "learning_rate": 2.8180633147113597e-05, + "loss": 0.083, + "step": 13841 + }, + { + "epoch": 38.66480446927374, + "grad_norm": 0.5846632122993469, + "learning_rate": 2.8180260707635013e-05, + "loss": 0.0756, + "step": 13842 + }, + { + "epoch": 38.667597765363126, + "grad_norm": 0.569110631942749, + "learning_rate": 2.8179888268156423e-05, + "loss": 0.0884, + "step": 13843 + }, + { + "epoch": 38.67039106145251, + "grad_norm": 0.5299766063690186, + "learning_rate": 2.817951582867784e-05, + "loss": 0.079, + "step": 13844 + }, + { + "epoch": 38.6731843575419, + "grad_norm": 0.4069921672344208, + "learning_rate": 2.8179143389199256e-05, + "loss": 0.0792, + "step": 13845 + }, + { + "epoch": 38.675977653631286, + "grad_norm": 0.5567750334739685, + "learning_rate": 2.8178770949720672e-05, + "loss": 0.0842, + "step": 13846 + }, + { + "epoch": 38.67877094972067, + "grad_norm": 1.068578839302063, + "learning_rate": 2.8178398510242086e-05, + "loss": 0.0707, + "step": 13847 + }, + { + "epoch": 38.68156424581006, + "grad_norm": 0.6553998589515686, + "learning_rate": 2.8178026070763502e-05, + "loss": 0.0668, + "step": 13848 + }, + { + "epoch": 38.684357541899445, + "grad_norm": 1.1579440832138062, + "learning_rate": 2.8177653631284915e-05, + "loss": 0.0767, + "step": 13849 + }, + { + "epoch": 38.687150837988824, + "grad_norm": 0.6024009585380554, + "learning_rate": 2.8177281191806332e-05, + "loss": 0.0665, + "step": 13850 + }, + { + "epoch": 38.68994413407821, + "grad_norm": 0.5937240719795227, + "learning_rate": 2.817690875232775e-05, + "loss": 0.0685, + "step": 13851 + }, + { + "epoch": 38.6927374301676, + "grad_norm": 0.9592479467391968, + "learning_rate": 2.817653631284916e-05, + "loss": 0.0723, + "step": 13852 + }, + { + "epoch": 38.69553072625698, + "grad_norm": 0.8652563095092773, + "learning_rate": 2.8176163873370578e-05, + "loss": 0.0864, + "step": 13853 + }, + { + "epoch": 38.69832402234637, + "grad_norm": 1.0788688659667969, + "learning_rate": 2.8175791433891995e-05, + "loss": 0.0815, + "step": 13854 + }, + { + "epoch": 38.701117318435756, + "grad_norm": 0.6012893319129944, + "learning_rate": 2.817541899441341e-05, + "loss": 0.125, + "step": 13855 + }, + { + "epoch": 38.70391061452514, + "grad_norm": 0.5100256204605103, + "learning_rate": 2.817504655493482e-05, + "loss": 0.1371, + "step": 13856 + }, + { + "epoch": 38.70670391061452, + "grad_norm": 0.3431745171546936, + "learning_rate": 2.8174674115456237e-05, + "loss": 0.1087, + "step": 13857 + }, + { + "epoch": 38.70949720670391, + "grad_norm": 0.6963807940483093, + "learning_rate": 2.8174301675977654e-05, + "loss": 0.1034, + "step": 13858 + }, + { + "epoch": 38.712290502793294, + "grad_norm": 0.7737045884132385, + "learning_rate": 2.817392923649907e-05, + "loss": 0.1246, + "step": 13859 + }, + { + "epoch": 38.71508379888268, + "grad_norm": 0.7375789284706116, + "learning_rate": 2.8173556797020487e-05, + "loss": 0.1079, + "step": 13860 + }, + { + "epoch": 38.71787709497207, + "grad_norm": 0.6823984980583191, + "learning_rate": 2.81731843575419e-05, + "loss": 0.1127, + "step": 13861 + }, + { + "epoch": 38.720670391061454, + "grad_norm": 0.37735646963119507, + "learning_rate": 2.8172811918063317e-05, + "loss": 0.0995, + "step": 13862 + }, + { + "epoch": 38.72346368715084, + "grad_norm": 0.5369113683700562, + "learning_rate": 2.817243947858473e-05, + "loss": 0.0871, + "step": 13863 + }, + { + "epoch": 38.726256983240226, + "grad_norm": 0.7061443328857422, + "learning_rate": 2.8172067039106146e-05, + "loss": 0.0957, + "step": 13864 + }, + { + "epoch": 38.729050279329606, + "grad_norm": 0.6290752291679382, + "learning_rate": 2.817169459962756e-05, + "loss": 0.1146, + "step": 13865 + }, + { + "epoch": 38.73184357541899, + "grad_norm": 0.6431031823158264, + "learning_rate": 2.8171322160148976e-05, + "loss": 0.1022, + "step": 13866 + }, + { + "epoch": 38.73463687150838, + "grad_norm": 0.5127524733543396, + "learning_rate": 2.8170949720670393e-05, + "loss": 0.0906, + "step": 13867 + }, + { + "epoch": 38.737430167597765, + "grad_norm": 0.6714181303977966, + "learning_rate": 2.817057728119181e-05, + "loss": 0.0906, + "step": 13868 + }, + { + "epoch": 38.74022346368715, + "grad_norm": 0.6430686116218567, + "learning_rate": 2.8170204841713222e-05, + "loss": 0.0862, + "step": 13869 + }, + { + "epoch": 38.74301675977654, + "grad_norm": 0.5403818488121033, + "learning_rate": 2.8169832402234635e-05, + "loss": 0.0742, + "step": 13870 + }, + { + "epoch": 38.745810055865924, + "grad_norm": 0.678941547870636, + "learning_rate": 2.8169459962756052e-05, + "loss": 0.0797, + "step": 13871 + }, + { + "epoch": 38.74860335195531, + "grad_norm": 0.6076549291610718, + "learning_rate": 2.816908752327747e-05, + "loss": 0.0872, + "step": 13872 + }, + { + "epoch": 38.75139664804469, + "grad_norm": 1.3735816478729248, + "learning_rate": 2.8168715083798885e-05, + "loss": 0.1057, + "step": 13873 + }, + { + "epoch": 38.754189944134076, + "grad_norm": 1.2291306257247925, + "learning_rate": 2.8168342644320298e-05, + "loss": 0.0818, + "step": 13874 + }, + { + "epoch": 38.75698324022346, + "grad_norm": 0.5107596516609192, + "learning_rate": 2.8167970204841715e-05, + "loss": 0.064, + "step": 13875 + }, + { + "epoch": 38.75977653631285, + "grad_norm": 0.7181755900382996, + "learning_rate": 2.8167597765363128e-05, + "loss": 0.0756, + "step": 13876 + }, + { + "epoch": 38.762569832402235, + "grad_norm": 0.790803074836731, + "learning_rate": 2.8167225325884544e-05, + "loss": 0.092, + "step": 13877 + }, + { + "epoch": 38.76536312849162, + "grad_norm": 0.9664952158927917, + "learning_rate": 2.816685288640596e-05, + "loss": 0.0977, + "step": 13878 + }, + { + "epoch": 38.76815642458101, + "grad_norm": 2.558032989501953, + "learning_rate": 2.8166480446927374e-05, + "loss": 0.0898, + "step": 13879 + }, + { + "epoch": 38.770949720670394, + "grad_norm": 0.33781322836875916, + "learning_rate": 2.816610800744879e-05, + "loss": 0.1236, + "step": 13880 + }, + { + "epoch": 38.773743016759774, + "grad_norm": 0.3987351357936859, + "learning_rate": 2.8165735567970207e-05, + "loss": 0.1282, + "step": 13881 + }, + { + "epoch": 38.77653631284916, + "grad_norm": 0.7068341374397278, + "learning_rate": 2.8165363128491624e-05, + "loss": 0.1016, + "step": 13882 + }, + { + "epoch": 38.779329608938546, + "grad_norm": 0.35489025712013245, + "learning_rate": 2.8164990689013033e-05, + "loss": 0.1154, + "step": 13883 + }, + { + "epoch": 38.78212290502793, + "grad_norm": 0.7438069581985474, + "learning_rate": 2.816461824953445e-05, + "loss": 0.1131, + "step": 13884 + }, + { + "epoch": 38.78491620111732, + "grad_norm": 0.9405038952827454, + "learning_rate": 2.8164245810055866e-05, + "loss": 0.117, + "step": 13885 + }, + { + "epoch": 38.787709497206706, + "grad_norm": 0.49241313338279724, + "learning_rate": 2.8163873370577283e-05, + "loss": 0.1054, + "step": 13886 + }, + { + "epoch": 38.79050279329609, + "grad_norm": 0.4505404531955719, + "learning_rate": 2.81635009310987e-05, + "loss": 0.0912, + "step": 13887 + }, + { + "epoch": 38.79329608938548, + "grad_norm": 0.539735734462738, + "learning_rate": 2.8163128491620113e-05, + "loss": 0.0984, + "step": 13888 + }, + { + "epoch": 38.79608938547486, + "grad_norm": 0.45133382081985474, + "learning_rate": 2.8162756052141526e-05, + "loss": 0.0825, + "step": 13889 + }, + { + "epoch": 38.798882681564244, + "grad_norm": 1.4024449586868286, + "learning_rate": 2.8162383612662942e-05, + "loss": 0.097, + "step": 13890 + }, + { + "epoch": 38.80167597765363, + "grad_norm": 0.7537744641304016, + "learning_rate": 2.816201117318436e-05, + "loss": 0.0805, + "step": 13891 + }, + { + "epoch": 38.80446927374302, + "grad_norm": 0.401835173368454, + "learning_rate": 2.8161638733705772e-05, + "loss": 0.0813, + "step": 13892 + }, + { + "epoch": 38.8072625698324, + "grad_norm": 0.42641526460647583, + "learning_rate": 2.816126629422719e-05, + "loss": 0.0802, + "step": 13893 + }, + { + "epoch": 38.81005586592179, + "grad_norm": 0.6978179216384888, + "learning_rate": 2.8160893854748605e-05, + "loss": 0.0812, + "step": 13894 + }, + { + "epoch": 38.812849162011176, + "grad_norm": 0.34789130091667175, + "learning_rate": 2.816052141527002e-05, + "loss": 0.0638, + "step": 13895 + }, + { + "epoch": 38.815642458100555, + "grad_norm": 0.6026569604873657, + "learning_rate": 2.8160148975791435e-05, + "loss": 0.064, + "step": 13896 + }, + { + "epoch": 38.81843575418994, + "grad_norm": 0.44745948910713196, + "learning_rate": 2.8159776536312848e-05, + "loss": 0.0839, + "step": 13897 + }, + { + "epoch": 38.82122905027933, + "grad_norm": 0.6541919112205505, + "learning_rate": 2.8159404096834265e-05, + "loss": 0.0789, + "step": 13898 + }, + { + "epoch": 38.824022346368714, + "grad_norm": 0.5804952383041382, + "learning_rate": 2.815903165735568e-05, + "loss": 0.0668, + "step": 13899 + }, + { + "epoch": 38.8268156424581, + "grad_norm": 0.7965481877326965, + "learning_rate": 2.8158659217877098e-05, + "loss": 0.0809, + "step": 13900 + }, + { + "epoch": 38.82960893854749, + "grad_norm": 0.732953667640686, + "learning_rate": 2.815828677839851e-05, + "loss": 0.0766, + "step": 13901 + }, + { + "epoch": 38.832402234636874, + "grad_norm": 0.8889634013175964, + "learning_rate": 2.8157914338919927e-05, + "loss": 0.0866, + "step": 13902 + }, + { + "epoch": 38.83519553072626, + "grad_norm": 0.752293050289154, + "learning_rate": 2.815754189944134e-05, + "loss": 0.074, + "step": 13903 + }, + { + "epoch": 38.83798882681564, + "grad_norm": 1.1037317514419556, + "learning_rate": 2.8157169459962757e-05, + "loss": 0.088, + "step": 13904 + }, + { + "epoch": 38.840782122905026, + "grad_norm": 0.6492994427680969, + "learning_rate": 2.8156797020484173e-05, + "loss": 0.1333, + "step": 13905 + }, + { + "epoch": 38.84357541899441, + "grad_norm": 0.7365188002586365, + "learning_rate": 2.8156424581005587e-05, + "loss": 0.1376, + "step": 13906 + }, + { + "epoch": 38.8463687150838, + "grad_norm": NaN, + "learning_rate": 2.8156424581005587e-05, + "loss": 0.1063, + "step": 13907 + }, + { + "epoch": 38.849162011173185, + "grad_norm": 0.850227415561676, + "learning_rate": 2.8156052141527003e-05, + "loss": 0.1292, + "step": 13908 + }, + { + "epoch": 38.85195530726257, + "grad_norm": 0.6937223672866821, + "learning_rate": 2.815567970204842e-05, + "loss": 0.1142, + "step": 13909 + }, + { + "epoch": 38.85474860335196, + "grad_norm": 0.5119539499282837, + "learning_rate": 2.8155307262569833e-05, + "loss": 0.1076, + "step": 13910 + }, + { + "epoch": 38.857541899441344, + "grad_norm": 0.6546431183815002, + "learning_rate": 2.8154934823091246e-05, + "loss": 0.1022, + "step": 13911 + }, + { + "epoch": 38.86033519553072, + "grad_norm": 1.2861254215240479, + "learning_rate": 2.8154562383612663e-05, + "loss": 0.0916, + "step": 13912 + }, + { + "epoch": 38.86312849162011, + "grad_norm": 0.3550238609313965, + "learning_rate": 2.815418994413408e-05, + "loss": 0.093, + "step": 13913 + }, + { + "epoch": 38.865921787709496, + "grad_norm": 0.595964252948761, + "learning_rate": 2.8153817504655496e-05, + "loss": 0.0908, + "step": 13914 + }, + { + "epoch": 38.86871508379888, + "grad_norm": 0.5790852308273315, + "learning_rate": 2.815344506517691e-05, + "loss": 0.1004, + "step": 13915 + }, + { + "epoch": 38.87150837988827, + "grad_norm": 0.5495722889900208, + "learning_rate": 2.8153072625698325e-05, + "loss": 0.0834, + "step": 13916 + }, + { + "epoch": 38.874301675977655, + "grad_norm": 0.9984388947486877, + "learning_rate": 2.815270018621974e-05, + "loss": 0.0855, + "step": 13917 + }, + { + "epoch": 38.87709497206704, + "grad_norm": 0.5843257308006287, + "learning_rate": 2.8152327746741155e-05, + "loss": 0.0896, + "step": 13918 + }, + { + "epoch": 38.87988826815643, + "grad_norm": 0.6011370420455933, + "learning_rate": 2.815195530726257e-05, + "loss": 0.0821, + "step": 13919 + }, + { + "epoch": 38.88268156424581, + "grad_norm": 0.4790153205394745, + "learning_rate": 2.8151582867783985e-05, + "loss": 0.0833, + "step": 13920 + }, + { + "epoch": 38.885474860335194, + "grad_norm": 0.6771031022071838, + "learning_rate": 2.81512104283054e-05, + "loss": 0.0998, + "step": 13921 + }, + { + "epoch": 38.88826815642458, + "grad_norm": 0.9446739554405212, + "learning_rate": 2.8150837988826818e-05, + "loss": 0.0745, + "step": 13922 + }, + { + "epoch": 38.891061452513966, + "grad_norm": 0.49410757422447205, + "learning_rate": 2.8150465549348234e-05, + "loss": 0.0633, + "step": 13923 + }, + { + "epoch": 38.89385474860335, + "grad_norm": 0.48964133858680725, + "learning_rate": 2.8150093109869644e-05, + "loss": 0.0612, + "step": 13924 + }, + { + "epoch": 38.89664804469274, + "grad_norm": 0.581567645072937, + "learning_rate": 2.814972067039106e-05, + "loss": 0.0676, + "step": 13925 + }, + { + "epoch": 38.899441340782126, + "grad_norm": 1.5354546308517456, + "learning_rate": 2.8149348230912477e-05, + "loss": 0.0603, + "step": 13926 + }, + { + "epoch": 38.90223463687151, + "grad_norm": 1.0759246349334717, + "learning_rate": 2.8148975791433894e-05, + "loss": 0.0595, + "step": 13927 + }, + { + "epoch": 38.90502793296089, + "grad_norm": 0.7140375971794128, + "learning_rate": 2.814860335195531e-05, + "loss": 0.0801, + "step": 13928 + }, + { + "epoch": 38.90782122905028, + "grad_norm": 0.9975626468658447, + "learning_rate": 2.8148230912476723e-05, + "loss": 0.0932, + "step": 13929 + }, + { + "epoch": 38.910614525139664, + "grad_norm": 0.6339572668075562, + "learning_rate": 2.8147858472998136e-05, + "loss": 0.1189, + "step": 13930 + }, + { + "epoch": 38.91340782122905, + "grad_norm": 0.43222346901893616, + "learning_rate": 2.8147486033519553e-05, + "loss": 0.115, + "step": 13931 + }, + { + "epoch": 38.91620111731844, + "grad_norm": 0.7280413508415222, + "learning_rate": 2.814711359404097e-05, + "loss": 0.0914, + "step": 13932 + }, + { + "epoch": 38.91899441340782, + "grad_norm": 0.5688527822494507, + "learning_rate": 2.8146741154562383e-05, + "loss": 0.1085, + "step": 13933 + }, + { + "epoch": 38.92178770949721, + "grad_norm": 0.7641273140907288, + "learning_rate": 2.81463687150838e-05, + "loss": 0.1243, + "step": 13934 + }, + { + "epoch": 38.92458100558659, + "grad_norm": 0.4794342815876007, + "learning_rate": 2.8145996275605216e-05, + "loss": 0.1112, + "step": 13935 + }, + { + "epoch": 38.927374301675975, + "grad_norm": 1.7034844160079956, + "learning_rate": 2.8145623836126632e-05, + "loss": 0.1209, + "step": 13936 + }, + { + "epoch": 38.93016759776536, + "grad_norm": 0.4324362576007843, + "learning_rate": 2.8145251396648045e-05, + "loss": 0.1074, + "step": 13937 + }, + { + "epoch": 38.93296089385475, + "grad_norm": 1.1344329118728638, + "learning_rate": 2.814487895716946e-05, + "loss": 0.1, + "step": 13938 + }, + { + "epoch": 38.935754189944134, + "grad_norm": 0.7757636308670044, + "learning_rate": 2.8144506517690875e-05, + "loss": 0.1044, + "step": 13939 + }, + { + "epoch": 38.93854748603352, + "grad_norm": 0.5023086071014404, + "learning_rate": 2.814413407821229e-05, + "loss": 0.1031, + "step": 13940 + }, + { + "epoch": 38.94134078212291, + "grad_norm": 0.4075654149055481, + "learning_rate": 2.8143761638733708e-05, + "loss": 0.1082, + "step": 13941 + }, + { + "epoch": 38.944134078212294, + "grad_norm": 0.48283904790878296, + "learning_rate": 2.814338919925512e-05, + "loss": 0.0905, + "step": 13942 + }, + { + "epoch": 38.94692737430167, + "grad_norm": 0.8485958576202393, + "learning_rate": 2.8143016759776538e-05, + "loss": 0.1032, + "step": 13943 + }, + { + "epoch": 38.94972067039106, + "grad_norm": 0.6102892756462097, + "learning_rate": 2.814264432029795e-05, + "loss": 0.0903, + "step": 13944 + }, + { + "epoch": 38.952513966480446, + "grad_norm": 0.5736841559410095, + "learning_rate": 2.8142271880819368e-05, + "loss": 0.0699, + "step": 13945 + }, + { + "epoch": 38.95530726256983, + "grad_norm": 0.7148399353027344, + "learning_rate": 2.8141899441340784e-05, + "loss": 0.0906, + "step": 13946 + }, + { + "epoch": 38.95810055865922, + "grad_norm": 1.0956953763961792, + "learning_rate": 2.8141527001862197e-05, + "loss": 0.0947, + "step": 13947 + }, + { + "epoch": 38.960893854748605, + "grad_norm": 0.6016318202018738, + "learning_rate": 2.8141154562383614e-05, + "loss": 0.0669, + "step": 13948 + }, + { + "epoch": 38.96368715083799, + "grad_norm": 0.7711247801780701, + "learning_rate": 2.814078212290503e-05, + "loss": 0.0866, + "step": 13949 + }, + { + "epoch": 38.96648044692738, + "grad_norm": 0.5966675281524658, + "learning_rate": 2.8140409683426443e-05, + "loss": 0.0763, + "step": 13950 + }, + { + "epoch": 38.96927374301676, + "grad_norm": 0.6873340606689453, + "learning_rate": 2.8140037243947857e-05, + "loss": 0.0739, + "step": 13951 + }, + { + "epoch": 38.97206703910614, + "grad_norm": 1.223775029182434, + "learning_rate": 2.8139664804469273e-05, + "loss": 0.0636, + "step": 13952 + }, + { + "epoch": 38.97486033519553, + "grad_norm": 0.8691296577453613, + "learning_rate": 2.813929236499069e-05, + "loss": 0.0678, + "step": 13953 + }, + { + "epoch": 38.977653631284916, + "grad_norm": 1.0352729558944702, + "learning_rate": 2.8138919925512106e-05, + "loss": 0.0936, + "step": 13954 + }, + { + "epoch": 38.9804469273743, + "grad_norm": 0.8331159353256226, + "learning_rate": 2.8138547486033523e-05, + "loss": 0.1105, + "step": 13955 + }, + { + "epoch": 38.98324022346369, + "grad_norm": 0.5664482712745667, + "learning_rate": 2.8138175046554936e-05, + "loss": 0.1048, + "step": 13956 + }, + { + "epoch": 38.986033519553075, + "grad_norm": 0.41837936639785767, + "learning_rate": 2.813780260707635e-05, + "loss": 0.0925, + "step": 13957 + }, + { + "epoch": 38.98882681564246, + "grad_norm": 3.002607583999634, + "learning_rate": 2.8137430167597766e-05, + "loss": 0.0986, + "step": 13958 + }, + { + "epoch": 38.99162011173184, + "grad_norm": 0.7762014865875244, + "learning_rate": 2.8137057728119182e-05, + "loss": 0.0654, + "step": 13959 + }, + { + "epoch": 38.99441340782123, + "grad_norm": 1.0032436847686768, + "learning_rate": 2.8136685288640595e-05, + "loss": 0.0858, + "step": 13960 + }, + { + "epoch": 38.997206703910614, + "grad_norm": 1.0900036096572876, + "learning_rate": 2.8136312849162012e-05, + "loss": 0.0764, + "step": 13961 + }, + { + "epoch": 39.0, + "grad_norm": 0.7696616649627686, + "learning_rate": 2.813594040968343e-05, + "loss": 0.089, + "step": 13962 + }, + { + "epoch": 39.002793296089386, + "grad_norm": 0.6514900922775269, + "learning_rate": 2.8135567970204845e-05, + "loss": 0.1198, + "step": 13963 + }, + { + "epoch": 39.00558659217877, + "grad_norm": 0.35125869512557983, + "learning_rate": 2.8135195530726258e-05, + "loss": 0.1265, + "step": 13964 + }, + { + "epoch": 39.00837988826816, + "grad_norm": 0.33939993381500244, + "learning_rate": 2.813482309124767e-05, + "loss": 0.1047, + "step": 13965 + }, + { + "epoch": 39.01117318435754, + "grad_norm": 0.4291524291038513, + "learning_rate": 2.8134450651769088e-05, + "loss": 0.1084, + "step": 13966 + }, + { + "epoch": 39.013966480446925, + "grad_norm": 1.307497501373291, + "learning_rate": 2.8134078212290504e-05, + "loss": 0.1234, + "step": 13967 + }, + { + "epoch": 39.01675977653631, + "grad_norm": 0.4758478105068207, + "learning_rate": 2.813370577281192e-05, + "loss": 0.0846, + "step": 13968 + }, + { + "epoch": 39.0195530726257, + "grad_norm": 0.7991107702255249, + "learning_rate": 2.8133333333333334e-05, + "loss": 0.0974, + "step": 13969 + }, + { + "epoch": 39.022346368715084, + "grad_norm": 0.39669328927993774, + "learning_rate": 2.8132960893854747e-05, + "loss": 0.0913, + "step": 13970 + }, + { + "epoch": 39.02513966480447, + "grad_norm": 0.5983617305755615, + "learning_rate": 2.8132588454376164e-05, + "loss": 0.0935, + "step": 13971 + }, + { + "epoch": 39.02793296089386, + "grad_norm": 1.2277920246124268, + "learning_rate": 2.813221601489758e-05, + "loss": 0.0805, + "step": 13972 + }, + { + "epoch": 39.03072625698324, + "grad_norm": 0.3706749975681305, + "learning_rate": 2.8131843575418997e-05, + "loss": 0.0657, + "step": 13973 + }, + { + "epoch": 39.03351955307262, + "grad_norm": 1.1726256608963013, + "learning_rate": 2.813147113594041e-05, + "loss": 0.0962, + "step": 13974 + }, + { + "epoch": 39.03631284916201, + "grad_norm": 0.4656217396259308, + "learning_rate": 2.8131098696461826e-05, + "loss": 0.0754, + "step": 13975 + }, + { + "epoch": 39.039106145251395, + "grad_norm": 0.6613001823425293, + "learning_rate": 2.8130726256983243e-05, + "loss": 0.0779, + "step": 13976 + }, + { + "epoch": 39.04189944134078, + "grad_norm": 0.6982211470603943, + "learning_rate": 2.8130353817504656e-05, + "loss": 0.0752, + "step": 13977 + }, + { + "epoch": 39.04469273743017, + "grad_norm": 0.5721521377563477, + "learning_rate": 2.812998137802607e-05, + "loss": 0.0859, + "step": 13978 + }, + { + "epoch": 39.047486033519554, + "grad_norm": 0.9946912527084351, + "learning_rate": 2.8129608938547486e-05, + "loss": 0.0697, + "step": 13979 + }, + { + "epoch": 39.05027932960894, + "grad_norm": 0.7086393237113953, + "learning_rate": 2.8129236499068902e-05, + "loss": 0.0738, + "step": 13980 + }, + { + "epoch": 39.05307262569833, + "grad_norm": 0.6088959574699402, + "learning_rate": 2.812886405959032e-05, + "loss": 0.0728, + "step": 13981 + }, + { + "epoch": 39.055865921787706, + "grad_norm": 0.47745397686958313, + "learning_rate": 2.8128491620111735e-05, + "loss": 0.0543, + "step": 13982 + }, + { + "epoch": 39.05865921787709, + "grad_norm": 0.475727915763855, + "learning_rate": 2.812811918063315e-05, + "loss": 0.0632, + "step": 13983 + }, + { + "epoch": 39.06145251396648, + "grad_norm": 0.6240501403808594, + "learning_rate": 2.812774674115456e-05, + "loss": 0.0712, + "step": 13984 + }, + { + "epoch": 39.064245810055866, + "grad_norm": 0.5536040663719177, + "learning_rate": 2.8127374301675978e-05, + "loss": 0.0542, + "step": 13985 + }, + { + "epoch": 39.06703910614525, + "grad_norm": 0.9809906482696533, + "learning_rate": 2.8127001862197395e-05, + "loss": 0.0697, + "step": 13986 + }, + { + "epoch": 39.06983240223464, + "grad_norm": 5.530771732330322, + "learning_rate": 2.8126629422718808e-05, + "loss": 0.0735, + "step": 13987 + }, + { + "epoch": 39.072625698324025, + "grad_norm": 0.5011076927185059, + "learning_rate": 2.8126256983240224e-05, + "loss": 0.1168, + "step": 13988 + }, + { + "epoch": 39.07541899441341, + "grad_norm": 0.763650119304657, + "learning_rate": 2.812588454376164e-05, + "loss": 0.1218, + "step": 13989 + }, + { + "epoch": 39.07821229050279, + "grad_norm": 0.36942729353904724, + "learning_rate": 2.8125512104283054e-05, + "loss": 0.1008, + "step": 13990 + }, + { + "epoch": 39.08100558659218, + "grad_norm": 0.9547336101531982, + "learning_rate": 2.812513966480447e-05, + "loss": 0.1045, + "step": 13991 + }, + { + "epoch": 39.08379888268156, + "grad_norm": 1.6202845573425293, + "learning_rate": 2.8124767225325884e-05, + "loss": 0.0881, + "step": 13992 + }, + { + "epoch": 39.08659217877095, + "grad_norm": 0.40806204080581665, + "learning_rate": 2.81243947858473e-05, + "loss": 0.1054, + "step": 13993 + }, + { + "epoch": 39.089385474860336, + "grad_norm": 0.73966383934021, + "learning_rate": 2.8124022346368717e-05, + "loss": 0.094, + "step": 13994 + }, + { + "epoch": 39.09217877094972, + "grad_norm": 0.43949905037879944, + "learning_rate": 2.8123649906890133e-05, + "loss": 0.1135, + "step": 13995 + }, + { + "epoch": 39.09497206703911, + "grad_norm": 0.4388486444950104, + "learning_rate": 2.8123277467411546e-05, + "loss": 0.0768, + "step": 13996 + }, + { + "epoch": 39.097765363128495, + "grad_norm": 1.4556699991226196, + "learning_rate": 2.812290502793296e-05, + "loss": 0.1005, + "step": 13997 + }, + { + "epoch": 39.100558659217874, + "grad_norm": 0.4349512755870819, + "learning_rate": 2.8122532588454376e-05, + "loss": 0.0869, + "step": 13998 + }, + { + "epoch": 39.10335195530726, + "grad_norm": 0.5730288624763489, + "learning_rate": 2.8122160148975793e-05, + "loss": 0.1073, + "step": 13999 + }, + { + "epoch": 39.10614525139665, + "grad_norm": 0.6371434926986694, + "learning_rate": 2.812178770949721e-05, + "loss": 0.066, + "step": 14000 + }, + { + "epoch": 39.10614525139665, + "eval_cer": 0.19417258573096352, + "eval_loss": 0.33140841126441956, + "eval_runtime": 24.1764, + "eval_samples_per_second": 187.704, + "eval_steps_per_second": 0.62, + "eval_wer": 0.3525879165386442, + "step": 14000 } ], "logging_steps": 1.0, @@ -91151,7 +98161,7 @@ "early_stopping_threshold": 0.0 }, "attributes": { - "early_stopping_patience_counter": 3 + "early_stopping_patience_counter": 4 } }, "TrainerControl": { @@ -91165,7 +98175,7 @@ "attributes": {} } }, - "total_flos": 4.0153217257554156e+20, + "total_flos": 4.324168153867349e+20, "train_batch_size": 160, "trial_name": null, "trial_params": null