{ "best_metric": 0.7954939341421143, "best_model_checkpoint": "cvt-13-normal/checkpoint-700", "epoch": 100.0, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_accuracy": 0.7105719237435009, "eval_loss": 1.0209927558898926, "eval_runtime": 17.9138, "eval_samples_per_second": 64.419, "eval_steps_per_second": 0.558, "step": 7 }, { "epoch": 1.4285714285714286, "grad_norm": 7.967917442321777, "learning_rate": 4.285714285714285e-05, "loss": 5.5642, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.7097053726169844, "eval_loss": 1.0071666240692139, "eval_runtime": 17.7355, "eval_samples_per_second": 65.067, "eval_steps_per_second": 0.564, "step": 14 }, { "epoch": 2.857142857142857, "grad_norm": 8.133280754089355, "learning_rate": 8.57142857142857e-05, "loss": 5.662, "step": 20 }, { "epoch": 3.0, "eval_accuracy": 0.708838821490468, "eval_loss": 1.0150678157806396, "eval_runtime": 17.8577, "eval_samples_per_second": 64.622, "eval_steps_per_second": 0.56, "step": 21 }, { "epoch": 4.0, "eval_accuracy": 0.7140381282495667, "eval_loss": 1.0016363859176636, "eval_runtime": 17.837, "eval_samples_per_second": 64.697, "eval_steps_per_second": 0.561, "step": 28 }, { "epoch": 4.285714285714286, "grad_norm": 8.433135986328125, "learning_rate": 0.00012857142857142855, "loss": 5.381, "step": 30 }, { "epoch": 5.0, "eval_accuracy": 0.7123050259965338, "eval_loss": 1.0119163990020752, "eval_runtime": 17.7345, "eval_samples_per_second": 65.071, "eval_steps_per_second": 0.564, "step": 35 }, { "epoch": 5.714285714285714, "grad_norm": 9.856744766235352, "learning_rate": 0.0001714285714285714, "loss": 5.3348, "step": 40 }, { "epoch": 6.0, "eval_accuracy": 0.720103986135182, "eval_loss": 0.9661750793457031, "eval_runtime": 17.9039, "eval_samples_per_second": 64.455, "eval_steps_per_second": 0.559, "step": 42 }, { "epoch": 7.0, "eval_accuracy": 0.7261698440207972, "eval_loss": 0.9513705372810364, "eval_runtime": 17.8649, "eval_samples_per_second": 64.596, "eval_steps_per_second": 0.56, "step": 49 }, { "epoch": 7.142857142857143, "grad_norm": 10.7362699508667, "learning_rate": 0.00021428571428571427, "loss": 5.2423, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.7105719237435009, "eval_loss": 0.9588707685470581, "eval_runtime": 17.8964, "eval_samples_per_second": 64.482, "eval_steps_per_second": 0.559, "step": 56 }, { "epoch": 8.571428571428571, "grad_norm": 11.099422454833984, "learning_rate": 0.0002571428571428571, "loss": 5.0251, "step": 60 }, { "epoch": 9.0, "eval_accuracy": 0.7279029462738301, "eval_loss": 0.908963680267334, "eval_runtime": 17.9404, "eval_samples_per_second": 64.324, "eval_steps_per_second": 0.557, "step": 63 }, { "epoch": 10.0, "grad_norm": 16.643394470214844, "learning_rate": 0.0003, "loss": 5.0547, "step": 70 }, { "epoch": 10.0, "eval_accuracy": 0.7123050259965338, "eval_loss": 0.9352001547813416, "eval_runtime": 18.2788, "eval_samples_per_second": 63.133, "eval_steps_per_second": 0.547, "step": 70 }, { "epoch": 11.0, "eval_accuracy": 0.6993067590987868, "eval_loss": 1.0062916278839111, "eval_runtime": 17.9901, "eval_samples_per_second": 64.146, "eval_steps_per_second": 0.556, "step": 77 }, { "epoch": 11.428571428571429, "grad_norm": 9.376890182495117, "learning_rate": 0.0002952380952380952, "loss": 4.8246, "step": 80 }, { "epoch": 12.0, "eval_accuracy": 0.7105719237435009, "eval_loss": 0.9190986752510071, "eval_runtime": 18.1793, "eval_samples_per_second": 63.479, "eval_steps_per_second": 0.55, "step": 84 }, { "epoch": 12.857142857142858, "grad_norm": 7.629549026489258, "learning_rate": 0.00029047619047619045, "loss": 4.7811, "step": 90 }, { "epoch": 13.0, "eval_accuracy": 0.7123050259965338, "eval_loss": 0.9947251677513123, "eval_runtime": 17.9036, "eval_samples_per_second": 64.456, "eval_steps_per_second": 0.559, "step": 91 }, { "epoch": 14.0, "eval_accuracy": 0.7175043327556326, "eval_loss": 0.9671235084533691, "eval_runtime": 18.1306, "eval_samples_per_second": 63.649, "eval_steps_per_second": 0.552, "step": 98 }, { "epoch": 14.285714285714286, "grad_norm": 13.771581649780273, "learning_rate": 0.0002857142857142857, "loss": 4.8234, "step": 100 }, { "epoch": 15.0, "eval_accuracy": 0.7235701906412478, "eval_loss": 0.9055125117301941, "eval_runtime": 18.3144, "eval_samples_per_second": 63.01, "eval_steps_per_second": 0.546, "step": 105 }, { "epoch": 15.714285714285714, "grad_norm": 9.288651466369629, "learning_rate": 0.0002809523809523809, "loss": 4.4787, "step": 110 }, { "epoch": 16.0, "eval_accuracy": 0.744367417677643, "eval_loss": 0.8837802410125732, "eval_runtime": 18.2071, "eval_samples_per_second": 63.382, "eval_steps_per_second": 0.549, "step": 112 }, { "epoch": 17.0, "eval_accuracy": 0.729636048526863, "eval_loss": 0.9059325456619263, "eval_runtime": 18.0331, "eval_samples_per_second": 63.994, "eval_steps_per_second": 0.555, "step": 119 }, { "epoch": 17.142857142857142, "grad_norm": 8.790782928466797, "learning_rate": 0.00027619047619047615, "loss": 4.39, "step": 120 }, { "epoch": 18.0, "eval_accuracy": 0.7461005199306759, "eval_loss": 0.8639523983001709, "eval_runtime": 18.0609, "eval_samples_per_second": 63.895, "eval_steps_per_second": 0.554, "step": 126 }, { "epoch": 18.571428571428573, "grad_norm": 7.883941650390625, "learning_rate": 0.0002714285714285714, "loss": 4.1424, "step": 130 }, { "epoch": 19.0, "eval_accuracy": 0.7487001733102253, "eval_loss": 0.8660562634468079, "eval_runtime": 17.7478, "eval_samples_per_second": 65.022, "eval_steps_per_second": 0.563, "step": 133 }, { "epoch": 20.0, "grad_norm": 21.828325271606445, "learning_rate": 0.0002666666666666666, "loss": 4.1065, "step": 140 }, { "epoch": 20.0, "eval_accuracy": 0.7305025996533796, "eval_loss": 0.9056758284568787, "eval_runtime": 17.8484, "eval_samples_per_second": 64.656, "eval_steps_per_second": 0.56, "step": 140 }, { "epoch": 21.0, "eval_accuracy": 0.7348353552859619, "eval_loss": 0.8865219354629517, "eval_runtime": 18.0329, "eval_samples_per_second": 63.994, "eval_steps_per_second": 0.555, "step": 147 }, { "epoch": 21.428571428571427, "grad_norm": 7.540792465209961, "learning_rate": 0.00026190476190476186, "loss": 4.0844, "step": 150 }, { "epoch": 22.0, "eval_accuracy": 0.7391681109185442, "eval_loss": 0.8928019404411316, "eval_runtime": 17.9197, "eval_samples_per_second": 64.398, "eval_steps_per_second": 0.558, "step": 154 }, { "epoch": 22.857142857142858, "grad_norm": 14.240620613098145, "learning_rate": 0.0002571428571428571, "loss": 3.9835, "step": 160 }, { "epoch": 23.0, "eval_accuracy": 0.7538994800693241, "eval_loss": 0.8675404787063599, "eval_runtime": 18.0176, "eval_samples_per_second": 64.048, "eval_steps_per_second": 0.555, "step": 161 }, { "epoch": 24.0, "eval_accuracy": 0.755632582322357, "eval_loss": 0.8828888535499573, "eval_runtime": 17.7466, "eval_samples_per_second": 65.027, "eval_steps_per_second": 0.563, "step": 168 }, { "epoch": 24.285714285714285, "grad_norm": 8.749543190002441, "learning_rate": 0.0002523809523809524, "loss": 3.8199, "step": 170 }, { "epoch": 25.0, "eval_accuracy": 0.7616984402079723, "eval_loss": 0.8176947832107544, "eval_runtime": 17.983, "eval_samples_per_second": 64.172, "eval_steps_per_second": 0.556, "step": 175 }, { "epoch": 25.714285714285715, "grad_norm": 9.475801467895508, "learning_rate": 0.00024761904761904757, "loss": 3.7898, "step": 180 }, { "epoch": 26.0, "eval_accuracy": 0.7461005199306759, "eval_loss": 0.8885547518730164, "eval_runtime": 18.0273, "eval_samples_per_second": 64.014, "eval_steps_per_second": 0.555, "step": 182 }, { "epoch": 27.0, "eval_accuracy": 0.7461005199306759, "eval_loss": 0.9394861459732056, "eval_runtime": 18.1419, "eval_samples_per_second": 63.61, "eval_steps_per_second": 0.551, "step": 189 }, { "epoch": 27.142857142857142, "grad_norm": 7.944543361663818, "learning_rate": 0.00024285714285714283, "loss": 3.7734, "step": 190 }, { "epoch": 28.0, "eval_accuracy": 0.7608318890814558, "eval_loss": 0.8348239064216614, "eval_runtime": 17.9109, "eval_samples_per_second": 64.43, "eval_steps_per_second": 0.558, "step": 196 }, { "epoch": 28.571428571428573, "grad_norm": 9.20173168182373, "learning_rate": 0.00023809523809523807, "loss": 3.7835, "step": 200 }, { "epoch": 29.0, "eval_accuracy": 0.75736568457539, "eval_loss": 0.836903989315033, "eval_runtime": 18.1677, "eval_samples_per_second": 63.519, "eval_steps_per_second": 0.55, "step": 203 }, { "epoch": 30.0, "grad_norm": 17.463150024414062, "learning_rate": 0.0002333333333333333, "loss": 3.6414, "step": 210 }, { "epoch": 30.0, "eval_accuracy": 0.7660311958405546, "eval_loss": 0.8668186664581299, "eval_runtime": 17.8247, "eval_samples_per_second": 64.742, "eval_steps_per_second": 0.561, "step": 210 }, { "epoch": 31.0, "eval_accuracy": 0.7599653379549394, "eval_loss": 0.8909233808517456, "eval_runtime": 18.1581, "eval_samples_per_second": 63.553, "eval_steps_per_second": 0.551, "step": 217 }, { "epoch": 31.428571428571427, "grad_norm": 13.756216049194336, "learning_rate": 0.00022857142857142854, "loss": 3.5076, "step": 220 }, { "epoch": 32.0, "eval_accuracy": 0.7495667244367418, "eval_loss": 0.8795309066772461, "eval_runtime": 17.8514, "eval_samples_per_second": 64.645, "eval_steps_per_second": 0.56, "step": 224 }, { "epoch": 32.857142857142854, "grad_norm": 9.03218936920166, "learning_rate": 0.0002238095238095238, "loss": 3.5447, "step": 230 }, { "epoch": 33.0, "eval_accuracy": 0.7538994800693241, "eval_loss": 0.9227800369262695, "eval_runtime": 17.9657, "eval_samples_per_second": 64.233, "eval_steps_per_second": 0.557, "step": 231 }, { "epoch": 34.0, "eval_accuracy": 0.7521663778162911, "eval_loss": 0.8850377798080444, "eval_runtime": 17.9906, "eval_samples_per_second": 64.144, "eval_steps_per_second": 0.556, "step": 238 }, { "epoch": 34.285714285714285, "grad_norm": 7.675583839416504, "learning_rate": 0.000219047619047619, "loss": 3.5344, "step": 240 }, { "epoch": 35.0, "eval_accuracy": 0.7651646447140381, "eval_loss": 0.8584573864936829, "eval_runtime": 18.1255, "eval_samples_per_second": 63.667, "eval_steps_per_second": 0.552, "step": 245 }, { "epoch": 35.714285714285715, "grad_norm": 7.848378658294678, "learning_rate": 0.00021428571428571427, "loss": 3.3678, "step": 250 }, { "epoch": 36.0, "eval_accuracy": 0.75736568457539, "eval_loss": 0.8631114959716797, "eval_runtime": 18.0275, "eval_samples_per_second": 64.013, "eval_steps_per_second": 0.555, "step": 252 }, { "epoch": 37.0, "eval_accuracy": 0.770363951473137, "eval_loss": 0.8675860166549683, "eval_runtime": 18.0196, "eval_samples_per_second": 64.042, "eval_steps_per_second": 0.555, "step": 259 }, { "epoch": 37.142857142857146, "grad_norm": 9.06800651550293, "learning_rate": 0.00020952380952380948, "loss": 3.4061, "step": 260 }, { "epoch": 38.0, "eval_accuracy": 0.7616984402079723, "eval_loss": 0.9131080508232117, "eval_runtime": 17.9025, "eval_samples_per_second": 64.46, "eval_steps_per_second": 0.559, "step": 266 }, { "epoch": 38.57142857142857, "grad_norm": 11.665525436401367, "learning_rate": 0.00020476190476190475, "loss": 3.3177, "step": 270 }, { "epoch": 39.0, "eval_accuracy": 0.7677642980935875, "eval_loss": 0.8631002902984619, "eval_runtime": 17.9771, "eval_samples_per_second": 64.193, "eval_steps_per_second": 0.556, "step": 273 }, { "epoch": 40.0, "grad_norm": 15.023707389831543, "learning_rate": 0.00019999999999999998, "loss": 3.2767, "step": 280 }, { "epoch": 40.0, "eval_accuracy": 0.7642980935875217, "eval_loss": 0.8802210092544556, "eval_runtime": 17.9247, "eval_samples_per_second": 64.381, "eval_steps_per_second": 0.558, "step": 280 }, { "epoch": 41.0, "eval_accuracy": 0.7677642980935875, "eval_loss": 0.8518037796020508, "eval_runtime": 18.183, "eval_samples_per_second": 63.466, "eval_steps_per_second": 0.55, "step": 287 }, { "epoch": 41.42857142857143, "grad_norm": 8.431020736694336, "learning_rate": 0.00019523809523809522, "loss": 3.1992, "step": 290 }, { "epoch": 42.0, "eval_accuracy": 0.75736568457539, "eval_loss": 0.923156201839447, "eval_runtime": 18.0318, "eval_samples_per_second": 63.998, "eval_steps_per_second": 0.555, "step": 294 }, { "epoch": 42.857142857142854, "grad_norm": 8.130815505981445, "learning_rate": 0.00019047619047619045, "loss": 3.2743, "step": 300 }, { "epoch": 43.0, "eval_accuracy": 0.7521663778162911, "eval_loss": 0.9305623173713684, "eval_runtime": 17.9901, "eval_samples_per_second": 64.146, "eval_steps_per_second": 0.556, "step": 301 }, { "epoch": 44.0, "eval_accuracy": 0.7755632582322357, "eval_loss": 0.8419708013534546, "eval_runtime": 17.9031, "eval_samples_per_second": 64.458, "eval_steps_per_second": 0.559, "step": 308 }, { "epoch": 44.285714285714285, "grad_norm": 9.007019996643066, "learning_rate": 0.00018571428571428572, "loss": 3.1704, "step": 310 }, { "epoch": 45.0, "eval_accuracy": 0.7564991334488734, "eval_loss": 0.8801714777946472, "eval_runtime": 17.8984, "eval_samples_per_second": 64.475, "eval_steps_per_second": 0.559, "step": 315 }, { "epoch": 45.714285714285715, "grad_norm": 8.079572677612305, "learning_rate": 0.00018095238095238093, "loss": 3.2466, "step": 320 }, { "epoch": 46.0, "eval_accuracy": 0.7677642980935875, "eval_loss": 0.878183901309967, "eval_runtime": 18.135, "eval_samples_per_second": 63.634, "eval_steps_per_second": 0.551, "step": 322 }, { "epoch": 47.0, "eval_accuracy": 0.7746967071057193, "eval_loss": 0.844364583492279, "eval_runtime": 18.003, "eval_samples_per_second": 64.1, "eval_steps_per_second": 0.555, "step": 329 }, { "epoch": 47.142857142857146, "grad_norm": 6.920067310333252, "learning_rate": 0.0001761904761904762, "loss": 3.0879, "step": 330 }, { "epoch": 48.0, "eval_accuracy": 0.7694974003466204, "eval_loss": 0.8579216003417969, "eval_runtime": 17.8532, "eval_samples_per_second": 64.638, "eval_steps_per_second": 0.56, "step": 336 }, { "epoch": 48.57142857142857, "grad_norm": 6.670530796051025, "learning_rate": 0.0001714285714285714, "loss": 3.1677, "step": 340 }, { "epoch": 49.0, "eval_accuracy": 0.7712305025996534, "eval_loss": 0.858402669429779, "eval_runtime": 17.75, "eval_samples_per_second": 65.014, "eval_steps_per_second": 0.563, "step": 343 }, { "epoch": 50.0, "grad_norm": 13.106241226196289, "learning_rate": 0.00016666666666666666, "loss": 3.0965, "step": 350 }, { "epoch": 50.0, "eval_accuracy": 0.7755632582322357, "eval_loss": 0.8400810956954956, "eval_runtime": 18.0075, "eval_samples_per_second": 64.084, "eval_steps_per_second": 0.555, "step": 350 }, { "epoch": 51.0, "eval_accuracy": 0.7651646447140381, "eval_loss": 0.8724238872528076, "eval_runtime": 18.0097, "eval_samples_per_second": 64.077, "eval_steps_per_second": 0.555, "step": 357 }, { "epoch": 51.42857142857143, "grad_norm": 8.85236930847168, "learning_rate": 0.00016190476190476187, "loss": 3.0611, "step": 360 }, { "epoch": 52.0, "eval_accuracy": 0.7807625649913345, "eval_loss": 0.8638470768928528, "eval_runtime": 18.0439, "eval_samples_per_second": 63.955, "eval_steps_per_second": 0.554, "step": 364 }, { "epoch": 52.857142857142854, "grad_norm": 7.648194789886475, "learning_rate": 0.00015714285714285713, "loss": 3.0204, "step": 370 }, { "epoch": 53.0, "eval_accuracy": 0.7660311958405546, "eval_loss": 0.9167099595069885, "eval_runtime": 17.9056, "eval_samples_per_second": 64.449, "eval_steps_per_second": 0.558, "step": 371 }, { "epoch": 54.0, "eval_accuracy": 0.7738301559792028, "eval_loss": 0.8322371244430542, "eval_runtime": 17.9741, "eval_samples_per_second": 64.204, "eval_steps_per_second": 0.556, "step": 378 }, { "epoch": 54.285714285714285, "grad_norm": 6.742936611175537, "learning_rate": 0.00015238095238095237, "loss": 2.9704, "step": 380 }, { "epoch": 55.0, "eval_accuracy": 0.7642980935875217, "eval_loss": 0.8577215671539307, "eval_runtime": 18.0258, "eval_samples_per_second": 64.019, "eval_steps_per_second": 0.555, "step": 385 }, { "epoch": 55.714285714285715, "grad_norm": 6.2735395431518555, "learning_rate": 0.0001476190476190476, "loss": 2.939, "step": 390 }, { "epoch": 56.0, "eval_accuracy": 0.7859618717504333, "eval_loss": 0.8296905755996704, "eval_runtime": 18.0649, "eval_samples_per_second": 63.881, "eval_steps_per_second": 0.554, "step": 392 }, { "epoch": 57.0, "eval_accuracy": 0.7686308492201039, "eval_loss": 0.874596893787384, "eval_runtime": 17.9658, "eval_samples_per_second": 64.233, "eval_steps_per_second": 0.557, "step": 399 }, { "epoch": 57.142857142857146, "grad_norm": 6.44887113571167, "learning_rate": 0.00014285714285714284, "loss": 3.0341, "step": 400 }, { "epoch": 58.0, "eval_accuracy": 0.7824956672443674, "eval_loss": 0.8620171546936035, "eval_runtime": 17.939, "eval_samples_per_second": 64.329, "eval_steps_per_second": 0.557, "step": 406 }, { "epoch": 58.57142857142857, "grad_norm": 6.199102401733398, "learning_rate": 0.00013809523809523808, "loss": 2.8997, "step": 410 }, { "epoch": 59.0, "eval_accuracy": 0.75736568457539, "eval_loss": 0.8835130333900452, "eval_runtime": 18.2434, "eval_samples_per_second": 63.256, "eval_steps_per_second": 0.548, "step": 413 }, { "epoch": 60.0, "grad_norm": 27.795392990112305, "learning_rate": 0.0001333333333333333, "loss": 3.0187, "step": 420 }, { "epoch": 60.0, "eval_accuracy": 0.7694974003466204, "eval_loss": 0.9018464684486389, "eval_runtime": 18.2513, "eval_samples_per_second": 63.228, "eval_steps_per_second": 0.548, "step": 420 }, { "epoch": 61.0, "eval_accuracy": 0.7772963604852686, "eval_loss": 0.8939943909645081, "eval_runtime": 18.1365, "eval_samples_per_second": 63.629, "eval_steps_per_second": 0.551, "step": 427 }, { "epoch": 61.42857142857143, "grad_norm": 10.215301513671875, "learning_rate": 0.00012857142857142855, "loss": 2.9316, "step": 430 }, { "epoch": 62.0, "eval_accuracy": 0.7712305025996534, "eval_loss": 0.8858510851860046, "eval_runtime": 18.1655, "eval_samples_per_second": 63.527, "eval_steps_per_second": 0.55, "step": 434 }, { "epoch": 62.857142857142854, "grad_norm": 5.105686187744141, "learning_rate": 0.00012380952380952378, "loss": 2.8746, "step": 440 }, { "epoch": 63.0, "eval_accuracy": 0.7764298093587522, "eval_loss": 0.8661392331123352, "eval_runtime": 17.9626, "eval_samples_per_second": 64.245, "eval_steps_per_second": 0.557, "step": 441 }, { "epoch": 64.0, "eval_accuracy": 0.7712305025996534, "eval_loss": 0.8916440010070801, "eval_runtime": 17.94, "eval_samples_per_second": 64.326, "eval_steps_per_second": 0.557, "step": 448 }, { "epoch": 64.28571428571429, "grad_norm": 9.268267631530762, "learning_rate": 0.00011904761904761903, "loss": 2.817, "step": 450 }, { "epoch": 65.0, "eval_accuracy": 0.7781629116117851, "eval_loss": 0.8645418286323547, "eval_runtime": 18.2441, "eval_samples_per_second": 63.253, "eval_steps_per_second": 0.548, "step": 455 }, { "epoch": 65.71428571428571, "grad_norm": 6.703152179718018, "learning_rate": 0.00011428571428571427, "loss": 2.7593, "step": 460 }, { "epoch": 66.0, "eval_accuracy": 0.7686308492201039, "eval_loss": 0.8828719854354858, "eval_runtime": 18.1608, "eval_samples_per_second": 63.543, "eval_steps_per_second": 0.551, "step": 462 }, { "epoch": 67.0, "eval_accuracy": 0.7790294627383015, "eval_loss": 0.8883015513420105, "eval_runtime": 18.1166, "eval_samples_per_second": 63.698, "eval_steps_per_second": 0.552, "step": 469 }, { "epoch": 67.14285714285714, "grad_norm": 5.34393310546875, "learning_rate": 0.0001095238095238095, "loss": 2.9212, "step": 470 }, { "epoch": 68.0, "eval_accuracy": 0.7824956672443674, "eval_loss": 0.8507192134857178, "eval_runtime": 18.0504, "eval_samples_per_second": 63.932, "eval_steps_per_second": 0.554, "step": 476 }, { "epoch": 68.57142857142857, "grad_norm": 6.5966668128967285, "learning_rate": 0.00010476190476190474, "loss": 2.8659, "step": 480 }, { "epoch": 69.0, "eval_accuracy": 0.7876949740034662, "eval_loss": 0.8553578853607178, "eval_runtime": 18.0681, "eval_samples_per_second": 63.869, "eval_steps_per_second": 0.553, "step": 483 }, { "epoch": 70.0, "grad_norm": 22.730220794677734, "learning_rate": 9.999999999999999e-05, "loss": 2.9068, "step": 490 }, { "epoch": 70.0, "eval_accuracy": 0.7764298093587522, "eval_loss": 0.8812502026557922, "eval_runtime": 17.9671, "eval_samples_per_second": 64.229, "eval_steps_per_second": 0.557, "step": 490 }, { "epoch": 71.0, "eval_accuracy": 0.7859618717504333, "eval_loss": 0.8555229902267456, "eval_runtime": 18.0711, "eval_samples_per_second": 63.859, "eval_steps_per_second": 0.553, "step": 497 }, { "epoch": 71.42857142857143, "grad_norm": 5.773199558258057, "learning_rate": 9.523809523809523e-05, "loss": 2.8334, "step": 500 }, { "epoch": 72.0, "eval_accuracy": 0.7790294627383015, "eval_loss": 0.8665823340415955, "eval_runtime": 18.4819, "eval_samples_per_second": 62.439, "eval_steps_per_second": 0.541, "step": 504 }, { "epoch": 72.85714285714286, "grad_norm": 6.063803672790527, "learning_rate": 9.047619047619046e-05, "loss": 2.7322, "step": 510 }, { "epoch": 73.0, "eval_accuracy": 0.7824956672443674, "eval_loss": 0.8682228922843933, "eval_runtime": 18.1239, "eval_samples_per_second": 63.673, "eval_steps_per_second": 0.552, "step": 511 }, { "epoch": 74.0, "eval_accuracy": 0.7885615251299827, "eval_loss": 0.881618320941925, "eval_runtime": 17.8842, "eval_samples_per_second": 64.526, "eval_steps_per_second": 0.559, "step": 518 }, { "epoch": 74.28571428571429, "grad_norm": 5.207172870635986, "learning_rate": 8.57142857142857e-05, "loss": 2.8548, "step": 520 }, { "epoch": 75.0, "eval_accuracy": 0.7902946273830156, "eval_loss": 0.8523378968238831, "eval_runtime": 18.1134, "eval_samples_per_second": 63.71, "eval_steps_per_second": 0.552, "step": 525 }, { "epoch": 75.71428571428571, "grad_norm": 6.294586658477783, "learning_rate": 8.095238095238093e-05, "loss": 2.8696, "step": 530 }, { "epoch": 76.0, "eval_accuracy": 0.7894280762564991, "eval_loss": 0.8509147763252258, "eval_runtime": 18.182, "eval_samples_per_second": 63.469, "eval_steps_per_second": 0.55, "step": 532 }, { "epoch": 77.0, "eval_accuracy": 0.7807625649913345, "eval_loss": 0.8682960867881775, "eval_runtime": 18.3628, "eval_samples_per_second": 62.845, "eval_steps_per_second": 0.545, "step": 539 }, { "epoch": 77.14285714285714, "grad_norm": 5.558056831359863, "learning_rate": 7.619047619047618e-05, "loss": 2.6439, "step": 540 }, { "epoch": 78.0, "eval_accuracy": 0.7876949740034662, "eval_loss": 0.860653281211853, "eval_runtime": 18.2632, "eval_samples_per_second": 63.187, "eval_steps_per_second": 0.548, "step": 546 }, { "epoch": 78.57142857142857, "grad_norm": 5.7894415855407715, "learning_rate": 7.142857142857142e-05, "loss": 2.9039, "step": 550 }, { "epoch": 79.0, "eval_accuracy": 0.7842287694974004, "eval_loss": 0.8698387742042542, "eval_runtime": 18.1385, "eval_samples_per_second": 63.622, "eval_steps_per_second": 0.551, "step": 553 }, { "epoch": 80.0, "grad_norm": 28.787755966186523, "learning_rate": 6.666666666666666e-05, "loss": 2.6338, "step": 560 }, { "epoch": 80.0, "eval_accuracy": 0.7876949740034662, "eval_loss": 0.8718376755714417, "eval_runtime": 18.0357, "eval_samples_per_second": 63.984, "eval_steps_per_second": 0.554, "step": 560 }, { "epoch": 81.0, "eval_accuracy": 0.7902946273830156, "eval_loss": 0.8370843529701233, "eval_runtime": 18.1407, "eval_samples_per_second": 63.614, "eval_steps_per_second": 0.551, "step": 567 }, { "epoch": 81.42857142857143, "grad_norm": 6.290432929992676, "learning_rate": 6.190476190476189e-05, "loss": 2.7271, "step": 570 }, { "epoch": 82.0, "eval_accuracy": 0.792894280762565, "eval_loss": 0.8426641821861267, "eval_runtime": 17.8494, "eval_samples_per_second": 64.652, "eval_steps_per_second": 0.56, "step": 574 }, { "epoch": 82.85714285714286, "grad_norm": 4.4193525314331055, "learning_rate": 5.7142857142857135e-05, "loss": 2.7555, "step": 580 }, { "epoch": 83.0, "eval_accuracy": 0.7937608318890814, "eval_loss": 0.8621939420700073, "eval_runtime": 17.8242, "eval_samples_per_second": 64.743, "eval_steps_per_second": 0.561, "step": 581 }, { "epoch": 84.0, "eval_accuracy": 0.7859618717504333, "eval_loss": 0.8768612146377563, "eval_runtime": 17.9828, "eval_samples_per_second": 64.172, "eval_steps_per_second": 0.556, "step": 588 }, { "epoch": 84.28571428571429, "grad_norm": 5.777393341064453, "learning_rate": 5.238095238095237e-05, "loss": 2.7702, "step": 590 }, { "epoch": 85.0, "eval_accuracy": 0.7859618717504333, "eval_loss": 0.88438481092453, "eval_runtime": 17.8963, "eval_samples_per_second": 64.483, "eval_steps_per_second": 0.559, "step": 595 }, { "epoch": 85.71428571428571, "grad_norm": 5.748138904571533, "learning_rate": 4.7619047619047614e-05, "loss": 2.8678, "step": 600 }, { "epoch": 86.0, "eval_accuracy": 0.7824956672443674, "eval_loss": 0.8882182836532593, "eval_runtime": 17.8524, "eval_samples_per_second": 64.641, "eval_steps_per_second": 0.56, "step": 602 }, { "epoch": 87.0, "eval_accuracy": 0.7824956672443674, "eval_loss": 0.8715818524360657, "eval_runtime": 17.8328, "eval_samples_per_second": 64.712, "eval_steps_per_second": 0.561, "step": 609 }, { "epoch": 87.14285714285714, "grad_norm": 4.612086772918701, "learning_rate": 4.285714285714285e-05, "loss": 2.6334, "step": 610 }, { "epoch": 88.0, "eval_accuracy": 0.7781629116117851, "eval_loss": 0.8782148361206055, "eval_runtime": 17.9213, "eval_samples_per_second": 64.393, "eval_steps_per_second": 0.558, "step": 616 }, { "epoch": 88.57142857142857, "grad_norm": 6.36035680770874, "learning_rate": 3.809523809523809e-05, "loss": 2.7782, "step": 620 }, { "epoch": 89.0, "eval_accuracy": 0.7807625649913345, "eval_loss": 0.8752433657646179, "eval_runtime": 18.042, "eval_samples_per_second": 63.962, "eval_steps_per_second": 0.554, "step": 623 }, { "epoch": 90.0, "grad_norm": 6.581643581390381, "learning_rate": 3.333333333333333e-05, "loss": 2.5527, "step": 630 }, { "epoch": 90.0, "eval_accuracy": 0.7807625649913345, "eval_loss": 0.8674911856651306, "eval_runtime": 17.811, "eval_samples_per_second": 64.791, "eval_steps_per_second": 0.561, "step": 630 }, { "epoch": 91.0, "eval_accuracy": 0.7842287694974004, "eval_loss": 0.8734576106071472, "eval_runtime": 17.906, "eval_samples_per_second": 64.448, "eval_steps_per_second": 0.558, "step": 637 }, { "epoch": 91.42857142857143, "grad_norm": 6.266481399536133, "learning_rate": 2.8571428571428567e-05, "loss": 2.6812, "step": 640 }, { "epoch": 92.0, "eval_accuracy": 0.7885615251299827, "eval_loss": 0.8649889826774597, "eval_runtime": 18.1196, "eval_samples_per_second": 63.688, "eval_steps_per_second": 0.552, "step": 644 }, { "epoch": 92.85714285714286, "grad_norm": 5.178635597229004, "learning_rate": 2.3809523809523807e-05, "loss": 2.6167, "step": 650 }, { "epoch": 93.0, "eval_accuracy": 0.7946273830155979, "eval_loss": 0.8530935049057007, "eval_runtime": 17.8992, "eval_samples_per_second": 64.472, "eval_steps_per_second": 0.559, "step": 651 }, { "epoch": 94.0, "eval_accuracy": 0.7868284228769498, "eval_loss": 0.8698766827583313, "eval_runtime": 17.9684, "eval_samples_per_second": 64.224, "eval_steps_per_second": 0.557, "step": 658 }, { "epoch": 94.28571428571429, "grad_norm": 4.488171100616455, "learning_rate": 1.9047619047619046e-05, "loss": 2.6553, "step": 660 }, { "epoch": 95.0, "eval_accuracy": 0.7894280762564991, "eval_loss": 0.8666642308235168, "eval_runtime": 17.9669, "eval_samples_per_second": 64.229, "eval_steps_per_second": 0.557, "step": 665 }, { "epoch": 95.71428571428571, "grad_norm": 6.009092330932617, "learning_rate": 1.4285714285714284e-05, "loss": 2.7758, "step": 670 }, { "epoch": 96.0, "eval_accuracy": 0.7920277296360485, "eval_loss": 0.8650416731834412, "eval_runtime": 18.0841, "eval_samples_per_second": 63.813, "eval_steps_per_second": 0.553, "step": 672 }, { "epoch": 97.0, "eval_accuracy": 0.7902946273830156, "eval_loss": 0.8684815764427185, "eval_runtime": 17.8482, "eval_samples_per_second": 64.656, "eval_steps_per_second": 0.56, "step": 679 }, { "epoch": 97.14285714285714, "grad_norm": 5.19600772857666, "learning_rate": 9.523809523809523e-06, "loss": 2.6592, "step": 680 }, { "epoch": 98.0, "eval_accuracy": 0.7885615251299827, "eval_loss": 0.8592236042022705, "eval_runtime": 17.9065, "eval_samples_per_second": 64.446, "eval_steps_per_second": 0.558, "step": 686 }, { "epoch": 98.57142857142857, "grad_norm": 5.676305770874023, "learning_rate": 4.7619047619047615e-06, "loss": 2.5202, "step": 690 }, { "epoch": 99.0, "eval_accuracy": 0.7894280762564991, "eval_loss": 0.8744557499885559, "eval_runtime": 17.8619, "eval_samples_per_second": 64.607, "eval_steps_per_second": 0.56, "step": 693 }, { "epoch": 100.0, "grad_norm": 48.86530685424805, "learning_rate": 0.0, "loss": 2.6577, "step": 700 }, { "epoch": 100.0, "eval_accuracy": 0.7954939341421143, "eval_loss": 0.8635059595108032, "eval_runtime": 18.1084, "eval_samples_per_second": 63.727, "eval_steps_per_second": 0.552, "step": 700 }, { "epoch": 100.0, "step": 700, "total_flos": 6.134894724962304e+18, "train_loss": 3.4289448138645717, "train_runtime": 8283.6476, "train_samples_per_second": 41.793, "train_steps_per_second": 0.085 } ], "logging_steps": 10, "max_steps": 700, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.134894724962304e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }