cvt-13-normal / trainer_state.json
desarrolloasesoreslocales's picture
Training in progress, epoch 1
73037a6 verified
{
"best_metric": 0.7954939341421143,
"best_model_checkpoint": "cvt-13-normal/checkpoint-700",
"epoch": 100.0,
"eval_steps": 500,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_accuracy": 0.7105719237435009,
"eval_loss": 1.0209927558898926,
"eval_runtime": 17.9138,
"eval_samples_per_second": 64.419,
"eval_steps_per_second": 0.558,
"step": 7
},
{
"epoch": 1.4285714285714286,
"grad_norm": 7.967917442321777,
"learning_rate": 4.285714285714285e-05,
"loss": 5.5642,
"step": 10
},
{
"epoch": 2.0,
"eval_accuracy": 0.7097053726169844,
"eval_loss": 1.0071666240692139,
"eval_runtime": 17.7355,
"eval_samples_per_second": 65.067,
"eval_steps_per_second": 0.564,
"step": 14
},
{
"epoch": 2.857142857142857,
"grad_norm": 8.133280754089355,
"learning_rate": 8.57142857142857e-05,
"loss": 5.662,
"step": 20
},
{
"epoch": 3.0,
"eval_accuracy": 0.708838821490468,
"eval_loss": 1.0150678157806396,
"eval_runtime": 17.8577,
"eval_samples_per_second": 64.622,
"eval_steps_per_second": 0.56,
"step": 21
},
{
"epoch": 4.0,
"eval_accuracy": 0.7140381282495667,
"eval_loss": 1.0016363859176636,
"eval_runtime": 17.837,
"eval_samples_per_second": 64.697,
"eval_steps_per_second": 0.561,
"step": 28
},
{
"epoch": 4.285714285714286,
"grad_norm": 8.433135986328125,
"learning_rate": 0.00012857142857142855,
"loss": 5.381,
"step": 30
},
{
"epoch": 5.0,
"eval_accuracy": 0.7123050259965338,
"eval_loss": 1.0119163990020752,
"eval_runtime": 17.7345,
"eval_samples_per_second": 65.071,
"eval_steps_per_second": 0.564,
"step": 35
},
{
"epoch": 5.714285714285714,
"grad_norm": 9.856744766235352,
"learning_rate": 0.0001714285714285714,
"loss": 5.3348,
"step": 40
},
{
"epoch": 6.0,
"eval_accuracy": 0.720103986135182,
"eval_loss": 0.9661750793457031,
"eval_runtime": 17.9039,
"eval_samples_per_second": 64.455,
"eval_steps_per_second": 0.559,
"step": 42
},
{
"epoch": 7.0,
"eval_accuracy": 0.7261698440207972,
"eval_loss": 0.9513705372810364,
"eval_runtime": 17.8649,
"eval_samples_per_second": 64.596,
"eval_steps_per_second": 0.56,
"step": 49
},
{
"epoch": 7.142857142857143,
"grad_norm": 10.7362699508667,
"learning_rate": 0.00021428571428571427,
"loss": 5.2423,
"step": 50
},
{
"epoch": 8.0,
"eval_accuracy": 0.7105719237435009,
"eval_loss": 0.9588707685470581,
"eval_runtime": 17.8964,
"eval_samples_per_second": 64.482,
"eval_steps_per_second": 0.559,
"step": 56
},
{
"epoch": 8.571428571428571,
"grad_norm": 11.099422454833984,
"learning_rate": 0.0002571428571428571,
"loss": 5.0251,
"step": 60
},
{
"epoch": 9.0,
"eval_accuracy": 0.7279029462738301,
"eval_loss": 0.908963680267334,
"eval_runtime": 17.9404,
"eval_samples_per_second": 64.324,
"eval_steps_per_second": 0.557,
"step": 63
},
{
"epoch": 10.0,
"grad_norm": 16.643394470214844,
"learning_rate": 0.0003,
"loss": 5.0547,
"step": 70
},
{
"epoch": 10.0,
"eval_accuracy": 0.7123050259965338,
"eval_loss": 0.9352001547813416,
"eval_runtime": 18.2788,
"eval_samples_per_second": 63.133,
"eval_steps_per_second": 0.547,
"step": 70
},
{
"epoch": 11.0,
"eval_accuracy": 0.6993067590987868,
"eval_loss": 1.0062916278839111,
"eval_runtime": 17.9901,
"eval_samples_per_second": 64.146,
"eval_steps_per_second": 0.556,
"step": 77
},
{
"epoch": 11.428571428571429,
"grad_norm": 9.376890182495117,
"learning_rate": 0.0002952380952380952,
"loss": 4.8246,
"step": 80
},
{
"epoch": 12.0,
"eval_accuracy": 0.7105719237435009,
"eval_loss": 0.9190986752510071,
"eval_runtime": 18.1793,
"eval_samples_per_second": 63.479,
"eval_steps_per_second": 0.55,
"step": 84
},
{
"epoch": 12.857142857142858,
"grad_norm": 7.629549026489258,
"learning_rate": 0.00029047619047619045,
"loss": 4.7811,
"step": 90
},
{
"epoch": 13.0,
"eval_accuracy": 0.7123050259965338,
"eval_loss": 0.9947251677513123,
"eval_runtime": 17.9036,
"eval_samples_per_second": 64.456,
"eval_steps_per_second": 0.559,
"step": 91
},
{
"epoch": 14.0,
"eval_accuracy": 0.7175043327556326,
"eval_loss": 0.9671235084533691,
"eval_runtime": 18.1306,
"eval_samples_per_second": 63.649,
"eval_steps_per_second": 0.552,
"step": 98
},
{
"epoch": 14.285714285714286,
"grad_norm": 13.771581649780273,
"learning_rate": 0.0002857142857142857,
"loss": 4.8234,
"step": 100
},
{
"epoch": 15.0,
"eval_accuracy": 0.7235701906412478,
"eval_loss": 0.9055125117301941,
"eval_runtime": 18.3144,
"eval_samples_per_second": 63.01,
"eval_steps_per_second": 0.546,
"step": 105
},
{
"epoch": 15.714285714285714,
"grad_norm": 9.288651466369629,
"learning_rate": 0.0002809523809523809,
"loss": 4.4787,
"step": 110
},
{
"epoch": 16.0,
"eval_accuracy": 0.744367417677643,
"eval_loss": 0.8837802410125732,
"eval_runtime": 18.2071,
"eval_samples_per_second": 63.382,
"eval_steps_per_second": 0.549,
"step": 112
},
{
"epoch": 17.0,
"eval_accuracy": 0.729636048526863,
"eval_loss": 0.9059325456619263,
"eval_runtime": 18.0331,
"eval_samples_per_second": 63.994,
"eval_steps_per_second": 0.555,
"step": 119
},
{
"epoch": 17.142857142857142,
"grad_norm": 8.790782928466797,
"learning_rate": 0.00027619047619047615,
"loss": 4.39,
"step": 120
},
{
"epoch": 18.0,
"eval_accuracy": 0.7461005199306759,
"eval_loss": 0.8639523983001709,
"eval_runtime": 18.0609,
"eval_samples_per_second": 63.895,
"eval_steps_per_second": 0.554,
"step": 126
},
{
"epoch": 18.571428571428573,
"grad_norm": 7.883941650390625,
"learning_rate": 0.0002714285714285714,
"loss": 4.1424,
"step": 130
},
{
"epoch": 19.0,
"eval_accuracy": 0.7487001733102253,
"eval_loss": 0.8660562634468079,
"eval_runtime": 17.7478,
"eval_samples_per_second": 65.022,
"eval_steps_per_second": 0.563,
"step": 133
},
{
"epoch": 20.0,
"grad_norm": 21.828325271606445,
"learning_rate": 0.0002666666666666666,
"loss": 4.1065,
"step": 140
},
{
"epoch": 20.0,
"eval_accuracy": 0.7305025996533796,
"eval_loss": 0.9056758284568787,
"eval_runtime": 17.8484,
"eval_samples_per_second": 64.656,
"eval_steps_per_second": 0.56,
"step": 140
},
{
"epoch": 21.0,
"eval_accuracy": 0.7348353552859619,
"eval_loss": 0.8865219354629517,
"eval_runtime": 18.0329,
"eval_samples_per_second": 63.994,
"eval_steps_per_second": 0.555,
"step": 147
},
{
"epoch": 21.428571428571427,
"grad_norm": 7.540792465209961,
"learning_rate": 0.00026190476190476186,
"loss": 4.0844,
"step": 150
},
{
"epoch": 22.0,
"eval_accuracy": 0.7391681109185442,
"eval_loss": 0.8928019404411316,
"eval_runtime": 17.9197,
"eval_samples_per_second": 64.398,
"eval_steps_per_second": 0.558,
"step": 154
},
{
"epoch": 22.857142857142858,
"grad_norm": 14.240620613098145,
"learning_rate": 0.0002571428571428571,
"loss": 3.9835,
"step": 160
},
{
"epoch": 23.0,
"eval_accuracy": 0.7538994800693241,
"eval_loss": 0.8675404787063599,
"eval_runtime": 18.0176,
"eval_samples_per_second": 64.048,
"eval_steps_per_second": 0.555,
"step": 161
},
{
"epoch": 24.0,
"eval_accuracy": 0.755632582322357,
"eval_loss": 0.8828888535499573,
"eval_runtime": 17.7466,
"eval_samples_per_second": 65.027,
"eval_steps_per_second": 0.563,
"step": 168
},
{
"epoch": 24.285714285714285,
"grad_norm": 8.749543190002441,
"learning_rate": 0.0002523809523809524,
"loss": 3.8199,
"step": 170
},
{
"epoch": 25.0,
"eval_accuracy": 0.7616984402079723,
"eval_loss": 0.8176947832107544,
"eval_runtime": 17.983,
"eval_samples_per_second": 64.172,
"eval_steps_per_second": 0.556,
"step": 175
},
{
"epoch": 25.714285714285715,
"grad_norm": 9.475801467895508,
"learning_rate": 0.00024761904761904757,
"loss": 3.7898,
"step": 180
},
{
"epoch": 26.0,
"eval_accuracy": 0.7461005199306759,
"eval_loss": 0.8885547518730164,
"eval_runtime": 18.0273,
"eval_samples_per_second": 64.014,
"eval_steps_per_second": 0.555,
"step": 182
},
{
"epoch": 27.0,
"eval_accuracy": 0.7461005199306759,
"eval_loss": 0.9394861459732056,
"eval_runtime": 18.1419,
"eval_samples_per_second": 63.61,
"eval_steps_per_second": 0.551,
"step": 189
},
{
"epoch": 27.142857142857142,
"grad_norm": 7.944543361663818,
"learning_rate": 0.00024285714285714283,
"loss": 3.7734,
"step": 190
},
{
"epoch": 28.0,
"eval_accuracy": 0.7608318890814558,
"eval_loss": 0.8348239064216614,
"eval_runtime": 17.9109,
"eval_samples_per_second": 64.43,
"eval_steps_per_second": 0.558,
"step": 196
},
{
"epoch": 28.571428571428573,
"grad_norm": 9.20173168182373,
"learning_rate": 0.00023809523809523807,
"loss": 3.7835,
"step": 200
},
{
"epoch": 29.0,
"eval_accuracy": 0.75736568457539,
"eval_loss": 0.836903989315033,
"eval_runtime": 18.1677,
"eval_samples_per_second": 63.519,
"eval_steps_per_second": 0.55,
"step": 203
},
{
"epoch": 30.0,
"grad_norm": 17.463150024414062,
"learning_rate": 0.0002333333333333333,
"loss": 3.6414,
"step": 210
},
{
"epoch": 30.0,
"eval_accuracy": 0.7660311958405546,
"eval_loss": 0.8668186664581299,
"eval_runtime": 17.8247,
"eval_samples_per_second": 64.742,
"eval_steps_per_second": 0.561,
"step": 210
},
{
"epoch": 31.0,
"eval_accuracy": 0.7599653379549394,
"eval_loss": 0.8909233808517456,
"eval_runtime": 18.1581,
"eval_samples_per_second": 63.553,
"eval_steps_per_second": 0.551,
"step": 217
},
{
"epoch": 31.428571428571427,
"grad_norm": 13.756216049194336,
"learning_rate": 0.00022857142857142854,
"loss": 3.5076,
"step": 220
},
{
"epoch": 32.0,
"eval_accuracy": 0.7495667244367418,
"eval_loss": 0.8795309066772461,
"eval_runtime": 17.8514,
"eval_samples_per_second": 64.645,
"eval_steps_per_second": 0.56,
"step": 224
},
{
"epoch": 32.857142857142854,
"grad_norm": 9.03218936920166,
"learning_rate": 0.0002238095238095238,
"loss": 3.5447,
"step": 230
},
{
"epoch": 33.0,
"eval_accuracy": 0.7538994800693241,
"eval_loss": 0.9227800369262695,
"eval_runtime": 17.9657,
"eval_samples_per_second": 64.233,
"eval_steps_per_second": 0.557,
"step": 231
},
{
"epoch": 34.0,
"eval_accuracy": 0.7521663778162911,
"eval_loss": 0.8850377798080444,
"eval_runtime": 17.9906,
"eval_samples_per_second": 64.144,
"eval_steps_per_second": 0.556,
"step": 238
},
{
"epoch": 34.285714285714285,
"grad_norm": 7.675583839416504,
"learning_rate": 0.000219047619047619,
"loss": 3.5344,
"step": 240
},
{
"epoch": 35.0,
"eval_accuracy": 0.7651646447140381,
"eval_loss": 0.8584573864936829,
"eval_runtime": 18.1255,
"eval_samples_per_second": 63.667,
"eval_steps_per_second": 0.552,
"step": 245
},
{
"epoch": 35.714285714285715,
"grad_norm": 7.848378658294678,
"learning_rate": 0.00021428571428571427,
"loss": 3.3678,
"step": 250
},
{
"epoch": 36.0,
"eval_accuracy": 0.75736568457539,
"eval_loss": 0.8631114959716797,
"eval_runtime": 18.0275,
"eval_samples_per_second": 64.013,
"eval_steps_per_second": 0.555,
"step": 252
},
{
"epoch": 37.0,
"eval_accuracy": 0.770363951473137,
"eval_loss": 0.8675860166549683,
"eval_runtime": 18.0196,
"eval_samples_per_second": 64.042,
"eval_steps_per_second": 0.555,
"step": 259
},
{
"epoch": 37.142857142857146,
"grad_norm": 9.06800651550293,
"learning_rate": 0.00020952380952380948,
"loss": 3.4061,
"step": 260
},
{
"epoch": 38.0,
"eval_accuracy": 0.7616984402079723,
"eval_loss": 0.9131080508232117,
"eval_runtime": 17.9025,
"eval_samples_per_second": 64.46,
"eval_steps_per_second": 0.559,
"step": 266
},
{
"epoch": 38.57142857142857,
"grad_norm": 11.665525436401367,
"learning_rate": 0.00020476190476190475,
"loss": 3.3177,
"step": 270
},
{
"epoch": 39.0,
"eval_accuracy": 0.7677642980935875,
"eval_loss": 0.8631002902984619,
"eval_runtime": 17.9771,
"eval_samples_per_second": 64.193,
"eval_steps_per_second": 0.556,
"step": 273
},
{
"epoch": 40.0,
"grad_norm": 15.023707389831543,
"learning_rate": 0.00019999999999999998,
"loss": 3.2767,
"step": 280
},
{
"epoch": 40.0,
"eval_accuracy": 0.7642980935875217,
"eval_loss": 0.8802210092544556,
"eval_runtime": 17.9247,
"eval_samples_per_second": 64.381,
"eval_steps_per_second": 0.558,
"step": 280
},
{
"epoch": 41.0,
"eval_accuracy": 0.7677642980935875,
"eval_loss": 0.8518037796020508,
"eval_runtime": 18.183,
"eval_samples_per_second": 63.466,
"eval_steps_per_second": 0.55,
"step": 287
},
{
"epoch": 41.42857142857143,
"grad_norm": 8.431020736694336,
"learning_rate": 0.00019523809523809522,
"loss": 3.1992,
"step": 290
},
{
"epoch": 42.0,
"eval_accuracy": 0.75736568457539,
"eval_loss": 0.923156201839447,
"eval_runtime": 18.0318,
"eval_samples_per_second": 63.998,
"eval_steps_per_second": 0.555,
"step": 294
},
{
"epoch": 42.857142857142854,
"grad_norm": 8.130815505981445,
"learning_rate": 0.00019047619047619045,
"loss": 3.2743,
"step": 300
},
{
"epoch": 43.0,
"eval_accuracy": 0.7521663778162911,
"eval_loss": 0.9305623173713684,
"eval_runtime": 17.9901,
"eval_samples_per_second": 64.146,
"eval_steps_per_second": 0.556,
"step": 301
},
{
"epoch": 44.0,
"eval_accuracy": 0.7755632582322357,
"eval_loss": 0.8419708013534546,
"eval_runtime": 17.9031,
"eval_samples_per_second": 64.458,
"eval_steps_per_second": 0.559,
"step": 308
},
{
"epoch": 44.285714285714285,
"grad_norm": 9.007019996643066,
"learning_rate": 0.00018571428571428572,
"loss": 3.1704,
"step": 310
},
{
"epoch": 45.0,
"eval_accuracy": 0.7564991334488734,
"eval_loss": 0.8801714777946472,
"eval_runtime": 17.8984,
"eval_samples_per_second": 64.475,
"eval_steps_per_second": 0.559,
"step": 315
},
{
"epoch": 45.714285714285715,
"grad_norm": 8.079572677612305,
"learning_rate": 0.00018095238095238093,
"loss": 3.2466,
"step": 320
},
{
"epoch": 46.0,
"eval_accuracy": 0.7677642980935875,
"eval_loss": 0.878183901309967,
"eval_runtime": 18.135,
"eval_samples_per_second": 63.634,
"eval_steps_per_second": 0.551,
"step": 322
},
{
"epoch": 47.0,
"eval_accuracy": 0.7746967071057193,
"eval_loss": 0.844364583492279,
"eval_runtime": 18.003,
"eval_samples_per_second": 64.1,
"eval_steps_per_second": 0.555,
"step": 329
},
{
"epoch": 47.142857142857146,
"grad_norm": 6.920067310333252,
"learning_rate": 0.0001761904761904762,
"loss": 3.0879,
"step": 330
},
{
"epoch": 48.0,
"eval_accuracy": 0.7694974003466204,
"eval_loss": 0.8579216003417969,
"eval_runtime": 17.8532,
"eval_samples_per_second": 64.638,
"eval_steps_per_second": 0.56,
"step": 336
},
{
"epoch": 48.57142857142857,
"grad_norm": 6.670530796051025,
"learning_rate": 0.0001714285714285714,
"loss": 3.1677,
"step": 340
},
{
"epoch": 49.0,
"eval_accuracy": 0.7712305025996534,
"eval_loss": 0.858402669429779,
"eval_runtime": 17.75,
"eval_samples_per_second": 65.014,
"eval_steps_per_second": 0.563,
"step": 343
},
{
"epoch": 50.0,
"grad_norm": 13.106241226196289,
"learning_rate": 0.00016666666666666666,
"loss": 3.0965,
"step": 350
},
{
"epoch": 50.0,
"eval_accuracy": 0.7755632582322357,
"eval_loss": 0.8400810956954956,
"eval_runtime": 18.0075,
"eval_samples_per_second": 64.084,
"eval_steps_per_second": 0.555,
"step": 350
},
{
"epoch": 51.0,
"eval_accuracy": 0.7651646447140381,
"eval_loss": 0.8724238872528076,
"eval_runtime": 18.0097,
"eval_samples_per_second": 64.077,
"eval_steps_per_second": 0.555,
"step": 357
},
{
"epoch": 51.42857142857143,
"grad_norm": 8.85236930847168,
"learning_rate": 0.00016190476190476187,
"loss": 3.0611,
"step": 360
},
{
"epoch": 52.0,
"eval_accuracy": 0.7807625649913345,
"eval_loss": 0.8638470768928528,
"eval_runtime": 18.0439,
"eval_samples_per_second": 63.955,
"eval_steps_per_second": 0.554,
"step": 364
},
{
"epoch": 52.857142857142854,
"grad_norm": 7.648194789886475,
"learning_rate": 0.00015714285714285713,
"loss": 3.0204,
"step": 370
},
{
"epoch": 53.0,
"eval_accuracy": 0.7660311958405546,
"eval_loss": 0.9167099595069885,
"eval_runtime": 17.9056,
"eval_samples_per_second": 64.449,
"eval_steps_per_second": 0.558,
"step": 371
},
{
"epoch": 54.0,
"eval_accuracy": 0.7738301559792028,
"eval_loss": 0.8322371244430542,
"eval_runtime": 17.9741,
"eval_samples_per_second": 64.204,
"eval_steps_per_second": 0.556,
"step": 378
},
{
"epoch": 54.285714285714285,
"grad_norm": 6.742936611175537,
"learning_rate": 0.00015238095238095237,
"loss": 2.9704,
"step": 380
},
{
"epoch": 55.0,
"eval_accuracy": 0.7642980935875217,
"eval_loss": 0.8577215671539307,
"eval_runtime": 18.0258,
"eval_samples_per_second": 64.019,
"eval_steps_per_second": 0.555,
"step": 385
},
{
"epoch": 55.714285714285715,
"grad_norm": 6.2735395431518555,
"learning_rate": 0.0001476190476190476,
"loss": 2.939,
"step": 390
},
{
"epoch": 56.0,
"eval_accuracy": 0.7859618717504333,
"eval_loss": 0.8296905755996704,
"eval_runtime": 18.0649,
"eval_samples_per_second": 63.881,
"eval_steps_per_second": 0.554,
"step": 392
},
{
"epoch": 57.0,
"eval_accuracy": 0.7686308492201039,
"eval_loss": 0.874596893787384,
"eval_runtime": 17.9658,
"eval_samples_per_second": 64.233,
"eval_steps_per_second": 0.557,
"step": 399
},
{
"epoch": 57.142857142857146,
"grad_norm": 6.44887113571167,
"learning_rate": 0.00014285714285714284,
"loss": 3.0341,
"step": 400
},
{
"epoch": 58.0,
"eval_accuracy": 0.7824956672443674,
"eval_loss": 0.8620171546936035,
"eval_runtime": 17.939,
"eval_samples_per_second": 64.329,
"eval_steps_per_second": 0.557,
"step": 406
},
{
"epoch": 58.57142857142857,
"grad_norm": 6.199102401733398,
"learning_rate": 0.00013809523809523808,
"loss": 2.8997,
"step": 410
},
{
"epoch": 59.0,
"eval_accuracy": 0.75736568457539,
"eval_loss": 0.8835130333900452,
"eval_runtime": 18.2434,
"eval_samples_per_second": 63.256,
"eval_steps_per_second": 0.548,
"step": 413
},
{
"epoch": 60.0,
"grad_norm": 27.795392990112305,
"learning_rate": 0.0001333333333333333,
"loss": 3.0187,
"step": 420
},
{
"epoch": 60.0,
"eval_accuracy": 0.7694974003466204,
"eval_loss": 0.9018464684486389,
"eval_runtime": 18.2513,
"eval_samples_per_second": 63.228,
"eval_steps_per_second": 0.548,
"step": 420
},
{
"epoch": 61.0,
"eval_accuracy": 0.7772963604852686,
"eval_loss": 0.8939943909645081,
"eval_runtime": 18.1365,
"eval_samples_per_second": 63.629,
"eval_steps_per_second": 0.551,
"step": 427
},
{
"epoch": 61.42857142857143,
"grad_norm": 10.215301513671875,
"learning_rate": 0.00012857142857142855,
"loss": 2.9316,
"step": 430
},
{
"epoch": 62.0,
"eval_accuracy": 0.7712305025996534,
"eval_loss": 0.8858510851860046,
"eval_runtime": 18.1655,
"eval_samples_per_second": 63.527,
"eval_steps_per_second": 0.55,
"step": 434
},
{
"epoch": 62.857142857142854,
"grad_norm": 5.105686187744141,
"learning_rate": 0.00012380952380952378,
"loss": 2.8746,
"step": 440
},
{
"epoch": 63.0,
"eval_accuracy": 0.7764298093587522,
"eval_loss": 0.8661392331123352,
"eval_runtime": 17.9626,
"eval_samples_per_second": 64.245,
"eval_steps_per_second": 0.557,
"step": 441
},
{
"epoch": 64.0,
"eval_accuracy": 0.7712305025996534,
"eval_loss": 0.8916440010070801,
"eval_runtime": 17.94,
"eval_samples_per_second": 64.326,
"eval_steps_per_second": 0.557,
"step": 448
},
{
"epoch": 64.28571428571429,
"grad_norm": 9.268267631530762,
"learning_rate": 0.00011904761904761903,
"loss": 2.817,
"step": 450
},
{
"epoch": 65.0,
"eval_accuracy": 0.7781629116117851,
"eval_loss": 0.8645418286323547,
"eval_runtime": 18.2441,
"eval_samples_per_second": 63.253,
"eval_steps_per_second": 0.548,
"step": 455
},
{
"epoch": 65.71428571428571,
"grad_norm": 6.703152179718018,
"learning_rate": 0.00011428571428571427,
"loss": 2.7593,
"step": 460
},
{
"epoch": 66.0,
"eval_accuracy": 0.7686308492201039,
"eval_loss": 0.8828719854354858,
"eval_runtime": 18.1608,
"eval_samples_per_second": 63.543,
"eval_steps_per_second": 0.551,
"step": 462
},
{
"epoch": 67.0,
"eval_accuracy": 0.7790294627383015,
"eval_loss": 0.8883015513420105,
"eval_runtime": 18.1166,
"eval_samples_per_second": 63.698,
"eval_steps_per_second": 0.552,
"step": 469
},
{
"epoch": 67.14285714285714,
"grad_norm": 5.34393310546875,
"learning_rate": 0.0001095238095238095,
"loss": 2.9212,
"step": 470
},
{
"epoch": 68.0,
"eval_accuracy": 0.7824956672443674,
"eval_loss": 0.8507192134857178,
"eval_runtime": 18.0504,
"eval_samples_per_second": 63.932,
"eval_steps_per_second": 0.554,
"step": 476
},
{
"epoch": 68.57142857142857,
"grad_norm": 6.5966668128967285,
"learning_rate": 0.00010476190476190474,
"loss": 2.8659,
"step": 480
},
{
"epoch": 69.0,
"eval_accuracy": 0.7876949740034662,
"eval_loss": 0.8553578853607178,
"eval_runtime": 18.0681,
"eval_samples_per_second": 63.869,
"eval_steps_per_second": 0.553,
"step": 483
},
{
"epoch": 70.0,
"grad_norm": 22.730220794677734,
"learning_rate": 9.999999999999999e-05,
"loss": 2.9068,
"step": 490
},
{
"epoch": 70.0,
"eval_accuracy": 0.7764298093587522,
"eval_loss": 0.8812502026557922,
"eval_runtime": 17.9671,
"eval_samples_per_second": 64.229,
"eval_steps_per_second": 0.557,
"step": 490
},
{
"epoch": 71.0,
"eval_accuracy": 0.7859618717504333,
"eval_loss": 0.8555229902267456,
"eval_runtime": 18.0711,
"eval_samples_per_second": 63.859,
"eval_steps_per_second": 0.553,
"step": 497
},
{
"epoch": 71.42857142857143,
"grad_norm": 5.773199558258057,
"learning_rate": 9.523809523809523e-05,
"loss": 2.8334,
"step": 500
},
{
"epoch": 72.0,
"eval_accuracy": 0.7790294627383015,
"eval_loss": 0.8665823340415955,
"eval_runtime": 18.4819,
"eval_samples_per_second": 62.439,
"eval_steps_per_second": 0.541,
"step": 504
},
{
"epoch": 72.85714285714286,
"grad_norm": 6.063803672790527,
"learning_rate": 9.047619047619046e-05,
"loss": 2.7322,
"step": 510
},
{
"epoch": 73.0,
"eval_accuracy": 0.7824956672443674,
"eval_loss": 0.8682228922843933,
"eval_runtime": 18.1239,
"eval_samples_per_second": 63.673,
"eval_steps_per_second": 0.552,
"step": 511
},
{
"epoch": 74.0,
"eval_accuracy": 0.7885615251299827,
"eval_loss": 0.881618320941925,
"eval_runtime": 17.8842,
"eval_samples_per_second": 64.526,
"eval_steps_per_second": 0.559,
"step": 518
},
{
"epoch": 74.28571428571429,
"grad_norm": 5.207172870635986,
"learning_rate": 8.57142857142857e-05,
"loss": 2.8548,
"step": 520
},
{
"epoch": 75.0,
"eval_accuracy": 0.7902946273830156,
"eval_loss": 0.8523378968238831,
"eval_runtime": 18.1134,
"eval_samples_per_second": 63.71,
"eval_steps_per_second": 0.552,
"step": 525
},
{
"epoch": 75.71428571428571,
"grad_norm": 6.294586658477783,
"learning_rate": 8.095238095238093e-05,
"loss": 2.8696,
"step": 530
},
{
"epoch": 76.0,
"eval_accuracy": 0.7894280762564991,
"eval_loss": 0.8509147763252258,
"eval_runtime": 18.182,
"eval_samples_per_second": 63.469,
"eval_steps_per_second": 0.55,
"step": 532
},
{
"epoch": 77.0,
"eval_accuracy": 0.7807625649913345,
"eval_loss": 0.8682960867881775,
"eval_runtime": 18.3628,
"eval_samples_per_second": 62.845,
"eval_steps_per_second": 0.545,
"step": 539
},
{
"epoch": 77.14285714285714,
"grad_norm": 5.558056831359863,
"learning_rate": 7.619047619047618e-05,
"loss": 2.6439,
"step": 540
},
{
"epoch": 78.0,
"eval_accuracy": 0.7876949740034662,
"eval_loss": 0.860653281211853,
"eval_runtime": 18.2632,
"eval_samples_per_second": 63.187,
"eval_steps_per_second": 0.548,
"step": 546
},
{
"epoch": 78.57142857142857,
"grad_norm": 5.7894415855407715,
"learning_rate": 7.142857142857142e-05,
"loss": 2.9039,
"step": 550
},
{
"epoch": 79.0,
"eval_accuracy": 0.7842287694974004,
"eval_loss": 0.8698387742042542,
"eval_runtime": 18.1385,
"eval_samples_per_second": 63.622,
"eval_steps_per_second": 0.551,
"step": 553
},
{
"epoch": 80.0,
"grad_norm": 28.787755966186523,
"learning_rate": 6.666666666666666e-05,
"loss": 2.6338,
"step": 560
},
{
"epoch": 80.0,
"eval_accuracy": 0.7876949740034662,
"eval_loss": 0.8718376755714417,
"eval_runtime": 18.0357,
"eval_samples_per_second": 63.984,
"eval_steps_per_second": 0.554,
"step": 560
},
{
"epoch": 81.0,
"eval_accuracy": 0.7902946273830156,
"eval_loss": 0.8370843529701233,
"eval_runtime": 18.1407,
"eval_samples_per_second": 63.614,
"eval_steps_per_second": 0.551,
"step": 567
},
{
"epoch": 81.42857142857143,
"grad_norm": 6.290432929992676,
"learning_rate": 6.190476190476189e-05,
"loss": 2.7271,
"step": 570
},
{
"epoch": 82.0,
"eval_accuracy": 0.792894280762565,
"eval_loss": 0.8426641821861267,
"eval_runtime": 17.8494,
"eval_samples_per_second": 64.652,
"eval_steps_per_second": 0.56,
"step": 574
},
{
"epoch": 82.85714285714286,
"grad_norm": 4.4193525314331055,
"learning_rate": 5.7142857142857135e-05,
"loss": 2.7555,
"step": 580
},
{
"epoch": 83.0,
"eval_accuracy": 0.7937608318890814,
"eval_loss": 0.8621939420700073,
"eval_runtime": 17.8242,
"eval_samples_per_second": 64.743,
"eval_steps_per_second": 0.561,
"step": 581
},
{
"epoch": 84.0,
"eval_accuracy": 0.7859618717504333,
"eval_loss": 0.8768612146377563,
"eval_runtime": 17.9828,
"eval_samples_per_second": 64.172,
"eval_steps_per_second": 0.556,
"step": 588
},
{
"epoch": 84.28571428571429,
"grad_norm": 5.777393341064453,
"learning_rate": 5.238095238095237e-05,
"loss": 2.7702,
"step": 590
},
{
"epoch": 85.0,
"eval_accuracy": 0.7859618717504333,
"eval_loss": 0.88438481092453,
"eval_runtime": 17.8963,
"eval_samples_per_second": 64.483,
"eval_steps_per_second": 0.559,
"step": 595
},
{
"epoch": 85.71428571428571,
"grad_norm": 5.748138904571533,
"learning_rate": 4.7619047619047614e-05,
"loss": 2.8678,
"step": 600
},
{
"epoch": 86.0,
"eval_accuracy": 0.7824956672443674,
"eval_loss": 0.8882182836532593,
"eval_runtime": 17.8524,
"eval_samples_per_second": 64.641,
"eval_steps_per_second": 0.56,
"step": 602
},
{
"epoch": 87.0,
"eval_accuracy": 0.7824956672443674,
"eval_loss": 0.8715818524360657,
"eval_runtime": 17.8328,
"eval_samples_per_second": 64.712,
"eval_steps_per_second": 0.561,
"step": 609
},
{
"epoch": 87.14285714285714,
"grad_norm": 4.612086772918701,
"learning_rate": 4.285714285714285e-05,
"loss": 2.6334,
"step": 610
},
{
"epoch": 88.0,
"eval_accuracy": 0.7781629116117851,
"eval_loss": 0.8782148361206055,
"eval_runtime": 17.9213,
"eval_samples_per_second": 64.393,
"eval_steps_per_second": 0.558,
"step": 616
},
{
"epoch": 88.57142857142857,
"grad_norm": 6.36035680770874,
"learning_rate": 3.809523809523809e-05,
"loss": 2.7782,
"step": 620
},
{
"epoch": 89.0,
"eval_accuracy": 0.7807625649913345,
"eval_loss": 0.8752433657646179,
"eval_runtime": 18.042,
"eval_samples_per_second": 63.962,
"eval_steps_per_second": 0.554,
"step": 623
},
{
"epoch": 90.0,
"grad_norm": 6.581643581390381,
"learning_rate": 3.333333333333333e-05,
"loss": 2.5527,
"step": 630
},
{
"epoch": 90.0,
"eval_accuracy": 0.7807625649913345,
"eval_loss": 0.8674911856651306,
"eval_runtime": 17.811,
"eval_samples_per_second": 64.791,
"eval_steps_per_second": 0.561,
"step": 630
},
{
"epoch": 91.0,
"eval_accuracy": 0.7842287694974004,
"eval_loss": 0.8734576106071472,
"eval_runtime": 17.906,
"eval_samples_per_second": 64.448,
"eval_steps_per_second": 0.558,
"step": 637
},
{
"epoch": 91.42857142857143,
"grad_norm": 6.266481399536133,
"learning_rate": 2.8571428571428567e-05,
"loss": 2.6812,
"step": 640
},
{
"epoch": 92.0,
"eval_accuracy": 0.7885615251299827,
"eval_loss": 0.8649889826774597,
"eval_runtime": 18.1196,
"eval_samples_per_second": 63.688,
"eval_steps_per_second": 0.552,
"step": 644
},
{
"epoch": 92.85714285714286,
"grad_norm": 5.178635597229004,
"learning_rate": 2.3809523809523807e-05,
"loss": 2.6167,
"step": 650
},
{
"epoch": 93.0,
"eval_accuracy": 0.7946273830155979,
"eval_loss": 0.8530935049057007,
"eval_runtime": 17.8992,
"eval_samples_per_second": 64.472,
"eval_steps_per_second": 0.559,
"step": 651
},
{
"epoch": 94.0,
"eval_accuracy": 0.7868284228769498,
"eval_loss": 0.8698766827583313,
"eval_runtime": 17.9684,
"eval_samples_per_second": 64.224,
"eval_steps_per_second": 0.557,
"step": 658
},
{
"epoch": 94.28571428571429,
"grad_norm": 4.488171100616455,
"learning_rate": 1.9047619047619046e-05,
"loss": 2.6553,
"step": 660
},
{
"epoch": 95.0,
"eval_accuracy": 0.7894280762564991,
"eval_loss": 0.8666642308235168,
"eval_runtime": 17.9669,
"eval_samples_per_second": 64.229,
"eval_steps_per_second": 0.557,
"step": 665
},
{
"epoch": 95.71428571428571,
"grad_norm": 6.009092330932617,
"learning_rate": 1.4285714285714284e-05,
"loss": 2.7758,
"step": 670
},
{
"epoch": 96.0,
"eval_accuracy": 0.7920277296360485,
"eval_loss": 0.8650416731834412,
"eval_runtime": 18.0841,
"eval_samples_per_second": 63.813,
"eval_steps_per_second": 0.553,
"step": 672
},
{
"epoch": 97.0,
"eval_accuracy": 0.7902946273830156,
"eval_loss": 0.8684815764427185,
"eval_runtime": 17.8482,
"eval_samples_per_second": 64.656,
"eval_steps_per_second": 0.56,
"step": 679
},
{
"epoch": 97.14285714285714,
"grad_norm": 5.19600772857666,
"learning_rate": 9.523809523809523e-06,
"loss": 2.6592,
"step": 680
},
{
"epoch": 98.0,
"eval_accuracy": 0.7885615251299827,
"eval_loss": 0.8592236042022705,
"eval_runtime": 17.9065,
"eval_samples_per_second": 64.446,
"eval_steps_per_second": 0.558,
"step": 686
},
{
"epoch": 98.57142857142857,
"grad_norm": 5.676305770874023,
"learning_rate": 4.7619047619047615e-06,
"loss": 2.5202,
"step": 690
},
{
"epoch": 99.0,
"eval_accuracy": 0.7894280762564991,
"eval_loss": 0.8744557499885559,
"eval_runtime": 17.8619,
"eval_samples_per_second": 64.607,
"eval_steps_per_second": 0.56,
"step": 693
},
{
"epoch": 100.0,
"grad_norm": 48.86530685424805,
"learning_rate": 0.0,
"loss": 2.6577,
"step": 700
},
{
"epoch": 100.0,
"eval_accuracy": 0.7954939341421143,
"eval_loss": 0.8635059595108032,
"eval_runtime": 18.1084,
"eval_samples_per_second": 63.727,
"eval_steps_per_second": 0.552,
"step": 700
},
{
"epoch": 100.0,
"step": 700,
"total_flos": 6.134894724962304e+18,
"train_loss": 3.4289448138645717,
"train_runtime": 8283.6476,
"train_samples_per_second": 41.793,
"train_steps_per_second": 0.085
}
],
"logging_steps": 10,
"max_steps": 700,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.134894724962304e+18,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}