top_8_ranking_stackexchange / trainer_state.json
sedrickkeh's picture
End of training
31a9934 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 882,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.034013605442176874,
"grad_norm": 5.093772950493088,
"learning_rate": 5e-06,
"loss": 1.0253,
"step": 10
},
{
"epoch": 0.06802721088435375,
"grad_norm": 1.582810434950759,
"learning_rate": 5e-06,
"loss": 0.9164,
"step": 20
},
{
"epoch": 0.10204081632653061,
"grad_norm": 1.081419957922015,
"learning_rate": 5e-06,
"loss": 0.8837,
"step": 30
},
{
"epoch": 0.1360544217687075,
"grad_norm": 1.334991004257816,
"learning_rate": 5e-06,
"loss": 0.8662,
"step": 40
},
{
"epoch": 0.17006802721088435,
"grad_norm": 0.6437293692108047,
"learning_rate": 5e-06,
"loss": 0.8474,
"step": 50
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.7291088099940165,
"learning_rate": 5e-06,
"loss": 0.8403,
"step": 60
},
{
"epoch": 0.23809523809523808,
"grad_norm": 1.2250728318971373,
"learning_rate": 5e-06,
"loss": 0.8324,
"step": 70
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.5271316099486635,
"learning_rate": 5e-06,
"loss": 0.8255,
"step": 80
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.657776756921105,
"learning_rate": 5e-06,
"loss": 0.8268,
"step": 90
},
{
"epoch": 0.3401360544217687,
"grad_norm": 0.6441066353244067,
"learning_rate": 5e-06,
"loss": 0.8244,
"step": 100
},
{
"epoch": 0.3741496598639456,
"grad_norm": 0.8285320708808935,
"learning_rate": 5e-06,
"loss": 0.8204,
"step": 110
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.7161434853368004,
"learning_rate": 5e-06,
"loss": 0.8162,
"step": 120
},
{
"epoch": 0.4421768707482993,
"grad_norm": 0.5513713742579371,
"learning_rate": 5e-06,
"loss": 0.8145,
"step": 130
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.7457252021496298,
"learning_rate": 5e-06,
"loss": 0.8055,
"step": 140
},
{
"epoch": 0.5102040816326531,
"grad_norm": 0.5974600771687919,
"learning_rate": 5e-06,
"loss": 0.8046,
"step": 150
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.5106810999294438,
"learning_rate": 5e-06,
"loss": 0.8038,
"step": 160
},
{
"epoch": 0.5782312925170068,
"grad_norm": 0.5233604769586638,
"learning_rate": 5e-06,
"loss": 0.7984,
"step": 170
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.6621030752821899,
"learning_rate": 5e-06,
"loss": 0.8053,
"step": 180
},
{
"epoch": 0.6462585034013606,
"grad_norm": 0.6082801478115033,
"learning_rate": 5e-06,
"loss": 0.7971,
"step": 190
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.6921185518271565,
"learning_rate": 5e-06,
"loss": 0.7962,
"step": 200
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.7932977516942055,
"learning_rate": 5e-06,
"loss": 0.7965,
"step": 210
},
{
"epoch": 0.7482993197278912,
"grad_norm": 0.6324128246753213,
"learning_rate": 5e-06,
"loss": 0.7934,
"step": 220
},
{
"epoch": 0.782312925170068,
"grad_norm": 0.559311808415173,
"learning_rate": 5e-06,
"loss": 0.7989,
"step": 230
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.6432666654821662,
"learning_rate": 5e-06,
"loss": 0.7969,
"step": 240
},
{
"epoch": 0.8503401360544217,
"grad_norm": 0.744813344130182,
"learning_rate": 5e-06,
"loss": 0.7919,
"step": 250
},
{
"epoch": 0.8843537414965986,
"grad_norm": 0.5433587318374555,
"learning_rate": 5e-06,
"loss": 0.7902,
"step": 260
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.7120343599921604,
"learning_rate": 5e-06,
"loss": 0.7906,
"step": 270
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.1772393318908365,
"learning_rate": 5e-06,
"loss": 0.7883,
"step": 280
},
{
"epoch": 0.9863945578231292,
"grad_norm": 1.289674010358365,
"learning_rate": 5e-06,
"loss": 0.7865,
"step": 290
},
{
"epoch": 1.0,
"eval_loss": 0.787599503993988,
"eval_runtime": 312.4094,
"eval_samples_per_second": 25.358,
"eval_steps_per_second": 0.397,
"step": 294
},
{
"epoch": 1.0204081632653061,
"grad_norm": 0.9036542003842971,
"learning_rate": 5e-06,
"loss": 0.7746,
"step": 300
},
{
"epoch": 1.054421768707483,
"grad_norm": 0.8034701067288486,
"learning_rate": 5e-06,
"loss": 0.7488,
"step": 310
},
{
"epoch": 1.08843537414966,
"grad_norm": 0.6023506564223867,
"learning_rate": 5e-06,
"loss": 0.7475,
"step": 320
},
{
"epoch": 1.1224489795918366,
"grad_norm": 0.6504967920737302,
"learning_rate": 5e-06,
"loss": 0.7431,
"step": 330
},
{
"epoch": 1.1564625850340136,
"grad_norm": 0.5813877515629605,
"learning_rate": 5e-06,
"loss": 0.7462,
"step": 340
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.6259897603352809,
"learning_rate": 5e-06,
"loss": 0.747,
"step": 350
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.6862271161840088,
"learning_rate": 5e-06,
"loss": 0.7455,
"step": 360
},
{
"epoch": 1.2585034013605443,
"grad_norm": 0.5769690007814378,
"learning_rate": 5e-06,
"loss": 0.7406,
"step": 370
},
{
"epoch": 1.2925170068027212,
"grad_norm": 0.557540098903492,
"learning_rate": 5e-06,
"loss": 0.7538,
"step": 380
},
{
"epoch": 1.3265306122448979,
"grad_norm": 0.7131020070740424,
"learning_rate": 5e-06,
"loss": 0.7458,
"step": 390
},
{
"epoch": 1.3605442176870748,
"grad_norm": 0.5948565780053893,
"learning_rate": 5e-06,
"loss": 0.7446,
"step": 400
},
{
"epoch": 1.3945578231292517,
"grad_norm": 0.6911445705490143,
"learning_rate": 5e-06,
"loss": 0.7497,
"step": 410
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.6252849135367671,
"learning_rate": 5e-06,
"loss": 0.7469,
"step": 420
},
{
"epoch": 1.4625850340136055,
"grad_norm": 0.6646202868499425,
"learning_rate": 5e-06,
"loss": 0.7424,
"step": 430
},
{
"epoch": 1.4965986394557822,
"grad_norm": 0.6970040399316466,
"learning_rate": 5e-06,
"loss": 0.7445,
"step": 440
},
{
"epoch": 1.5306122448979593,
"grad_norm": 0.6004043771512654,
"learning_rate": 5e-06,
"loss": 0.7492,
"step": 450
},
{
"epoch": 1.564625850340136,
"grad_norm": 0.6563540827889115,
"learning_rate": 5e-06,
"loss": 0.7423,
"step": 460
},
{
"epoch": 1.598639455782313,
"grad_norm": 0.7454418696472762,
"learning_rate": 5e-06,
"loss": 0.7434,
"step": 470
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.5564890350927186,
"learning_rate": 5e-06,
"loss": 0.7365,
"step": 480
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.7834978093049169,
"learning_rate": 5e-06,
"loss": 0.7421,
"step": 490
},
{
"epoch": 1.7006802721088436,
"grad_norm": 0.583066628701426,
"learning_rate": 5e-06,
"loss": 0.7463,
"step": 500
},
{
"epoch": 1.7346938775510203,
"grad_norm": 0.5893517422275613,
"learning_rate": 5e-06,
"loss": 0.7376,
"step": 510
},
{
"epoch": 1.7687074829931972,
"grad_norm": 0.5037802702044101,
"learning_rate": 5e-06,
"loss": 0.7441,
"step": 520
},
{
"epoch": 1.8027210884353742,
"grad_norm": 0.5931792416654984,
"learning_rate": 5e-06,
"loss": 0.7431,
"step": 530
},
{
"epoch": 1.836734693877551,
"grad_norm": 0.6402441098113224,
"learning_rate": 5e-06,
"loss": 0.7438,
"step": 540
},
{
"epoch": 1.870748299319728,
"grad_norm": 0.5654472975194821,
"learning_rate": 5e-06,
"loss": 0.7393,
"step": 550
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.5753064985895123,
"learning_rate": 5e-06,
"loss": 0.7397,
"step": 560
},
{
"epoch": 1.9387755102040818,
"grad_norm": 0.7250548373178971,
"learning_rate": 5e-06,
"loss": 0.7397,
"step": 570
},
{
"epoch": 1.9727891156462585,
"grad_norm": 0.5431763568539679,
"learning_rate": 5e-06,
"loss": 0.7399,
"step": 580
},
{
"epoch": 2.0,
"eval_loss": 0.7747776508331299,
"eval_runtime": 316.2944,
"eval_samples_per_second": 25.046,
"eval_steps_per_second": 0.392,
"step": 588
},
{
"epoch": 2.006802721088435,
"grad_norm": 1.0710107167046419,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 590
},
{
"epoch": 2.0408163265306123,
"grad_norm": 0.8179848172880277,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 600
},
{
"epoch": 2.074829931972789,
"grad_norm": 0.7213840408679012,
"learning_rate": 5e-06,
"loss": 0.6963,
"step": 610
},
{
"epoch": 2.108843537414966,
"grad_norm": 0.842144783291229,
"learning_rate": 5e-06,
"loss": 0.6919,
"step": 620
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.8281153884085551,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 630
},
{
"epoch": 2.17687074829932,
"grad_norm": 0.6466546656089918,
"learning_rate": 5e-06,
"loss": 0.696,
"step": 640
},
{
"epoch": 2.2108843537414966,
"grad_norm": 0.526364663799012,
"learning_rate": 5e-06,
"loss": 0.6937,
"step": 650
},
{
"epoch": 2.2448979591836733,
"grad_norm": 0.5634674036723205,
"learning_rate": 5e-06,
"loss": 0.6981,
"step": 660
},
{
"epoch": 2.2789115646258504,
"grad_norm": 0.5968432701967212,
"learning_rate": 5e-06,
"loss": 0.7015,
"step": 670
},
{
"epoch": 2.312925170068027,
"grad_norm": 0.5913462184907319,
"learning_rate": 5e-06,
"loss": 0.7012,
"step": 680
},
{
"epoch": 2.3469387755102042,
"grad_norm": 0.7450042075059763,
"learning_rate": 5e-06,
"loss": 0.6952,
"step": 690
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.5383186417121737,
"learning_rate": 5e-06,
"loss": 0.6953,
"step": 700
},
{
"epoch": 2.4149659863945576,
"grad_norm": 0.7331040417967113,
"learning_rate": 5e-06,
"loss": 0.6997,
"step": 710
},
{
"epoch": 2.4489795918367347,
"grad_norm": 0.707608838125245,
"learning_rate": 5e-06,
"loss": 0.6947,
"step": 720
},
{
"epoch": 2.4829931972789114,
"grad_norm": 0.5739038681907664,
"learning_rate": 5e-06,
"loss": 0.7002,
"step": 730
},
{
"epoch": 2.5170068027210886,
"grad_norm": 0.6882559394709682,
"learning_rate": 5e-06,
"loss": 0.7,
"step": 740
},
{
"epoch": 2.5510204081632653,
"grad_norm": 0.5748954947299686,
"learning_rate": 5e-06,
"loss": 0.6977,
"step": 750
},
{
"epoch": 2.5850340136054424,
"grad_norm": 0.5661948160767387,
"learning_rate": 5e-06,
"loss": 0.6972,
"step": 760
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.6170581920248837,
"learning_rate": 5e-06,
"loss": 0.6988,
"step": 770
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.7539930534954333,
"learning_rate": 5e-06,
"loss": 0.6977,
"step": 780
},
{
"epoch": 2.687074829931973,
"grad_norm": 0.5811347422760094,
"learning_rate": 5e-06,
"loss": 0.6987,
"step": 790
},
{
"epoch": 2.7210884353741496,
"grad_norm": 0.6704654193944088,
"learning_rate": 5e-06,
"loss": 0.6982,
"step": 800
},
{
"epoch": 2.7551020408163263,
"grad_norm": 0.6505538631815906,
"learning_rate": 5e-06,
"loss": 0.7006,
"step": 810
},
{
"epoch": 2.7891156462585034,
"grad_norm": 0.5672733814024299,
"learning_rate": 5e-06,
"loss": 0.7038,
"step": 820
},
{
"epoch": 2.8231292517006805,
"grad_norm": 0.6464062527883608,
"learning_rate": 5e-06,
"loss": 0.7031,
"step": 830
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.6783589732638796,
"learning_rate": 5e-06,
"loss": 0.7001,
"step": 840
},
{
"epoch": 2.891156462585034,
"grad_norm": 0.6780111327853662,
"learning_rate": 5e-06,
"loss": 0.6999,
"step": 850
},
{
"epoch": 2.925170068027211,
"grad_norm": 0.6861118715829543,
"learning_rate": 5e-06,
"loss": 0.7007,
"step": 860
},
{
"epoch": 2.9591836734693877,
"grad_norm": 0.6468827032168267,
"learning_rate": 5e-06,
"loss": 0.7029,
"step": 870
},
{
"epoch": 2.9931972789115644,
"grad_norm": 0.5936344265821127,
"learning_rate": 5e-06,
"loss": 0.7006,
"step": 880
},
{
"epoch": 3.0,
"eval_loss": 0.7735684514045715,
"eval_runtime": 316.0096,
"eval_samples_per_second": 25.069,
"eval_steps_per_second": 0.392,
"step": 882
},
{
"epoch": 3.0,
"step": 882,
"total_flos": 1477173470822400.0,
"train_loss": 0.7552210683184687,
"train_runtime": 52264.4132,
"train_samples_per_second": 8.64,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 882,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1477173470822400.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}