{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1005,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029850746268656716,
"grad_norm": 3.479292993887848,
"learning_rate": 5e-06,
"loss": 1.015,
"step": 10
},
{
"epoch": 0.05970149253731343,
"grad_norm": 3.332094949060961,
"learning_rate": 5e-06,
"loss": 0.9229,
"step": 20
},
{
"epoch": 0.08955223880597014,
"grad_norm": 1.206482374615331,
"learning_rate": 5e-06,
"loss": 0.8863,
"step": 30
},
{
"epoch": 0.11940298507462686,
"grad_norm": 4.163345933152562,
"learning_rate": 5e-06,
"loss": 0.8671,
"step": 40
},
{
"epoch": 0.14925373134328357,
"grad_norm": 12.16630142884249,
"learning_rate": 5e-06,
"loss": 0.8564,
"step": 50
},
{
"epoch": 0.1791044776119403,
"grad_norm": 1.0327331476094606,
"learning_rate": 5e-06,
"loss": 0.8496,
"step": 60
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.9990373506833108,
"learning_rate": 5e-06,
"loss": 0.8316,
"step": 70
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.9110436069182296,
"learning_rate": 5e-06,
"loss": 0.8315,
"step": 80
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.8361675958854492,
"learning_rate": 5e-06,
"loss": 0.8252,
"step": 90
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.9109432628272915,
"learning_rate": 5e-06,
"loss": 0.8254,
"step": 100
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.5754887816326917,
"learning_rate": 5e-06,
"loss": 0.8123,
"step": 110
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.5177271737564838,
"learning_rate": 5e-06,
"loss": 0.813,
"step": 120
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.5512037618652581,
"learning_rate": 5e-06,
"loss": 0.811,
"step": 130
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.5700224076141376,
"learning_rate": 5e-06,
"loss": 0.8096,
"step": 140
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.5599860608870274,
"learning_rate": 5e-06,
"loss": 0.8046,
"step": 150
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.569170892894082,
"learning_rate": 5e-06,
"loss": 0.8034,
"step": 160
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.8243458549827131,
"learning_rate": 5e-06,
"loss": 0.7987,
"step": 170
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.5121916860141221,
"learning_rate": 5e-06,
"loss": 0.806,
"step": 180
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.6107343758472424,
"learning_rate": 5e-06,
"loss": 0.7948,
"step": 190
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.5131585752019031,
"learning_rate": 5e-06,
"loss": 0.7946,
"step": 200
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.6765312142993251,
"learning_rate": 5e-06,
"loss": 0.7992,
"step": 210
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.6992511282844082,
"learning_rate": 5e-06,
"loss": 0.7932,
"step": 220
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.5561440226388349,
"learning_rate": 5e-06,
"loss": 0.7947,
"step": 230
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.611629440350604,
"learning_rate": 5e-06,
"loss": 0.7952,
"step": 240
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.7112309549807347,
"learning_rate": 5e-06,
"loss": 0.7961,
"step": 250
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.6315198037305403,
"learning_rate": 5e-06,
"loss": 0.7903,
"step": 260
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.6187802379727967,
"learning_rate": 5e-06,
"loss": 0.7938,
"step": 270
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.6525337849629936,
"learning_rate": 5e-06,
"loss": 0.7845,
"step": 280
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.4910252956115715,
"learning_rate": 5e-06,
"loss": 0.7858,
"step": 290
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.5424409964718268,
"learning_rate": 5e-06,
"loss": 0.7889,
"step": 300
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.5025244254926571,
"learning_rate": 5e-06,
"loss": 0.7901,
"step": 310
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.611552104571597,
"learning_rate": 5e-06,
"loss": 0.7876,
"step": 320
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.574143500541867,
"learning_rate": 5e-06,
"loss": 0.788,
"step": 330
},
{
"epoch": 1.0,
"eval_loss": 0.7820407152175903,
"eval_runtime": 356.5952,
"eval_samples_per_second": 25.312,
"eval_steps_per_second": 0.398,
"step": 335
},
{
"epoch": 1.0149253731343284,
"grad_norm": 0.6790286112759999,
"learning_rate": 5e-06,
"loss": 0.7615,
"step": 340
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.5150260000277413,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 350
},
{
"epoch": 1.0746268656716418,
"grad_norm": 0.6700846811383229,
"learning_rate": 5e-06,
"loss": 0.7394,
"step": 360
},
{
"epoch": 1.1044776119402986,
"grad_norm": 0.720307351495057,
"learning_rate": 5e-06,
"loss": 0.741,
"step": 370
},
{
"epoch": 1.1343283582089552,
"grad_norm": 0.6813902538633815,
"learning_rate": 5e-06,
"loss": 0.7369,
"step": 380
},
{
"epoch": 1.164179104477612,
"grad_norm": 0.63094991849189,
"learning_rate": 5e-06,
"loss": 0.7418,
"step": 390
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.5792845851337866,
"learning_rate": 5e-06,
"loss": 0.7414,
"step": 400
},
{
"epoch": 1.2238805970149254,
"grad_norm": 0.603252393791289,
"learning_rate": 5e-06,
"loss": 0.7457,
"step": 410
},
{
"epoch": 1.2537313432835822,
"grad_norm": 0.6398195179422785,
"learning_rate": 5e-06,
"loss": 0.74,
"step": 420
},
{
"epoch": 1.2835820895522387,
"grad_norm": 0.5963738792349057,
"learning_rate": 5e-06,
"loss": 0.7485,
"step": 430
},
{
"epoch": 1.3134328358208955,
"grad_norm": 0.6940326337185055,
"learning_rate": 5e-06,
"loss": 0.7448,
"step": 440
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.5568283328438208,
"learning_rate": 5e-06,
"loss": 0.736,
"step": 450
},
{
"epoch": 1.373134328358209,
"grad_norm": 0.5197916467255219,
"learning_rate": 5e-06,
"loss": 0.7411,
"step": 460
},
{
"epoch": 1.4029850746268657,
"grad_norm": 0.5460153463704294,
"learning_rate": 5e-06,
"loss": 0.7418,
"step": 470
},
{
"epoch": 1.4328358208955223,
"grad_norm": 0.5763789808771301,
"learning_rate": 5e-06,
"loss": 0.7363,
"step": 480
},
{
"epoch": 1.462686567164179,
"grad_norm": 0.5189698614326181,
"learning_rate": 5e-06,
"loss": 0.744,
"step": 490
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.567498164042545,
"learning_rate": 5e-06,
"loss": 0.7404,
"step": 500
},
{
"epoch": 1.5223880597014925,
"grad_norm": 0.6393388739204092,
"learning_rate": 5e-06,
"loss": 0.7414,
"step": 510
},
{
"epoch": 1.5522388059701493,
"grad_norm": 0.5147979956127535,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 520
},
{
"epoch": 1.582089552238806,
"grad_norm": 0.6152783990954829,
"learning_rate": 5e-06,
"loss": 0.7414,
"step": 530
},
{
"epoch": 1.6119402985074627,
"grad_norm": 0.6919403728057012,
"learning_rate": 5e-06,
"loss": 0.7354,
"step": 540
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.6057934431622807,
"learning_rate": 5e-06,
"loss": 0.7422,
"step": 550
},
{
"epoch": 1.671641791044776,
"grad_norm": 0.6659075705857131,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 560
},
{
"epoch": 1.7014925373134329,
"grad_norm": 0.5120895019545025,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 570
},
{
"epoch": 1.7313432835820897,
"grad_norm": 0.6192820246303359,
"learning_rate": 5e-06,
"loss": 0.7428,
"step": 580
},
{
"epoch": 1.7611940298507462,
"grad_norm": 0.5647576041065644,
"learning_rate": 5e-06,
"loss": 0.7421,
"step": 590
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.6215429682194679,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 600
},
{
"epoch": 1.8208955223880596,
"grad_norm": 0.6981771564826721,
"learning_rate": 5e-06,
"loss": 0.7357,
"step": 610
},
{
"epoch": 1.8507462686567164,
"grad_norm": 0.5861737897737739,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 620
},
{
"epoch": 1.8805970149253732,
"grad_norm": 0.49215660200886596,
"learning_rate": 5e-06,
"loss": 0.7382,
"step": 630
},
{
"epoch": 1.9104477611940298,
"grad_norm": 0.5126805399974429,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 640
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.5418246376116465,
"learning_rate": 5e-06,
"loss": 0.7383,
"step": 650
},
{
"epoch": 1.9701492537313432,
"grad_norm": 0.5200260002573389,
"learning_rate": 5e-06,
"loss": 0.7386,
"step": 660
},
{
"epoch": 2.0,
"grad_norm": 0.4891478343024747,
"learning_rate": 5e-06,
"loss": 0.7335,
"step": 670
},
{
"epoch": 2.0,
"eval_loss": 0.7694990634918213,
"eval_runtime": 357.9559,
"eval_samples_per_second": 25.215,
"eval_steps_per_second": 0.397,
"step": 670
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.6765562554325313,
"learning_rate": 5e-06,
"loss": 0.6893,
"step": 680
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.778608535751458,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 690
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.5671615670514111,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 700
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.6665893513460222,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 710
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.6806197074554154,
"learning_rate": 5e-06,
"loss": 0.6935,
"step": 720
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.576145716852151,
"learning_rate": 5e-06,
"loss": 0.6951,
"step": 730
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.643535683911534,
"learning_rate": 5e-06,
"loss": 0.6904,
"step": 740
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.5623086433806925,
"learning_rate": 5e-06,
"loss": 0.6935,
"step": 750
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.5766234212764187,
"learning_rate": 5e-06,
"loss": 0.6985,
"step": 760
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.5751133605924373,
"learning_rate": 5e-06,
"loss": 0.6989,
"step": 770
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.6038731996857178,
"learning_rate": 5e-06,
"loss": 0.6993,
"step": 780
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.6822024466871226,
"learning_rate": 5e-06,
"loss": 0.6946,
"step": 790
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.6198933460670559,
"learning_rate": 5e-06,
"loss": 0.6962,
"step": 800
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.6172129701431681,
"learning_rate": 5e-06,
"loss": 0.6953,
"step": 810
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.5875229638376829,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 820
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.6023272569262241,
"learning_rate": 5e-06,
"loss": 0.6955,
"step": 830
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.5877478924208155,
"learning_rate": 5e-06,
"loss": 0.6933,
"step": 840
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.5553889178901485,
"learning_rate": 5e-06,
"loss": 0.6945,
"step": 850
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.595671809413151,
"learning_rate": 5e-06,
"loss": 0.6983,
"step": 860
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.5458221648218067,
"learning_rate": 5e-06,
"loss": 0.6986,
"step": 870
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.5634666832714791,
"learning_rate": 5e-06,
"loss": 0.6992,
"step": 880
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.5986690445210529,
"learning_rate": 5e-06,
"loss": 0.697,
"step": 890
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.6806823973105847,
"learning_rate": 5e-06,
"loss": 0.697,
"step": 900
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.5843820752773177,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 910
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.5748608122491111,
"learning_rate": 5e-06,
"loss": 0.6954,
"step": 920
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.570179781829274,
"learning_rate": 5e-06,
"loss": 0.6992,
"step": 930
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.5247671197226909,
"learning_rate": 5e-06,
"loss": 0.6946,
"step": 940
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.5328797886127241,
"learning_rate": 5e-06,
"loss": 0.6954,
"step": 950
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.533230211383855,
"learning_rate": 5e-06,
"loss": 0.6991,
"step": 960
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.6204281218133497,
"learning_rate": 5e-06,
"loss": 0.6988,
"step": 970
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.5715769762952346,
"learning_rate": 5e-06,
"loss": 0.6985,
"step": 980
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.5731281297330721,
"learning_rate": 5e-06,
"loss": 0.6957,
"step": 990
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.6065892745042316,
"learning_rate": 5e-06,
"loss": 0.6979,
"step": 1000
},
{
"epoch": 3.0,
"eval_loss": 0.7694440484046936,
"eval_runtime": 357.3084,
"eval_samples_per_second": 25.261,
"eval_steps_per_second": 0.397,
"step": 1005
},
{
"epoch": 3.0,
"step": 1005,
"total_flos": 1683203052011520.0,
"train_loss": 0.7514027657200448,
"train_runtime": 59331.9423,
"train_samples_per_second": 8.67,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 1005,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1683203052011520.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}