s1.1_7b_QFD / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08,
"grad_norm": 3.1039228439331055,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.2535,
"step": 10
},
{
"epoch": 0.16,
"grad_norm": 1.9736298322677612,
"learning_rate": 2.666666666666667e-06,
"loss": 1.1389,
"step": 20
},
{
"epoch": 0.24,
"grad_norm": 0.9868311285972595,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1137,
"step": 30
},
{
"epoch": 0.32,
"grad_norm": 0.7625114321708679,
"learning_rate": 5.333333333333334e-06,
"loss": 1.0243,
"step": 40
},
{
"epoch": 0.4,
"grad_norm": 0.8334870338439941,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0505,
"step": 50
},
{
"epoch": 0.48,
"grad_norm": 0.7253231406211853,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9779,
"step": 60
},
{
"epoch": 0.56,
"grad_norm": 0.6524099707603455,
"learning_rate": 9.333333333333334e-06,
"loss": 0.9797,
"step": 70
},
{
"epoch": 0.64,
"grad_norm": 0.68406742811203,
"learning_rate": 9.99864620589731e-06,
"loss": 0.9878,
"step": 80
},
{
"epoch": 0.72,
"grad_norm": 0.6620015501976013,
"learning_rate": 9.987820251299121e-06,
"loss": 0.9352,
"step": 90
},
{
"epoch": 0.8,
"grad_norm": 0.7781227231025696,
"learning_rate": 9.966191788709716e-06,
"loss": 0.9205,
"step": 100
},
{
"epoch": 0.88,
"grad_norm": 0.6981615424156189,
"learning_rate": 9.933807660562898e-06,
"loss": 0.9168,
"step": 110
},
{
"epoch": 0.96,
"grad_norm": 0.5809632539749146,
"learning_rate": 9.890738003669029e-06,
"loss": 0.9345,
"step": 120
},
{
"epoch": 1.04,
"grad_norm": 0.6306660771369934,
"learning_rate": 9.83707609731432e-06,
"loss": 0.8842,
"step": 130
},
{
"epoch": 1.12,
"grad_norm": 0.645531952381134,
"learning_rate": 9.77293816123866e-06,
"loss": 0.8496,
"step": 140
},
{
"epoch": 1.2,
"grad_norm": 0.7515069842338562,
"learning_rate": 9.698463103929542e-06,
"loss": 0.8134,
"step": 150
},
{
"epoch": 1.28,
"grad_norm": 0.8647974133491516,
"learning_rate": 9.613812221777212e-06,
"loss": 0.8645,
"step": 160
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.5082105398178101,
"learning_rate": 9.519168849742603e-06,
"loss": 0.8339,
"step": 170
},
{
"epoch": 1.44,
"grad_norm": 0.6905492544174194,
"learning_rate": 9.414737964294636e-06,
"loss": 0.8524,
"step": 180
},
{
"epoch": 1.52,
"grad_norm": 0.7012932896614075,
"learning_rate": 9.30074573947683e-06,
"loss": 0.8566,
"step": 190
},
{
"epoch": 1.6,
"grad_norm": 0.6087114810943604,
"learning_rate": 9.177439057064684e-06,
"loss": 0.8107,
"step": 200
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.7151578664779663,
"learning_rate": 9.045084971874738e-06,
"loss": 0.9001,
"step": 210
},
{
"epoch": 1.76,
"grad_norm": 0.746478796005249,
"learning_rate": 8.903970133383297e-06,
"loss": 0.8522,
"step": 220
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.5843214392662048,
"learning_rate": 8.754400164907496e-06,
"loss": 0.8164,
"step": 230
},
{
"epoch": 1.92,
"grad_norm": 0.6262736916542053,
"learning_rate": 8.596699001693257e-06,
"loss": 0.8739,
"step": 240
},
{
"epoch": 2.0,
"grad_norm": 0.6539023518562317,
"learning_rate": 8.43120818934367e-06,
"loss": 0.8353,
"step": 250
},
{
"epoch": 2.08,
"grad_norm": 0.7221904397010803,
"learning_rate": 8.258286144107277e-06,
"loss": 0.7376,
"step": 260
},
{
"epoch": 2.16,
"grad_norm": 0.6120831966400146,
"learning_rate": 8.078307376628292e-06,
"loss": 0.7247,
"step": 270
},
{
"epoch": 2.24,
"grad_norm": 0.5770366787910461,
"learning_rate": 7.891661680839932e-06,
"loss": 0.6993,
"step": 280
},
{
"epoch": 2.32,
"grad_norm": 0.4881764352321625,
"learning_rate": 7.698753289757565e-06,
"loss": 0.6871,
"step": 290
},
{
"epoch": 2.4,
"grad_norm": 0.6157920360565186,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7247,
"step": 300
},
{
"epoch": 2.48,
"grad_norm": 0.5322688817977905,
"learning_rate": 7.295832266935059e-06,
"loss": 0.7549,
"step": 310
},
{
"epoch": 2.56,
"grad_norm": 0.7124238014221191,
"learning_rate": 7.08669227240909e-06,
"loss": 0.6858,
"step": 320
},
{
"epoch": 2.64,
"grad_norm": 0.5642361044883728,
"learning_rate": 6.873032967079562e-06,
"loss": 0.6737,
"step": 330
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.7349158525466919,
"learning_rate": 6.655317089424791e-06,
"loss": 0.7092,
"step": 340
},
{
"epoch": 2.8,
"grad_norm": 0.6204760670661926,
"learning_rate": 6.434016163555452e-06,
"loss": 0.6878,
"step": 350
},
{
"epoch": 2.88,
"grad_norm": 0.4934922754764557,
"learning_rate": 6.209609477998339e-06,
"loss": 0.6883,
"step": 360
},
{
"epoch": 2.96,
"grad_norm": 0.5973473787307739,
"learning_rate": 5.982583047664151e-06,
"loss": 0.7312,
"step": 370
},
{
"epoch": 3.04,
"grad_norm": 0.9578109979629517,
"learning_rate": 5.753428561247416e-06,
"loss": 0.6466,
"step": 380
},
{
"epoch": 3.12,
"grad_norm": 0.6420013904571533,
"learning_rate": 5.522642316338268e-06,
"loss": 0.5745,
"step": 390
},
{
"epoch": 3.2,
"grad_norm": 0.6083827614784241,
"learning_rate": 5.290724144552379e-06,
"loss": 0.6074,
"step": 400
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.5411021709442139,
"learning_rate": 5.0581763290069865e-06,
"loss": 0.5958,
"step": 410
},
{
"epoch": 3.36,
"grad_norm": 0.542936384677887,
"learning_rate": 4.825502516487497e-06,
"loss": 0.517,
"step": 420
},
{
"epoch": 3.44,
"grad_norm": 0.643763542175293,
"learning_rate": 4.59320662666071e-06,
"loss": 0.6125,
"step": 430
},
{
"epoch": 3.52,
"grad_norm": 0.5902148485183716,
"learning_rate": 4.361791760697027e-06,
"loss": 0.5574,
"step": 440
},
{
"epoch": 3.6,
"grad_norm": 0.5409561991691589,
"learning_rate": 4.131759111665349e-06,
"loss": 0.5619,
"step": 450
},
{
"epoch": 3.68,
"grad_norm": 0.5303182601928711,
"learning_rate": 3.903606879060483e-06,
"loss": 0.5723,
"step": 460
},
{
"epoch": 3.76,
"grad_norm": 0.5597081184387207,
"learning_rate": 3.6778291898139907e-06,
"loss": 0.57,
"step": 470
},
{
"epoch": 3.84,
"grad_norm": 0.557049572467804,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.558,
"step": 480
},
{
"epoch": 3.92,
"grad_norm": 0.5532881021499634,
"learning_rate": 3.2353471764306567e-06,
"loss": 0.5642,
"step": 490
},
{
"epoch": 4.0,
"grad_norm": 0.6130328178405762,
"learning_rate": 3.019601169804216e-06,
"loss": 0.582,
"step": 500
},
{
"epoch": 4.08,
"grad_norm": 0.6685742139816284,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.4826,
"step": 510
},
{
"epoch": 4.16,
"grad_norm": 0.7372143268585205,
"learning_rate": 2.601434433748771e-06,
"loss": 0.4799,
"step": 520
},
{
"epoch": 4.24,
"grad_norm": 0.5380725860595703,
"learning_rate": 2.3999193603539234e-06,
"loss": 0.4595,
"step": 530
},
{
"epoch": 4.32,
"grad_norm": 0.611395537853241,
"learning_rate": 2.204035482646267e-06,
"loss": 0.4414,
"step": 540
},
{
"epoch": 4.4,
"grad_norm": 0.6131682991981506,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.42,
"step": 550
},
{
"epoch": 4.48,
"grad_norm": 0.5948655605316162,
"learning_rate": 1.8308451630064484e-06,
"loss": 0.4228,
"step": 560
},
{
"epoch": 4.5600000000000005,
"grad_norm": 0.5260858535766602,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.4486,
"step": 570
},
{
"epoch": 4.64,
"grad_norm": 0.6490086913108826,
"learning_rate": 1.4850947128716914e-06,
"loss": 0.4064,
"step": 580
},
{
"epoch": 4.72,
"grad_norm": 0.512779951095581,
"learning_rate": 1.3234549597008572e-06,
"loss": 0.4806,
"step": 590
},
{
"epoch": 4.8,
"grad_norm": 0.6591514945030212,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.4537,
"step": 600
},
{
"epoch": 4.88,
"grad_norm": 0.5841071605682373,
"learning_rate": 1.0243960175257605e-06,
"loss": 0.425,
"step": 610
},
{
"epoch": 4.96,
"grad_norm": 0.5767258405685425,
"learning_rate": 8.876245235966884e-07,
"loss": 0.4903,
"step": 620
},
{
"epoch": 5.04,
"grad_norm": 0.681662380695343,
"learning_rate": 7.597595192178702e-07,
"loss": 0.4259,
"step": 630
},
{
"epoch": 5.12,
"grad_norm": 0.6221340894699097,
"learning_rate": 6.410779315161885e-07,
"loss": 0.4243,
"step": 640
},
{
"epoch": 5.2,
"grad_norm": 0.5294339656829834,
"learning_rate": 5.318367983829393e-07,
"loss": 0.3387,
"step": 650
},
{
"epoch": 5.28,
"grad_norm": 0.5295357704162598,
"learning_rate": 4.322727117869951e-07,
"loss": 0.3378,
"step": 660
},
{
"epoch": 5.36,
"grad_norm": 0.5497756004333496,
"learning_rate": 3.426013053692878e-07,
"loss": 0.3857,
"step": 670
},
{
"epoch": 5.44,
"grad_norm": 0.5334697961807251,
"learning_rate": 2.63016787428354e-07,
"loss": 0.3996,
"step": 680
},
{
"epoch": 5.52,
"grad_norm": 0.48104792833328247,
"learning_rate": 1.9369152030840553e-07,
"loss": 0.3743,
"step": 690
},
{
"epoch": 5.6,
"grad_norm": 0.53028404712677,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.3542,
"step": 700
},
{
"epoch": 5.68,
"grad_norm": 0.5596076846122742,
"learning_rate": 8.639676646793382e-08,
"loss": 0.3587,
"step": 710
},
{
"epoch": 5.76,
"grad_norm": 0.5660611391067505,
"learning_rate": 4.865965629214819e-08,
"loss": 0.414,
"step": 720
},
{
"epoch": 5.84,
"grad_norm": 0.4768368899822235,
"learning_rate": 2.1646046750978255e-08,
"loss": 0.3887,
"step": 730
},
{
"epoch": 5.92,
"grad_norm": 0.5675653219223022,
"learning_rate": 5.414443307377171e-09,
"loss": 0.3811,
"step": 740
},
{
"epoch": 6.0,
"grad_norm": 0.5585565567016602,
"learning_rate": 0.0,
"loss": 0.3763,
"step": 750
},
{
"epoch": 6.0,
"step": 750,
"total_flos": 118427603697664.0,
"train_loss": 0.662273271560669,
"train_runtime": 26488.1763,
"train_samples_per_second": 0.227,
"train_steps_per_second": 0.028
}
],
"logging_steps": 10,
"max_steps": 750,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 118427603697664.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}