|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 680, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 5.515628814697266, |
|
"learning_rate": 2.9411764705882354e-05, |
|
"loss": 1.1436, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 2.5779266357421875, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 0.4215, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 1.5243560075759888, |
|
"learning_rate": 8.823529411764706e-05, |
|
"loss": 0.2531, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 1.0052002668380737, |
|
"learning_rate": 9.997871633546257e-05, |
|
"loss": 0.1957, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 1.7932289838790894, |
|
"learning_rate": 9.98487151097676e-05, |
|
"loss": 0.1667, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.6952548623085022, |
|
"learning_rate": 9.960084393841355e-05, |
|
"loss": 0.1425, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 0.944196343421936, |
|
"learning_rate": 9.923568892600578e-05, |
|
"loss": 0.1222, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.6027107238769531, |
|
"learning_rate": 9.875411350104744e-05, |
|
"loss": 0.1081, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 0.7133479714393616, |
|
"learning_rate": 9.815725637431662e-05, |
|
"loss": 0.1004, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.6044264435768127, |
|
"learning_rate": 9.744652884632406e-05, |
|
"loss": 0.0935, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 0.5952054858207703, |
|
"learning_rate": 9.662361147021779e-05, |
|
"loss": 0.0938, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.7661593556404114, |
|
"learning_rate": 9.569045007802559e-05, |
|
"loss": 0.0876, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 0.5203129649162292, |
|
"learning_rate": 9.464925117963133e-05, |
|
"loss": 0.0799, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 1.392182469367981, |
|
"learning_rate": 9.35024767453647e-05, |
|
"loss": 0.0848, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 0.44986793398857117, |
|
"learning_rate": 9.225283838454111e-05, |
|
"loss": 0.0789, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.4528612196445465, |
|
"learning_rate": 9.090329093371666e-05, |
|
"loss": 0.0776, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.5809573531150818, |
|
"learning_rate": 8.945702546981969e-05, |
|
"loss": 0.0715, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 0.5827745795249939, |
|
"learning_rate": 8.791746176467907e-05, |
|
"loss": 0.0716, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.7941176470588234, |
|
"grad_norm": 0.5849335789680481, |
|
"learning_rate": 8.628824019879137e-05, |
|
"loss": 0.0653, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.661180317401886, |
|
"learning_rate": 8.457321315344694e-05, |
|
"loss": 0.0669, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.088235294117647, |
|
"grad_norm": 0.5251627564430237, |
|
"learning_rate": 8.277643590156894e-05, |
|
"loss": 0.069, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.235294117647059, |
|
"grad_norm": 0.471332311630249, |
|
"learning_rate": 8.090215701880419e-05, |
|
"loss": 0.0609, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.3823529411764706, |
|
"grad_norm": 0.42382729053497314, |
|
"learning_rate": 7.89548083375394e-05, |
|
"loss": 0.0622, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.899319589138031, |
|
"learning_rate": 7.693899446759727e-05, |
|
"loss": 0.0648, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.6764705882352944, |
|
"grad_norm": 0.6775935292243958, |
|
"learning_rate": 7.485948190839077e-05, |
|
"loss": 0.0598, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.8235294117647056, |
|
"grad_norm": 0.43540897965431213, |
|
"learning_rate": 7.272118777828108e-05, |
|
"loss": 0.0573, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.9705882352941178, |
|
"grad_norm": 0.27909591794013977, |
|
"learning_rate": 7.052916818778918e-05, |
|
"loss": 0.0492, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 0.42636606097221375, |
|
"learning_rate": 6.828860628415253e-05, |
|
"loss": 0.0557, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.264705882352941, |
|
"grad_norm": 0.4702949821949005, |
|
"learning_rate": 6.60047999954972e-05, |
|
"loss": 0.0536, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.411764705882353, |
|
"grad_norm": 0.5331495404243469, |
|
"learning_rate": 6.368314950360415e-05, |
|
"loss": 0.0525, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.5588235294117645, |
|
"grad_norm": 0.301176518201828, |
|
"learning_rate": 6.132914447489137e-05, |
|
"loss": 0.0568, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.2303120642900467, |
|
"learning_rate": 5.8948351079804875e-05, |
|
"loss": 0.0432, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.852941176470588, |
|
"grad_norm": 0.37262749671936035, |
|
"learning_rate": 5.654639883131178e-05, |
|
"loss": 0.0491, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.6805188059806824, |
|
"learning_rate": 5.4128967273616625e-05, |
|
"loss": 0.0513, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.147058823529412, |
|
"grad_norm": 0.4015423655509949, |
|
"learning_rate": 5.170177255257618e-05, |
|
"loss": 0.0456, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.294117647058823, |
|
"grad_norm": 0.3756394386291504, |
|
"learning_rate": 4.9270553899567686e-05, |
|
"loss": 0.0535, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.4411764705882355, |
|
"grad_norm": 0.3560592532157898, |
|
"learning_rate": 4.6841060060770154e-05, |
|
"loss": 0.0463, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.588235294117647, |
|
"grad_norm": 0.4422471523284912, |
|
"learning_rate": 4.441903570394739e-05, |
|
"loss": 0.0417, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.735294117647059, |
|
"grad_norm": 0.577297568321228, |
|
"learning_rate": 4.201020783487464e-05, |
|
"loss": 0.0402, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.7647914290428162, |
|
"learning_rate": 3.962027225552807e-05, |
|
"loss": 0.0402, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.029411764705882, |
|
"grad_norm": 0.8858449459075928, |
|
"learning_rate": 3.7254880096057073e-05, |
|
"loss": 0.0479, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.176470588235294, |
|
"grad_norm": 0.49515625834465027, |
|
"learning_rate": 3.491962445238569e-05, |
|
"loss": 0.0434, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.323529411764706, |
|
"grad_norm": 0.548555850982666, |
|
"learning_rate": 3.262002716103897e-05, |
|
"loss": 0.0442, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.470588235294118, |
|
"grad_norm": 0.35759156942367554, |
|
"learning_rate": 3.0361525742465973e-05, |
|
"loss": 0.0411, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.617647058823529, |
|
"grad_norm": 0.31173354387283325, |
|
"learning_rate": 2.8149460543732664e-05, |
|
"loss": 0.0376, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.764705882352941, |
|
"grad_norm": 0.24947527050971985, |
|
"learning_rate": 2.598906211098643e-05, |
|
"loss": 0.0391, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.911764705882353, |
|
"grad_norm": 0.20982353389263153, |
|
"learning_rate": 2.388543882155067e-05, |
|
"loss": 0.0365, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.3836628496646881, |
|
"learning_rate": 2.184356480489432e-05, |
|
"loss": 0.0365, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.205882352941177, |
|
"grad_norm": 0.23856157064437866, |
|
"learning_rate": 1.9868268181037185e-05, |
|
"loss": 0.0333, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.352941176470588, |
|
"grad_norm": 0.6093345880508423, |
|
"learning_rate": 1.796421964420285e-05, |
|
"loss": 0.0389, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.2536391019821167, |
|
"learning_rate": 1.6135921418712956e-05, |
|
"loss": 0.0355, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.647058823529412, |
|
"grad_norm": 0.22027313709259033, |
|
"learning_rate": 1.4387696613237612e-05, |
|
"loss": 0.0331, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.794117647058823, |
|
"grad_norm": 0.367398738861084, |
|
"learning_rate": 1.2723678998574512e-05, |
|
"loss": 0.0352, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.9411764705882355, |
|
"grad_norm": 0.24603775143623352, |
|
"learning_rate": 1.114780323312724e-05, |
|
"loss": 0.0399, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.088235294117647, |
|
"grad_norm": 0.22743625938892365, |
|
"learning_rate": 9.663795559195733e-06, |
|
"loss": 0.0315, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.3211243152618408, |
|
"learning_rate": 8.275164992077556e-06, |
|
"loss": 0.033, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.382352941176471, |
|
"grad_norm": 0.3177715241909027, |
|
"learning_rate": 6.985195022814067e-06, |
|
"loss": 0.0366, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.529411764705882, |
|
"grad_norm": 0.4995149075984955, |
|
"learning_rate": 5.796935854200763e-06, |
|
"loss": 0.0353, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.676470588235293, |
|
"grad_norm": 0.26444506645202637, |
|
"learning_rate": 4.713197188420026e-06, |
|
"loss": 0.0311, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.823529411764707, |
|
"grad_norm": 0.2560294568538666, |
|
"learning_rate": 3.7365415833504725e-06, |
|
"loss": 0.0355, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.970588235294118, |
|
"grad_norm": 0.20665033161640167, |
|
"learning_rate": 2.869278393262226e-06, |
|
"loss": 0.0367, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.117647058823529, |
|
"grad_norm": 0.4208621382713318, |
|
"learning_rate": 2.113458308225458e-06, |
|
"loss": 0.0346, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.264705882352942, |
|
"grad_norm": 0.3151559829711914, |
|
"learning_rate": 1.4708685051444515e-06, |
|
"loss": 0.0332, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.5470899343490601, |
|
"learning_rate": 9.430284218824026e-07, |
|
"loss": 0.0319, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 9.558823529411764, |
|
"grad_norm": 1.0164936780929565, |
|
"learning_rate": 5.311861644696048e-07, |
|
"loss": 0.0378, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.705882352941176, |
|
"grad_norm": 0.18899625539779663, |
|
"learning_rate": 2.363155558901542e-07, |
|
"loss": 0.0349, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.852941176470589, |
|
"grad_norm": 0.18862251937389374, |
|
"learning_rate": 5.911383342556143e-08, |
|
"loss": 0.036, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.724671483039856, |
|
"learning_rate": 0.0, |
|
"loss": 0.0299, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 680, |
|
"total_flos": 9.581997557091456e+16, |
|
"train_loss": 0.08232775286716573, |
|
"train_runtime": 870.4339, |
|
"train_samples_per_second": 49.378, |
|
"train_steps_per_second": 0.781 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 680, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.581997557091456e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|