{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14705882352941177, "grad_norm": 5.515628814697266, "learning_rate": 2.9411764705882354e-05, "loss": 1.1436, "step": 10 }, { "epoch": 0.29411764705882354, "grad_norm": 2.5779266357421875, "learning_rate": 5.882352941176471e-05, "loss": 0.4215, "step": 20 }, { "epoch": 0.4411764705882353, "grad_norm": 1.5243560075759888, "learning_rate": 8.823529411764706e-05, "loss": 0.2531, "step": 30 }, { "epoch": 0.5882352941176471, "grad_norm": 1.0052002668380737, "learning_rate": 9.997871633546257e-05, "loss": 0.1957, "step": 40 }, { "epoch": 0.7352941176470589, "grad_norm": 1.7932289838790894, "learning_rate": 9.98487151097676e-05, "loss": 0.1667, "step": 50 }, { "epoch": 0.8823529411764706, "grad_norm": 0.6952548623085022, "learning_rate": 9.960084393841355e-05, "loss": 0.1425, "step": 60 }, { "epoch": 1.0294117647058822, "grad_norm": 0.944196343421936, "learning_rate": 9.923568892600578e-05, "loss": 0.1222, "step": 70 }, { "epoch": 1.1764705882352942, "grad_norm": 0.6027107238769531, "learning_rate": 9.875411350104744e-05, "loss": 0.1081, "step": 80 }, { "epoch": 1.3235294117647058, "grad_norm": 0.7133479714393616, "learning_rate": 9.815725637431662e-05, "loss": 0.1004, "step": 90 }, { "epoch": 1.4705882352941178, "grad_norm": 0.6044264435768127, "learning_rate": 9.744652884632406e-05, "loss": 0.0935, "step": 100 }, { "epoch": 1.6176470588235294, "grad_norm": 0.5952054858207703, "learning_rate": 9.662361147021779e-05, "loss": 0.0938, "step": 110 }, { "epoch": 1.7647058823529411, "grad_norm": 0.7661593556404114, "learning_rate": 9.569045007802559e-05, "loss": 0.0876, "step": 120 }, { "epoch": 1.9117647058823528, "grad_norm": 0.5203129649162292, "learning_rate": 9.464925117963133e-05, "loss": 0.0799, "step": 130 }, { "epoch": 2.0588235294117645, "grad_norm": 1.392182469367981, "learning_rate": 9.35024767453647e-05, "loss": 0.0848, "step": 140 }, { "epoch": 2.2058823529411766, "grad_norm": 0.44986793398857117, "learning_rate": 9.225283838454111e-05, "loss": 0.0789, "step": 150 }, { "epoch": 2.3529411764705883, "grad_norm": 0.4528612196445465, "learning_rate": 9.090329093371666e-05, "loss": 0.0776, "step": 160 }, { "epoch": 2.5, "grad_norm": 0.5809573531150818, "learning_rate": 8.945702546981969e-05, "loss": 0.0715, "step": 170 }, { "epoch": 2.6470588235294117, "grad_norm": 0.5827745795249939, "learning_rate": 8.791746176467907e-05, "loss": 0.0716, "step": 180 }, { "epoch": 2.7941176470588234, "grad_norm": 0.5849335789680481, "learning_rate": 8.628824019879137e-05, "loss": 0.0653, "step": 190 }, { "epoch": 2.9411764705882355, "grad_norm": 0.661180317401886, "learning_rate": 8.457321315344694e-05, "loss": 0.0669, "step": 200 }, { "epoch": 3.088235294117647, "grad_norm": 0.5251627564430237, "learning_rate": 8.277643590156894e-05, "loss": 0.069, "step": 210 }, { "epoch": 3.235294117647059, "grad_norm": 0.471332311630249, "learning_rate": 8.090215701880419e-05, "loss": 0.0609, "step": 220 }, { "epoch": 3.3823529411764706, "grad_norm": 0.42382729053497314, "learning_rate": 7.89548083375394e-05, "loss": 0.0622, "step": 230 }, { "epoch": 3.5294117647058822, "grad_norm": 0.899319589138031, "learning_rate": 7.693899446759727e-05, "loss": 0.0648, "step": 240 }, { "epoch": 3.6764705882352944, "grad_norm": 0.6775935292243958, "learning_rate": 7.485948190839077e-05, "loss": 0.0598, "step": 250 }, { "epoch": 3.8235294117647056, "grad_norm": 0.43540897965431213, "learning_rate": 7.272118777828108e-05, "loss": 0.0573, "step": 260 }, { "epoch": 3.9705882352941178, "grad_norm": 0.27909591794013977, "learning_rate": 7.052916818778918e-05, "loss": 0.0492, "step": 270 }, { "epoch": 4.117647058823529, "grad_norm": 0.42636606097221375, "learning_rate": 6.828860628415253e-05, "loss": 0.0557, "step": 280 }, { "epoch": 4.264705882352941, "grad_norm": 0.4702949821949005, "learning_rate": 6.60047999954972e-05, "loss": 0.0536, "step": 290 }, { "epoch": 4.411764705882353, "grad_norm": 0.5331495404243469, "learning_rate": 6.368314950360415e-05, "loss": 0.0525, "step": 300 }, { "epoch": 4.5588235294117645, "grad_norm": 0.301176518201828, "learning_rate": 6.132914447489137e-05, "loss": 0.0568, "step": 310 }, { "epoch": 4.705882352941177, "grad_norm": 0.2303120642900467, "learning_rate": 5.8948351079804875e-05, "loss": 0.0432, "step": 320 }, { "epoch": 4.852941176470588, "grad_norm": 0.37262749671936035, "learning_rate": 5.654639883131178e-05, "loss": 0.0491, "step": 330 }, { "epoch": 5.0, "grad_norm": 0.6805188059806824, "learning_rate": 5.4128967273616625e-05, "loss": 0.0513, "step": 340 }, { "epoch": 5.147058823529412, "grad_norm": 0.4015423655509949, "learning_rate": 5.170177255257618e-05, "loss": 0.0456, "step": 350 }, { "epoch": 5.294117647058823, "grad_norm": 0.3756394386291504, "learning_rate": 4.9270553899567686e-05, "loss": 0.0535, "step": 360 }, { "epoch": 5.4411764705882355, "grad_norm": 0.3560592532157898, "learning_rate": 4.6841060060770154e-05, "loss": 0.0463, "step": 370 }, { "epoch": 5.588235294117647, "grad_norm": 0.4422471523284912, "learning_rate": 4.441903570394739e-05, "loss": 0.0417, "step": 380 }, { "epoch": 5.735294117647059, "grad_norm": 0.577297568321228, "learning_rate": 4.201020783487464e-05, "loss": 0.0402, "step": 390 }, { "epoch": 5.882352941176471, "grad_norm": 0.7647914290428162, "learning_rate": 3.962027225552807e-05, "loss": 0.0402, "step": 400 }, { "epoch": 6.029411764705882, "grad_norm": 0.8858449459075928, "learning_rate": 3.7254880096057073e-05, "loss": 0.0479, "step": 410 }, { "epoch": 6.176470588235294, "grad_norm": 0.49515625834465027, "learning_rate": 3.491962445238569e-05, "loss": 0.0434, "step": 420 }, { "epoch": 6.323529411764706, "grad_norm": 0.548555850982666, "learning_rate": 3.262002716103897e-05, "loss": 0.0442, "step": 430 }, { "epoch": 6.470588235294118, "grad_norm": 0.35759156942367554, "learning_rate": 3.0361525742465973e-05, "loss": 0.0411, "step": 440 }, { "epoch": 6.617647058823529, "grad_norm": 0.31173354387283325, "learning_rate": 2.8149460543732664e-05, "loss": 0.0376, "step": 450 }, { "epoch": 6.764705882352941, "grad_norm": 0.24947527050971985, "learning_rate": 2.598906211098643e-05, "loss": 0.0391, "step": 460 }, { "epoch": 6.911764705882353, "grad_norm": 0.20982353389263153, "learning_rate": 2.388543882155067e-05, "loss": 0.0365, "step": 470 }, { "epoch": 7.0588235294117645, "grad_norm": 0.3836628496646881, "learning_rate": 2.184356480489432e-05, "loss": 0.0365, "step": 480 }, { "epoch": 7.205882352941177, "grad_norm": 0.23856157064437866, "learning_rate": 1.9868268181037185e-05, "loss": 0.0333, "step": 490 }, { "epoch": 7.352941176470588, "grad_norm": 0.6093345880508423, "learning_rate": 1.796421964420285e-05, "loss": 0.0389, "step": 500 }, { "epoch": 7.5, "grad_norm": 0.2536391019821167, "learning_rate": 1.6135921418712956e-05, "loss": 0.0355, "step": 510 }, { "epoch": 7.647058823529412, "grad_norm": 0.22027313709259033, "learning_rate": 1.4387696613237612e-05, "loss": 0.0331, "step": 520 }, { "epoch": 7.794117647058823, "grad_norm": 0.367398738861084, "learning_rate": 1.2723678998574512e-05, "loss": 0.0352, "step": 530 }, { "epoch": 7.9411764705882355, "grad_norm": 0.24603775143623352, "learning_rate": 1.114780323312724e-05, "loss": 0.0399, "step": 540 }, { "epoch": 8.088235294117647, "grad_norm": 0.22743625938892365, "learning_rate": 9.663795559195733e-06, "loss": 0.0315, "step": 550 }, { "epoch": 8.235294117647058, "grad_norm": 0.3211243152618408, "learning_rate": 8.275164992077556e-06, "loss": 0.033, "step": 560 }, { "epoch": 8.382352941176471, "grad_norm": 0.3177715241909027, "learning_rate": 6.985195022814067e-06, "loss": 0.0366, "step": 570 }, { "epoch": 8.529411764705882, "grad_norm": 0.4995149075984955, "learning_rate": 5.796935854200763e-06, "loss": 0.0353, "step": 580 }, { "epoch": 8.676470588235293, "grad_norm": 0.26444506645202637, "learning_rate": 4.713197188420026e-06, "loss": 0.0311, "step": 590 }, { "epoch": 8.823529411764707, "grad_norm": 0.2560294568538666, "learning_rate": 3.7365415833504725e-06, "loss": 0.0355, "step": 600 }, { "epoch": 8.970588235294118, "grad_norm": 0.20665033161640167, "learning_rate": 2.869278393262226e-06, "loss": 0.0367, "step": 610 }, { "epoch": 9.117647058823529, "grad_norm": 0.4208621382713318, "learning_rate": 2.113458308225458e-06, "loss": 0.0346, "step": 620 }, { "epoch": 9.264705882352942, "grad_norm": 0.3151559829711914, "learning_rate": 1.4708685051444515e-06, "loss": 0.0332, "step": 630 }, { "epoch": 9.411764705882353, "grad_norm": 0.5470899343490601, "learning_rate": 9.430284218824026e-07, "loss": 0.0319, "step": 640 }, { "epoch": 9.558823529411764, "grad_norm": 1.0164936780929565, "learning_rate": 5.311861644696048e-07, "loss": 0.0378, "step": 650 }, { "epoch": 9.705882352941176, "grad_norm": 0.18899625539779663, "learning_rate": 2.363155558901542e-07, "loss": 0.0349, "step": 660 }, { "epoch": 9.852941176470589, "grad_norm": 0.18862251937389374, "learning_rate": 5.911383342556143e-08, "loss": 0.036, "step": 670 }, { "epoch": 10.0, "grad_norm": 0.724671483039856, "learning_rate": 0.0, "loss": 0.0299, "step": 680 }, { "epoch": 10.0, "step": 680, "total_flos": 9.581997557091456e+16, "train_loss": 0.08232775286716573, "train_runtime": 870.4339, "train_samples_per_second": 49.378, "train_steps_per_second": 0.781 } ], "logging_steps": 10, "max_steps": 680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.581997557091456e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }