{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15873015873015872, "grad_norm": 5.117466449737549, "learning_rate": 0.000125, "loss": 1.112, "step": 10 }, { "epoch": 0.31746031746031744, "grad_norm": 3.9644668102264404, "learning_rate": 0.00019991169537951468, "loss": 0.4916, "step": 20 }, { "epoch": 0.47619047619047616, "grad_norm": 2.397569417953491, "learning_rate": 0.00019892005856487878, "loss": 0.3606, "step": 30 }, { "epoch": 0.6349206349206349, "grad_norm": 1.5157995223999023, "learning_rate": 0.00019683737802329244, "loss": 0.2953, "step": 40 }, { "epoch": 0.7936507936507936, "grad_norm": 2.1924238204956055, "learning_rate": 0.00019368662478381799, "loss": 0.2249, "step": 50 }, { "epoch": 0.9523809523809523, "grad_norm": 2.078732967376709, "learning_rate": 0.00018950255023668876, "loss": 0.2383, "step": 60 }, { "epoch": 1.1111111111111112, "grad_norm": 1.4331333637237549, "learning_rate": 0.0001843313028411149, "loss": 0.1957, "step": 70 }, { "epoch": 1.2698412698412698, "grad_norm": 0.9676660299301147, "learning_rate": 0.00017822991912854713, "loss": 0.1727, "step": 80 }, { "epoch": 1.4285714285714286, "grad_norm": 1.7661018371582031, "learning_rate": 0.00017126569461540443, "loss": 0.1687, "step": 90 }, { "epoch": 1.5873015873015874, "grad_norm": 1.013567328453064, "learning_rate": 0.00016351544156381414, "loss": 0.1581, "step": 100 }, { "epoch": 1.746031746031746, "grad_norm": 1.0763999223709106, "learning_rate": 0.0001550646417769301, "loss": 0.1281, "step": 110 }, { "epoch": 1.9047619047619047, "grad_norm": 0.9065868258476257, "learning_rate": 0.00014600650377311522, "loss": 0.1301, "step": 120 }, { "epoch": 2.0634920634920633, "grad_norm": 0.8975210785865784, "learning_rate": 0.00013644093473793215, "loss": 0.1131, "step": 130 }, { "epoch": 2.2222222222222223, "grad_norm": 0.7319889664649963, "learning_rate": 0.00012647343859284997, "loss": 0.0953, "step": 140 }, { "epoch": 2.380952380952381, "grad_norm": 0.849612832069397, "learning_rate": 0.00011621395233447248, "loss": 0.1131, "step": 150 }, { "epoch": 2.5396825396825395, "grad_norm": 0.8651995062828064, "learning_rate": 0.00010577563347894285, "loss": 0.1016, "step": 160 }, { "epoch": 2.6984126984126986, "grad_norm": 1.222275972366333, "learning_rate": 9.527361198546714e-05, "loss": 0.0894, "step": 170 }, { "epoch": 2.857142857142857, "grad_norm": 1.0224725008010864, "learning_rate": 8.48237204246785e-05, "loss": 0.092, "step": 180 }, { "epoch": 3.015873015873016, "grad_norm": 0.9545143246650696, "learning_rate": 7.454121639751371e-05, "loss": 0.0983, "step": 190 }, { "epoch": 3.1746031746031744, "grad_norm": 0.6206408143043518, "learning_rate": 6.453951129574644e-05, "loss": 0.0956, "step": 200 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5555378198623657, "learning_rate": 5.492891942537703e-05, "loss": 0.0828, "step": 210 }, { "epoch": 3.492063492063492, "grad_norm": 0.5887261629104614, "learning_rate": 4.581544128948413e-05, "loss": 0.083, "step": 220 }, { "epoch": 3.6507936507936507, "grad_norm": 0.4162333011627197, "learning_rate": 3.729959445038136e-05, "loss": 0.0714, "step": 230 }, { "epoch": 3.8095238095238093, "grad_norm": 0.4157885015010834, "learning_rate": 2.9475304866143027e-05, "loss": 0.0688, "step": 240 }, { "epoch": 3.9682539682539684, "grad_norm": 0.4919394850730896, "learning_rate": 2.242887092955801e-05, "loss": 0.057, "step": 250 }, { "epoch": 4.1269841269841265, "grad_norm": 0.5343856811523438, "learning_rate": 1.6238011635695848e-05, "loss": 0.0651, "step": 260 }, { "epoch": 4.285714285714286, "grad_norm": 0.42693275213241577, "learning_rate": 1.0971009376368612e-05, "loss": 0.0531, "step": 270 }, { "epoch": 4.444444444444445, "grad_norm": 0.4508126676082611, "learning_rate": 6.6859568160797525e-06, "loss": 0.0644, "step": 280 }, { "epoch": 4.603174603174603, "grad_norm": 0.4287712872028351, "learning_rate": 3.4301161560792774e-06, "loss": 0.0467, "step": 290 }, { "epoch": 4.761904761904762, "grad_norm": 0.39040446281433105, "learning_rate": 1.239397853554336e-06, "loss": 0.0587, "step": 300 }, { "epoch": 4.920634920634921, "grad_norm": 0.39611175656318665, "learning_rate": 1.379645454479661e-07, "loss": 0.056, "step": 310 }, { "epoch": 5.0, "step": 315, "total_flos": 5801722395569280.0, "train_loss": 0.16537500326595608, "train_runtime": 123.2447, "train_samples_per_second": 40.894, "train_steps_per_second": 2.556 } ], "logging_steps": 10, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5801722395569280.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }