{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14705882352941177, "grad_norm": 2.458341360092163, "learning_rate": 0.00011764705882352942, "loss": 0.8805, "step": 10 }, { "epoch": 0.29411764705882354, "grad_norm": 1.3460900783538818, "learning_rate": 0.00019995743267092514, "loss": 0.4012, "step": 20 }, { "epoch": 0.4411764705882353, "grad_norm": 1.291209101676941, "learning_rate": 0.0001992016878768271, "loss": 0.3084, "step": 30 }, { "epoch": 0.5882352941176471, "grad_norm": 1.0544739961624146, "learning_rate": 0.00019750822700209488, "loss": 0.2579, "step": 40 }, { "epoch": 0.7352941176470589, "grad_norm": 1.0863096714019775, "learning_rate": 0.00019489305769264812, "loss": 0.241, "step": 50 }, { "epoch": 0.8823529411764706, "grad_norm": 0.8423298001289368, "learning_rate": 0.00019138090015605118, "loss": 0.2391, "step": 60 }, { "epoch": 1.0294117647058822, "grad_norm": 0.7478500008583069, "learning_rate": 0.0001870049534907294, "loss": 0.1854, "step": 70 }, { "epoch": 1.1764705882352942, "grad_norm": 0.7334464192390442, "learning_rate": 0.00018180658186743332, "loss": 0.1821, "step": 80 }, { "epoch": 1.3235294117647058, "grad_norm": 1.0398321151733398, "learning_rate": 0.00017583492352935814, "loss": 0.164, "step": 90 }, { "epoch": 1.4705882352941178, "grad_norm": 0.9809992909431458, "learning_rate": 0.00016914642630689388, "loss": 0.1647, "step": 100 }, { "epoch": 1.6176470588235294, "grad_norm": 0.5706636905670166, "learning_rate": 0.00016180431403760837, "loss": 0.1683, "step": 110 }, { "epoch": 1.7647058823529411, "grad_norm": 0.7732464671134949, "learning_rate": 0.00015387798893519455, "loss": 0.1546, "step": 120 }, { "epoch": 1.9117647058823528, "grad_norm": 0.7296786308288574, "learning_rate": 0.00014544237555656216, "loss": 0.1286, "step": 130 }, { "epoch": 2.0588235294117645, "grad_norm": 0.6389240622520447, "learning_rate": 0.00013657721256830506, "loss": 0.1227, "step": 140 }, { "epoch": 2.2058823529411766, "grad_norm": 0.46665242314338684, "learning_rate": 0.0001273662990072083, "loss": 0.1368, "step": 150 }, { "epoch": 2.3529411764705883, "grad_norm": 0.6480994820594788, "learning_rate": 0.00011789670215960975, "loss": 0.1192, "step": 160 }, { "epoch": 2.5, "grad_norm": 0.4292305111885071, "learning_rate": 0.00010825793454723325, "loss": 0.1051, "step": 170 }, { "epoch": 2.6470588235294117, "grad_norm": 0.6904204487800598, "learning_rate": 9.854110779913537e-05, "loss": 0.1065, "step": 180 }, { "epoch": 2.7941176470588234, "grad_norm": 0.6822876334190369, "learning_rate": 8.883807140789478e-05, "loss": 0.1107, "step": 190 }, { "epoch": 2.9411764705882355, "grad_norm": 0.5520216822624207, "learning_rate": 7.924054451105614e-05, "loss": 0.1155, "step": 200 }, { "epoch": 3.088235294117647, "grad_norm": 0.4506421685218811, "learning_rate": 6.983924890477138e-05, "loss": 0.0978, "step": 210 }, { "epoch": 3.235294117647059, "grad_norm": 0.538611650466919, "learning_rate": 6.072305148493195e-05, "loss": 0.0987, "step": 220 }, { "epoch": 3.3823529411764706, "grad_norm": 0.3745705485343933, "learning_rate": 5.197812422197286e-05, "loss": 0.0956, "step": 230 }, { "epoch": 3.5294117647058822, "grad_norm": 0.458775132894516, "learning_rate": 4.368712960978864e-05, "loss": 0.0928, "step": 240 }, { "epoch": 3.6764705882352944, "grad_norm": 0.31457090377807617, "learning_rate": 3.59284392884057e-05, "loss": 0.0906, "step": 250 }, { "epoch": 3.8235294117647056, "grad_norm": 0.48355451226234436, "learning_rate": 2.8775393226475224e-05, "loss": 0.0907, "step": 260 }, { "epoch": 3.9705882352941178, "grad_norm": 0.32496538758277893, "learning_rate": 2.229560646625448e-05, "loss": 0.0813, "step": 270 }, { "epoch": 4.117647058823529, "grad_norm": 0.38124969601631165, "learning_rate": 1.6550329984155112e-05, "loss": 0.0902, "step": 280 }, { "epoch": 4.264705882352941, "grad_norm": 0.3412385880947113, "learning_rate": 1.1593871708401526e-05, "loss": 0.0727, "step": 290 }, { "epoch": 4.411764705882353, "grad_norm": 0.36698442697525024, "learning_rate": 7.473083166700945e-06, "loss": 0.0796, "step": 300 }, { "epoch": 4.5588235294117645, "grad_norm": 0.43249696493148804, "learning_rate": 4.226916616450916e-06, "loss": 0.0807, "step": 310 }, { "epoch": 4.705882352941177, "grad_norm": 0.40810427069664, "learning_rate": 1.8860568437648052e-06, "loss": 0.0825, "step": 320 }, { "epoch": 4.852941176470588, "grad_norm": 0.3101021945476532, "learning_rate": 4.726311117803084e-07, "loss": 0.082, "step": 330 }, { "epoch": 5.0, "grad_norm": 0.40530508756637573, "learning_rate": 0.0, "loss": 0.0693, "step": 340 }, { "epoch": 5.0, "step": 340, "total_flos": 1.1211231724308e+16, "train_loss": 0.161671401998576, "train_runtime": 161.3443, "train_samples_per_second": 33.717, "train_steps_per_second": 2.107 } ], "logging_steps": 10, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1211231724308e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }