{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014705882352941176, "grad_norm": 1.2318819761276245, "learning_rate": 0.0002, "loss": 1.1427, "mean_token_accuracy": 0.7514051914215087, "num_tokens": 81920.0, "step": 5 }, { "epoch": 0.029411764705882353, "grad_norm": 0.612349808216095, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.8597629547119141, "num_tokens": 163840.0, "step": 10 }, { "epoch": 0.04411764705882353, "grad_norm": 0.4277164936065674, "learning_rate": 0.0002, "loss": 0.4103, "mean_token_accuracy": 0.8957844376564026, "num_tokens": 245760.0, "step": 15 }, { "epoch": 0.058823529411764705, "grad_norm": 0.3906048536300659, "learning_rate": 0.0002, "loss": 0.4176, "mean_token_accuracy": 0.8923509180545807, "num_tokens": 327680.0, "step": 20 }, { "epoch": 0.07352941176470588, "grad_norm": 0.3483637571334839, "learning_rate": 0.0002, "loss": 0.3835, "mean_token_accuracy": 0.8978738963603974, "num_tokens": 409600.0, "step": 25 }, { "epoch": 0.08823529411764706, "grad_norm": 0.35842204093933105, "learning_rate": 0.0002, "loss": 0.3728, "mean_token_accuracy": 0.9008919656276703, "num_tokens": 491520.0, "step": 30 }, { "epoch": 0.10294117647058823, "grad_norm": 0.29427987337112427, "learning_rate": 0.0002, "loss": 0.339, "mean_token_accuracy": 0.9093719601631165, "num_tokens": 573440.0, "step": 35 }, { "epoch": 0.11764705882352941, "grad_norm": 0.3110213875770569, "learning_rate": 0.0002, "loss": 0.3479, "mean_token_accuracy": 0.9065738081932068, "num_tokens": 655360.0, "step": 40 }, { "epoch": 0.1323529411764706, "grad_norm": 0.30392590165138245, "learning_rate": 0.0002, "loss": 0.3632, "mean_token_accuracy": 0.90327467918396, "num_tokens": 737280.0, "step": 45 }, { "epoch": 0.14705882352941177, "grad_norm": 0.3164522647857666, "learning_rate": 0.0002, "loss": 0.3519, "mean_token_accuracy": 0.9053433358669281, "num_tokens": 818998.0, "step": 50 }, { "epoch": 0.16176470588235295, "grad_norm": 0.29904985427856445, "learning_rate": 0.0002, "loss": 0.359, "mean_token_accuracy": 0.9019794702529907, "num_tokens": 900918.0, "step": 55 }, { "epoch": 0.17647058823529413, "grad_norm": 0.3399337828159332, "learning_rate": 0.0002, "loss": 0.3472, "mean_token_accuracy": 0.9052419304847718, "num_tokens": 982838.0, "step": 60 }, { "epoch": 0.19117647058823528, "grad_norm": 0.3235512673854828, "learning_rate": 0.0002, "loss": 0.3527, "mean_token_accuracy": 0.9031891584396362, "num_tokens": 1064758.0, "step": 65 }, { "epoch": 0.20588235294117646, "grad_norm": 0.32987555861473083, "learning_rate": 0.0002, "loss": 0.3596, "mean_token_accuracy": 0.9010885059833527, "num_tokens": 1145665.0, "step": 70 }, { "epoch": 0.22058823529411764, "grad_norm": 0.28296959400177, "learning_rate": 0.0002, "loss": 0.3186, "mean_token_accuracy": 0.9128421485424042, "num_tokens": 1227585.0, "step": 75 }, { "epoch": 0.23529411764705882, "grad_norm": 0.2940562665462494, "learning_rate": 0.0002, "loss": 0.3189, "mean_token_accuracy": 0.9119745969772339, "num_tokens": 1309505.0, "step": 80 }, { "epoch": 0.25, "grad_norm": 0.31312814354896545, "learning_rate": 0.0002, "loss": 0.3366, "mean_token_accuracy": 0.9083712756633758, "num_tokens": 1390498.0, "step": 85 }, { "epoch": 0.2647058823529412, "grad_norm": 0.2923528254032135, "learning_rate": 0.0002, "loss": 0.3114, "mean_token_accuracy": 0.9138196527957916, "num_tokens": 1472418.0, "step": 90 }, { "epoch": 0.27941176470588236, "grad_norm": 0.2987738847732544, "learning_rate": 0.0002, "loss": 0.3226, "mean_token_accuracy": 0.9115102827548981, "num_tokens": 1554338.0, "step": 95 }, { "epoch": 0.29411764705882354, "grad_norm": 0.3070703446865082, "learning_rate": 0.0002, "loss": 0.334, "mean_token_accuracy": 0.9086510419845581, "num_tokens": 1636258.0, "step": 100 }, { "epoch": 0.3088235294117647, "grad_norm": 0.2919357419013977, "learning_rate": 0.0002, "loss": 0.322, "mean_token_accuracy": 0.9099951267242432, "num_tokens": 1718178.0, "step": 105 }, { "epoch": 0.3235294117647059, "grad_norm": 0.3079027235507965, "learning_rate": 0.0002, "loss": 0.3195, "mean_token_accuracy": 0.9123972117900848, "num_tokens": 1799262.0, "step": 110 }, { "epoch": 0.3382352941176471, "grad_norm": 0.32008472084999084, "learning_rate": 0.0002, "loss": 0.3211, "mean_token_accuracy": 0.9098729312419891, "num_tokens": 1881182.0, "step": 115 }, { "epoch": 0.35294117647058826, "grad_norm": 0.33167868852615356, "learning_rate": 0.0002, "loss": 0.3165, "mean_token_accuracy": 0.9122434020042419, "num_tokens": 1963102.0, "step": 120 }, { "epoch": 0.36764705882352944, "grad_norm": 0.26130759716033936, "learning_rate": 0.0002, "loss": 0.3101, "mean_token_accuracy": 0.9149560272693634, "num_tokens": 2045022.0, "step": 125 }, { "epoch": 0.38235294117647056, "grad_norm": 0.3016408681869507, "learning_rate": 0.0002, "loss": 0.3149, "mean_token_accuracy": 0.9118534028530121, "num_tokens": 2126154.0, "step": 130 }, { "epoch": 0.39705882352941174, "grad_norm": 0.3000870645046234, "learning_rate": 0.0002, "loss": 0.3157, "mean_token_accuracy": 0.9116202533245087, "num_tokens": 2208074.0, "step": 135 }, { "epoch": 0.4117647058823529, "grad_norm": 0.2947154939174652, "learning_rate": 0.0002, "loss": 0.2991, "mean_token_accuracy": 0.916422301530838, "num_tokens": 2289994.0, "step": 140 }, { "epoch": 0.4264705882352941, "grad_norm": 0.29345065355300903, "learning_rate": 0.0002, "loss": 0.3192, "mean_token_accuracy": 0.9102272808551788, "num_tokens": 2371914.0, "step": 145 }, { "epoch": 0.4411764705882353, "grad_norm": 0.2984428107738495, "learning_rate": 0.0002, "loss": 0.298, "mean_token_accuracy": 0.9163951098918914, "num_tokens": 2453143.0, "step": 150 }, { "epoch": 0.45588235294117646, "grad_norm": 0.2700878977775574, "learning_rate": 0.0002, "loss": 0.291, "mean_token_accuracy": 0.9183040201663971, "num_tokens": 2535063.0, "step": 155 }, { "epoch": 0.47058823529411764, "grad_norm": 0.30076536536216736, "learning_rate": 0.0002, "loss": 0.3097, "mean_token_accuracy": 0.9130865216255188, "num_tokens": 2616983.0, "step": 160 }, { "epoch": 0.4852941176470588, "grad_norm": 0.30549952387809753, "learning_rate": 0.0002, "loss": 0.3136, "mean_token_accuracy": 0.9121212244033814, "num_tokens": 2698903.0, "step": 165 }, { "epoch": 0.5, "grad_norm": 0.2821143865585327, "learning_rate": 0.0002, "loss": 0.3006, "mean_token_accuracy": 0.9160520434379578, "num_tokens": 2780150.0, "step": 170 }, { "epoch": 0.5147058823529411, "grad_norm": 0.2865024507045746, "learning_rate": 0.0002, "loss": 0.3109, "mean_token_accuracy": 0.9121701002120972, "num_tokens": 2862070.0, "step": 175 }, { "epoch": 0.5294117647058824, "grad_norm": 0.299447238445282, "learning_rate": 0.0002, "loss": 0.3045, "mean_token_accuracy": 0.914674985408783, "num_tokens": 2943990.0, "step": 180 }, { "epoch": 0.5441176470588235, "grad_norm": 0.28584349155426025, "learning_rate": 0.0002, "loss": 0.296, "mean_token_accuracy": 0.9169232726097107, "num_tokens": 3025910.0, "step": 185 }, { "epoch": 0.5588235294117647, "grad_norm": 0.28912603855133057, "learning_rate": 0.0002, "loss": 0.2828, "mean_token_accuracy": 0.9202346205711365, "num_tokens": 3107830.0, "step": 190 }, { "epoch": 0.5735294117647058, "grad_norm": 0.2780699133872986, "learning_rate": 0.0002, "loss": 0.2943, "mean_token_accuracy": 0.917925238609314, "num_tokens": 3189750.0, "step": 195 }, { "epoch": 0.5882352941176471, "grad_norm": 0.2849072813987732, "learning_rate": 0.0002, "loss": 0.2909, "mean_token_accuracy": 0.9186461567878723, "num_tokens": 3271670.0, "step": 200 }, { "epoch": 0.6029411764705882, "grad_norm": 0.287589967250824, "learning_rate": 0.0002, "loss": 0.3006, "mean_token_accuracy": 0.9150293409824372, "num_tokens": 3353590.0, "step": 205 }, { "epoch": 0.6176470588235294, "grad_norm": 0.3039202392101288, "learning_rate": 0.0002, "loss": 0.3017, "mean_token_accuracy": 0.9141373574733734, "num_tokens": 3435510.0, "step": 210 }, { "epoch": 0.6323529411764706, "grad_norm": 0.29136523604393005, "learning_rate": 0.0002, "loss": 0.2937, "mean_token_accuracy": 0.9157746970653534, "num_tokens": 3517430.0, "step": 215 }, { "epoch": 0.6470588235294118, "grad_norm": 0.28994059562683105, "learning_rate": 0.0002, "loss": 0.2948, "mean_token_accuracy": 0.9153592526912689, "num_tokens": 3599350.0, "step": 220 }, { "epoch": 0.6617647058823529, "grad_norm": 0.3030713200569153, "learning_rate": 0.0002, "loss": 0.3021, "mean_token_accuracy": 0.9139174222946167, "num_tokens": 3681270.0, "step": 225 }, { "epoch": 0.6764705882352942, "grad_norm": 0.2715919017791748, "learning_rate": 0.0002, "loss": 0.2973, "mean_token_accuracy": 0.9151881873607636, "num_tokens": 3763190.0, "step": 230 }, { "epoch": 0.6911764705882353, "grad_norm": 0.29798802733421326, "learning_rate": 0.0002, "loss": 0.3004, "mean_token_accuracy": 0.9151026546955109, "num_tokens": 3845110.0, "step": 235 }, { "epoch": 0.7058823529411765, "grad_norm": 0.31128421425819397, "learning_rate": 0.0002, "loss": 0.3049, "mean_token_accuracy": 0.9125122249126434, "num_tokens": 3927030.0, "step": 240 }, { "epoch": 0.7205882352941176, "grad_norm": 0.282503604888916, "learning_rate": 0.0002, "loss": 0.2808, "mean_token_accuracy": 0.919000506401062, "num_tokens": 4008950.0, "step": 245 }, { "epoch": 0.7352941176470589, "grad_norm": 0.2817753255367279, "learning_rate": 0.0002, "loss": 0.2879, "mean_token_accuracy": 0.9177908301353455, "num_tokens": 4090870.0, "step": 250 }, { "epoch": 0.75, "grad_norm": 0.29370447993278503, "learning_rate": 0.0002, "loss": 0.2798, "mean_token_accuracy": 0.9193670749664307, "num_tokens": 4172790.0, "step": 255 }, { "epoch": 0.7647058823529411, "grad_norm": 0.2587876617908478, "learning_rate": 0.0002, "loss": 0.2799, "mean_token_accuracy": 0.920650064945221, "num_tokens": 4254710.0, "step": 260 }, { "epoch": 0.7794117647058824, "grad_norm": 0.26823118329048157, "learning_rate": 0.0002, "loss": 0.2896, "mean_token_accuracy": 0.9174364805221558, "num_tokens": 4336630.0, "step": 265 }, { "epoch": 0.7941176470588235, "grad_norm": 0.2886073589324951, "learning_rate": 0.0002, "loss": 0.2807, "mean_token_accuracy": 0.9185728430747986, "num_tokens": 4418550.0, "step": 270 }, { "epoch": 0.8088235294117647, "grad_norm": 0.2849334478378296, "learning_rate": 0.0002, "loss": 0.29, "mean_token_accuracy": 0.9182918071746826, "num_tokens": 4500470.0, "step": 275 }, { "epoch": 0.8235294117647058, "grad_norm": 0.3190767467021942, "learning_rate": 0.0002, "loss": 0.2815, "mean_token_accuracy": 0.9185608327388763, "num_tokens": 4582187.0, "step": 280 }, { "epoch": 0.8382352941176471, "grad_norm": 0.28610959649086, "learning_rate": 0.0002, "loss": 0.2932, "mean_token_accuracy": 0.9168866276741028, "num_tokens": 4664107.0, "step": 285 }, { "epoch": 0.8529411764705882, "grad_norm": 0.282124787569046, "learning_rate": 0.0002, "loss": 0.2833, "mean_token_accuracy": 0.9193059802055359, "num_tokens": 4746027.0, "step": 290 }, { "epoch": 0.8676470588235294, "grad_norm": 0.27180016040802, "learning_rate": 0.0002, "loss": 0.2743, "mean_token_accuracy": 0.9207478165626526, "num_tokens": 4827947.0, "step": 295 }, { "epoch": 0.8823529411764706, "grad_norm": 0.2949499785900116, "learning_rate": 0.0002, "loss": 0.2809, "mean_token_accuracy": 0.9198436141014099, "num_tokens": 4909867.0, "step": 300 }, { "epoch": 0.8970588235294118, "grad_norm": 0.29020780324935913, "learning_rate": 0.0002, "loss": 0.2749, "mean_token_accuracy": 0.9195137023925781, "num_tokens": 4991787.0, "step": 305 }, { "epoch": 0.9117647058823529, "grad_norm": 0.28802114725112915, "learning_rate": 0.0002, "loss": 0.2692, "mean_token_accuracy": 0.9228883624076843, "num_tokens": 5073398.0, "step": 310 }, { "epoch": 0.9264705882352942, "grad_norm": 0.2924538850784302, "learning_rate": 0.0002, "loss": 0.2765, "mean_token_accuracy": 0.919696980714798, "num_tokens": 5155318.0, "step": 315 }, { "epoch": 0.9411764705882353, "grad_norm": 0.29523536562919617, "learning_rate": 0.0002, "loss": 0.2729, "mean_token_accuracy": 0.920906662940979, "num_tokens": 5237238.0, "step": 320 }, { "epoch": 0.9558823529411765, "grad_norm": 0.2890452444553375, "learning_rate": 0.0002, "loss": 0.2734, "mean_token_accuracy": 0.9217497706413269, "num_tokens": 5319158.0, "step": 325 }, { "epoch": 0.9705882352941176, "grad_norm": 0.2990953326225281, "learning_rate": 0.0002, "loss": 0.2701, "mean_token_accuracy": 0.922544002532959, "num_tokens": 5401078.0, "step": 330 }, { "epoch": 0.9852941176470589, "grad_norm": 0.27057918906211853, "learning_rate": 0.0002, "loss": 0.2841, "mean_token_accuracy": 0.9188294410705566, "num_tokens": 5482998.0, "step": 335 }, { "epoch": 1.0, "grad_norm": 0.29458126425743103, "learning_rate": 0.0002, "loss": 0.2791, "mean_token_accuracy": 0.9199340343475342, "num_tokens": 5561846.0, "step": 340 }, { "epoch": 1.0, "step": 340, "total_flos": 2.024814536766259e+16, "train_loss": 0.3264346291037167, "train_runtime": 814.8752, "train_samples_per_second": 6.672, "train_steps_per_second": 0.417 } ], "logging_steps": 5, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.024814536766259e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }