|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 340, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014705882352941176, |
|
"grad_norm": 1.2318819761276245, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1427, |
|
"mean_token_accuracy": 0.7514051914215087, |
|
"num_tokens": 81920.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.029411764705882353, |
|
"grad_norm": 0.612349808216095, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5799, |
|
"mean_token_accuracy": 0.8597629547119141, |
|
"num_tokens": 163840.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04411764705882353, |
|
"grad_norm": 0.4277164936065674, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4103, |
|
"mean_token_accuracy": 0.8957844376564026, |
|
"num_tokens": 245760.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 0.3906048536300659, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4176, |
|
"mean_token_accuracy": 0.8923509180545807, |
|
"num_tokens": 327680.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07352941176470588, |
|
"grad_norm": 0.3483637571334839, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3835, |
|
"mean_token_accuracy": 0.8978738963603974, |
|
"num_tokens": 409600.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08823529411764706, |
|
"grad_norm": 0.35842204093933105, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3728, |
|
"mean_token_accuracy": 0.9008919656276703, |
|
"num_tokens": 491520.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10294117647058823, |
|
"grad_norm": 0.29427987337112427, |
|
"learning_rate": 0.0002, |
|
"loss": 0.339, |
|
"mean_token_accuracy": 0.9093719601631165, |
|
"num_tokens": 573440.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.3110213875770569, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3479, |
|
"mean_token_accuracy": 0.9065738081932068, |
|
"num_tokens": 655360.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1323529411764706, |
|
"grad_norm": 0.30392590165138245, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3632, |
|
"mean_token_accuracy": 0.90327467918396, |
|
"num_tokens": 737280.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 0.3164522647857666, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3519, |
|
"mean_token_accuracy": 0.9053433358669281, |
|
"num_tokens": 818998.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16176470588235295, |
|
"grad_norm": 0.29904985427856445, |
|
"learning_rate": 0.0002, |
|
"loss": 0.359, |
|
"mean_token_accuracy": 0.9019794702529907, |
|
"num_tokens": 900918.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.17647058823529413, |
|
"grad_norm": 0.3399337828159332, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3472, |
|
"mean_token_accuracy": 0.9052419304847718, |
|
"num_tokens": 982838.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.19117647058823528, |
|
"grad_norm": 0.3235512673854828, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3527, |
|
"mean_token_accuracy": 0.9031891584396362, |
|
"num_tokens": 1064758.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.20588235294117646, |
|
"grad_norm": 0.32987555861473083, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3596, |
|
"mean_token_accuracy": 0.9010885059833527, |
|
"num_tokens": 1145665.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.22058823529411764, |
|
"grad_norm": 0.28296959400177, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3186, |
|
"mean_token_accuracy": 0.9128421485424042, |
|
"num_tokens": 1227585.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.2940562665462494, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3189, |
|
"mean_token_accuracy": 0.9119745969772339, |
|
"num_tokens": 1309505.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.31312814354896545, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3366, |
|
"mean_token_accuracy": 0.9083712756633758, |
|
"num_tokens": 1390498.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2647058823529412, |
|
"grad_norm": 0.2923528254032135, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3114, |
|
"mean_token_accuracy": 0.9138196527957916, |
|
"num_tokens": 1472418.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27941176470588236, |
|
"grad_norm": 0.2987738847732544, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3226, |
|
"mean_token_accuracy": 0.9115102827548981, |
|
"num_tokens": 1554338.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 0.3070703446865082, |
|
"learning_rate": 0.0002, |
|
"loss": 0.334, |
|
"mean_token_accuracy": 0.9086510419845581, |
|
"num_tokens": 1636258.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3088235294117647, |
|
"grad_norm": 0.2919357419013977, |
|
"learning_rate": 0.0002, |
|
"loss": 0.322, |
|
"mean_token_accuracy": 0.9099951267242432, |
|
"num_tokens": 1718178.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3235294117647059, |
|
"grad_norm": 0.3079027235507965, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3195, |
|
"mean_token_accuracy": 0.9123972117900848, |
|
"num_tokens": 1799262.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3382352941176471, |
|
"grad_norm": 0.32008472084999084, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3211, |
|
"mean_token_accuracy": 0.9098729312419891, |
|
"num_tokens": 1881182.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.33167868852615356, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3165, |
|
"mean_token_accuracy": 0.9122434020042419, |
|
"num_tokens": 1963102.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36764705882352944, |
|
"grad_norm": 0.26130759716033936, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3101, |
|
"mean_token_accuracy": 0.9149560272693634, |
|
"num_tokens": 2045022.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.38235294117647056, |
|
"grad_norm": 0.3016408681869507, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3149, |
|
"mean_token_accuracy": 0.9118534028530121, |
|
"num_tokens": 2126154.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.39705882352941174, |
|
"grad_norm": 0.3000870645046234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3157, |
|
"mean_token_accuracy": 0.9116202533245087, |
|
"num_tokens": 2208074.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4117647058823529, |
|
"grad_norm": 0.2947154939174652, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2991, |
|
"mean_token_accuracy": 0.916422301530838, |
|
"num_tokens": 2289994.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4264705882352941, |
|
"grad_norm": 0.29345065355300903, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3192, |
|
"mean_token_accuracy": 0.9102272808551788, |
|
"num_tokens": 2371914.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.2984428107738495, |
|
"learning_rate": 0.0002, |
|
"loss": 0.298, |
|
"mean_token_accuracy": 0.9163951098918914, |
|
"num_tokens": 2453143.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.45588235294117646, |
|
"grad_norm": 0.2700878977775574, |
|
"learning_rate": 0.0002, |
|
"loss": 0.291, |
|
"mean_token_accuracy": 0.9183040201663971, |
|
"num_tokens": 2535063.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.30076536536216736, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3097, |
|
"mean_token_accuracy": 0.9130865216255188, |
|
"num_tokens": 2616983.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4852941176470588, |
|
"grad_norm": 0.30549952387809753, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3136, |
|
"mean_token_accuracy": 0.9121212244033814, |
|
"num_tokens": 2698903.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2821143865585327, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3006, |
|
"mean_token_accuracy": 0.9160520434379578, |
|
"num_tokens": 2780150.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5147058823529411, |
|
"grad_norm": 0.2865024507045746, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3109, |
|
"mean_token_accuracy": 0.9121701002120972, |
|
"num_tokens": 2862070.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5294117647058824, |
|
"grad_norm": 0.299447238445282, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3045, |
|
"mean_token_accuracy": 0.914674985408783, |
|
"num_tokens": 2943990.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5441176470588235, |
|
"grad_norm": 0.28584349155426025, |
|
"learning_rate": 0.0002, |
|
"loss": 0.296, |
|
"mean_token_accuracy": 0.9169232726097107, |
|
"num_tokens": 3025910.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5588235294117647, |
|
"grad_norm": 0.28912603855133057, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2828, |
|
"mean_token_accuracy": 0.9202346205711365, |
|
"num_tokens": 3107830.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5735294117647058, |
|
"grad_norm": 0.2780699133872986, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2943, |
|
"mean_token_accuracy": 0.917925238609314, |
|
"num_tokens": 3189750.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.2849072813987732, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2909, |
|
"mean_token_accuracy": 0.9186461567878723, |
|
"num_tokens": 3271670.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6029411764705882, |
|
"grad_norm": 0.287589967250824, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3006, |
|
"mean_token_accuracy": 0.9150293409824372, |
|
"num_tokens": 3353590.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6176470588235294, |
|
"grad_norm": 0.3039202392101288, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3017, |
|
"mean_token_accuracy": 0.9141373574733734, |
|
"num_tokens": 3435510.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6323529411764706, |
|
"grad_norm": 0.29136523604393005, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2937, |
|
"mean_token_accuracy": 0.9157746970653534, |
|
"num_tokens": 3517430.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6470588235294118, |
|
"grad_norm": 0.28994059562683105, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2948, |
|
"mean_token_accuracy": 0.9153592526912689, |
|
"num_tokens": 3599350.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6617647058823529, |
|
"grad_norm": 0.3030713200569153, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3021, |
|
"mean_token_accuracy": 0.9139174222946167, |
|
"num_tokens": 3681270.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6764705882352942, |
|
"grad_norm": 0.2715919017791748, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2973, |
|
"mean_token_accuracy": 0.9151881873607636, |
|
"num_tokens": 3763190.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6911764705882353, |
|
"grad_norm": 0.29798802733421326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3004, |
|
"mean_token_accuracy": 0.9151026546955109, |
|
"num_tokens": 3845110.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.31128421425819397, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3049, |
|
"mean_token_accuracy": 0.9125122249126434, |
|
"num_tokens": 3927030.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7205882352941176, |
|
"grad_norm": 0.282503604888916, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2808, |
|
"mean_token_accuracy": 0.919000506401062, |
|
"num_tokens": 4008950.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 0.2817753255367279, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2879, |
|
"mean_token_accuracy": 0.9177908301353455, |
|
"num_tokens": 4090870.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.29370447993278503, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2798, |
|
"mean_token_accuracy": 0.9193670749664307, |
|
"num_tokens": 4172790.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7647058823529411, |
|
"grad_norm": 0.2587876617908478, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2799, |
|
"mean_token_accuracy": 0.920650064945221, |
|
"num_tokens": 4254710.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7794117647058824, |
|
"grad_norm": 0.26823118329048157, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2896, |
|
"mean_token_accuracy": 0.9174364805221558, |
|
"num_tokens": 4336630.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7941176470588235, |
|
"grad_norm": 0.2886073589324951, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2807, |
|
"mean_token_accuracy": 0.9185728430747986, |
|
"num_tokens": 4418550.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8088235294117647, |
|
"grad_norm": 0.2849334478378296, |
|
"learning_rate": 0.0002, |
|
"loss": 0.29, |
|
"mean_token_accuracy": 0.9182918071746826, |
|
"num_tokens": 4500470.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.3190767467021942, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2815, |
|
"mean_token_accuracy": 0.9185608327388763, |
|
"num_tokens": 4582187.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8382352941176471, |
|
"grad_norm": 0.28610959649086, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2932, |
|
"mean_token_accuracy": 0.9168866276741028, |
|
"num_tokens": 4664107.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8529411764705882, |
|
"grad_norm": 0.282124787569046, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2833, |
|
"mean_token_accuracy": 0.9193059802055359, |
|
"num_tokens": 4746027.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8676470588235294, |
|
"grad_norm": 0.27180016040802, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2743, |
|
"mean_token_accuracy": 0.9207478165626526, |
|
"num_tokens": 4827947.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.2949499785900116, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2809, |
|
"mean_token_accuracy": 0.9198436141014099, |
|
"num_tokens": 4909867.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8970588235294118, |
|
"grad_norm": 0.29020780324935913, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2749, |
|
"mean_token_accuracy": 0.9195137023925781, |
|
"num_tokens": 4991787.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9117647058823529, |
|
"grad_norm": 0.28802114725112915, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2692, |
|
"mean_token_accuracy": 0.9228883624076843, |
|
"num_tokens": 5073398.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9264705882352942, |
|
"grad_norm": 0.2924538850784302, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2765, |
|
"mean_token_accuracy": 0.919696980714798, |
|
"num_tokens": 5155318.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.29523536562919617, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2729, |
|
"mean_token_accuracy": 0.920906662940979, |
|
"num_tokens": 5237238.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9558823529411765, |
|
"grad_norm": 0.2890452444553375, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2734, |
|
"mean_token_accuracy": 0.9217497706413269, |
|
"num_tokens": 5319158.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9705882352941176, |
|
"grad_norm": 0.2990953326225281, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2701, |
|
"mean_token_accuracy": 0.922544002532959, |
|
"num_tokens": 5401078.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9852941176470589, |
|
"grad_norm": 0.27057918906211853, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2841, |
|
"mean_token_accuracy": 0.9188294410705566, |
|
"num_tokens": 5482998.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.29458126425743103, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2791, |
|
"mean_token_accuracy": 0.9199340343475342, |
|
"num_tokens": 5561846.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 340, |
|
"total_flos": 2.024814536766259e+16, |
|
"train_loss": 0.3264346291037167, |
|
"train_runtime": 814.8752, |
|
"train_samples_per_second": 6.672, |
|
"train_steps_per_second": 0.417 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 340, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.024814536766259e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|