{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 834, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03597122302158273, "grad_norm": 26.93948859971876, "learning_rate": 5e-06, "loss": 1.0175, "step": 10 }, { "epoch": 0.07194244604316546, "grad_norm": 2.6743423167480063, "learning_rate": 5e-06, "loss": 0.9337, "step": 20 }, { "epoch": 0.1079136690647482, "grad_norm": 1.100805597904231, "learning_rate": 5e-06, "loss": 0.8923, "step": 30 }, { "epoch": 0.14388489208633093, "grad_norm": 0.8355539701078896, "learning_rate": 5e-06, "loss": 0.8673, "step": 40 }, { "epoch": 0.17985611510791366, "grad_norm": 0.7157506047100403, "learning_rate": 5e-06, "loss": 0.8553, "step": 50 }, { "epoch": 0.2158273381294964, "grad_norm": 0.9806631521043339, "learning_rate": 5e-06, "loss": 0.8492, "step": 60 }, { "epoch": 0.2517985611510791, "grad_norm": 0.8360835611944488, "learning_rate": 5e-06, "loss": 0.8382, "step": 70 }, { "epoch": 0.28776978417266186, "grad_norm": 0.7078472519601653, "learning_rate": 5e-06, "loss": 0.8318, "step": 80 }, { "epoch": 0.3237410071942446, "grad_norm": 0.6255785562847258, "learning_rate": 5e-06, "loss": 0.825, "step": 90 }, { "epoch": 0.3597122302158273, "grad_norm": 0.6950072028339258, "learning_rate": 5e-06, "loss": 0.8225, "step": 100 }, { "epoch": 0.39568345323741005, "grad_norm": 0.622757689781733, "learning_rate": 5e-06, "loss": 0.8165, "step": 110 }, { "epoch": 0.4316546762589928, "grad_norm": 0.6855173384055511, "learning_rate": 5e-06, "loss": 0.8162, "step": 120 }, { "epoch": 0.4676258992805755, "grad_norm": 0.555459004966806, "learning_rate": 5e-06, "loss": 0.8141, "step": 130 }, { "epoch": 0.5035971223021583, "grad_norm": 0.7189252900166325, "learning_rate": 5e-06, "loss": 0.8113, "step": 140 }, { "epoch": 0.539568345323741, "grad_norm": 0.8411135438726722, "learning_rate": 5e-06, "loss": 0.8069, "step": 150 }, { "epoch": 0.5755395683453237, "grad_norm": 0.9141854769887011, "learning_rate": 5e-06, "loss": 0.8087, "step": 160 }, { "epoch": 0.6115107913669064, "grad_norm": 0.6527584548807389, "learning_rate": 5e-06, "loss": 0.8048, "step": 170 }, { "epoch": 0.6474820143884892, "grad_norm": 0.6986581112545092, "learning_rate": 5e-06, "loss": 0.8051, "step": 180 }, { "epoch": 0.6834532374100719, "grad_norm": 0.6094857952430536, "learning_rate": 5e-06, "loss": 0.8044, "step": 190 }, { "epoch": 0.7194244604316546, "grad_norm": 0.74096920276776, "learning_rate": 5e-06, "loss": 0.7989, "step": 200 }, { "epoch": 0.7553956834532374, "grad_norm": 0.6584952886572538, "learning_rate": 5e-06, "loss": 0.8025, "step": 210 }, { "epoch": 0.7913669064748201, "grad_norm": 0.5838446606699556, "learning_rate": 5e-06, "loss": 0.7988, "step": 220 }, { "epoch": 0.8273381294964028, "grad_norm": 0.5916175411049406, "learning_rate": 5e-06, "loss": 0.7985, "step": 230 }, { "epoch": 0.8633093525179856, "grad_norm": 0.626471567693148, "learning_rate": 5e-06, "loss": 0.7973, "step": 240 }, { "epoch": 0.8992805755395683, "grad_norm": 0.6338741269795162, "learning_rate": 5e-06, "loss": 0.7933, "step": 250 }, { "epoch": 0.935251798561151, "grad_norm": 0.8343555675066444, "learning_rate": 5e-06, "loss": 0.7969, "step": 260 }, { "epoch": 0.9712230215827338, "grad_norm": 0.6221641429373133, "learning_rate": 5e-06, "loss": 0.7933, "step": 270 }, { "epoch": 1.0, "eval_loss": 0.7923575043678284, "eval_runtime": 27.9533, "eval_samples_per_second": 267.732, "eval_steps_per_second": 1.073, "step": 278 }, { "epoch": 1.0071942446043165, "grad_norm": 0.8944971285319924, "learning_rate": 5e-06, "loss": 0.7823, "step": 280 }, { "epoch": 1.0431654676258992, "grad_norm": 0.7668083853056575, "learning_rate": 5e-06, "loss": 0.7574, "step": 290 }, { "epoch": 1.079136690647482, "grad_norm": 0.6176816592509634, "learning_rate": 5e-06, "loss": 0.7529, "step": 300 }, { "epoch": 1.1151079136690647, "grad_norm": 0.6475301176330789, "learning_rate": 5e-06, "loss": 0.7558, "step": 310 }, { "epoch": 1.1510791366906474, "grad_norm": 0.5811910989874788, "learning_rate": 5e-06, "loss": 0.7623, "step": 320 }, { "epoch": 1.1870503597122302, "grad_norm": 0.6269454462814978, "learning_rate": 5e-06, "loss": 0.7601, "step": 330 }, { "epoch": 1.223021582733813, "grad_norm": 0.5423886247053047, "learning_rate": 5e-06, "loss": 0.7535, "step": 340 }, { "epoch": 1.2589928057553956, "grad_norm": 0.6670401432003603, "learning_rate": 5e-06, "loss": 0.757, "step": 350 }, { "epoch": 1.2949640287769784, "grad_norm": 0.7095322132659916, "learning_rate": 5e-06, "loss": 0.759, "step": 360 }, { "epoch": 1.330935251798561, "grad_norm": 0.6870367808903867, "learning_rate": 5e-06, "loss": 0.7567, "step": 370 }, { "epoch": 1.3669064748201438, "grad_norm": 0.6640094117573664, "learning_rate": 5e-06, "loss": 0.7592, "step": 380 }, { "epoch": 1.4028776978417266, "grad_norm": 0.5994950619117767, "learning_rate": 5e-06, "loss": 0.7529, "step": 390 }, { "epoch": 1.4388489208633093, "grad_norm": 0.7392872817621052, "learning_rate": 5e-06, "loss": 0.7554, "step": 400 }, { "epoch": 1.474820143884892, "grad_norm": 0.5656749568866071, "learning_rate": 5e-06, "loss": 0.7547, "step": 410 }, { "epoch": 1.5107913669064748, "grad_norm": 0.921484641426356, "learning_rate": 5e-06, "loss": 0.7532, "step": 420 }, { "epoch": 1.5467625899280577, "grad_norm": 0.540059029380678, "learning_rate": 5e-06, "loss": 0.7585, "step": 430 }, { "epoch": 1.5827338129496402, "grad_norm": 0.6558652758296812, "learning_rate": 5e-06, "loss": 0.7515, "step": 440 }, { "epoch": 1.6187050359712232, "grad_norm": 0.57268163367781, "learning_rate": 5e-06, "loss": 0.7562, "step": 450 }, { "epoch": 1.6546762589928057, "grad_norm": 0.5407189047091853, "learning_rate": 5e-06, "loss": 0.7559, "step": 460 }, { "epoch": 1.6906474820143886, "grad_norm": 0.6077940984618293, "learning_rate": 5e-06, "loss": 0.757, "step": 470 }, { "epoch": 1.7266187050359711, "grad_norm": 1.001124812241379, "learning_rate": 5e-06, "loss": 0.7552, "step": 480 }, { "epoch": 1.762589928057554, "grad_norm": 0.6254013722291123, "learning_rate": 5e-06, "loss": 0.753, "step": 490 }, { "epoch": 1.7985611510791366, "grad_norm": 0.5767617312575639, "learning_rate": 5e-06, "loss": 0.7594, "step": 500 }, { "epoch": 1.8345323741007196, "grad_norm": 0.665915353902276, "learning_rate": 5e-06, "loss": 0.7554, "step": 510 }, { "epoch": 1.870503597122302, "grad_norm": 0.5596777388150926, "learning_rate": 5e-06, "loss": 0.7537, "step": 520 }, { "epoch": 1.906474820143885, "grad_norm": 0.5547398560915929, "learning_rate": 5e-06, "loss": 0.7555, "step": 530 }, { "epoch": 1.9424460431654675, "grad_norm": 0.5874602156110944, "learning_rate": 5e-06, "loss": 0.7509, "step": 540 }, { "epoch": 1.9784172661870505, "grad_norm": 0.6369533697170318, "learning_rate": 5e-06, "loss": 0.7503, "step": 550 }, { "epoch": 2.0, "eval_loss": 0.7788412570953369, "eval_runtime": 27.8988, "eval_samples_per_second": 268.255, "eval_steps_per_second": 1.075, "step": 556 }, { "epoch": 2.014388489208633, "grad_norm": 1.0929207520027995, "learning_rate": 5e-06, "loss": 0.735, "step": 560 }, { "epoch": 2.050359712230216, "grad_norm": 0.687310495052166, "learning_rate": 5e-06, "loss": 0.7131, "step": 570 }, { "epoch": 2.0863309352517985, "grad_norm": 0.6848749958758751, "learning_rate": 5e-06, "loss": 0.7129, "step": 580 }, { "epoch": 2.1223021582733814, "grad_norm": 0.9700661070159223, "learning_rate": 5e-06, "loss": 0.7154, "step": 590 }, { "epoch": 2.158273381294964, "grad_norm": 0.7429316335562708, "learning_rate": 5e-06, "loss": 0.7163, "step": 600 }, { "epoch": 2.194244604316547, "grad_norm": 0.5731198010767242, "learning_rate": 5e-06, "loss": 0.7197, "step": 610 }, { "epoch": 2.2302158273381294, "grad_norm": 0.6519774548706885, "learning_rate": 5e-06, "loss": 0.7192, "step": 620 }, { "epoch": 2.2661870503597124, "grad_norm": 0.7092939571259266, "learning_rate": 5e-06, "loss": 0.717, "step": 630 }, { "epoch": 2.302158273381295, "grad_norm": 0.8300683342338049, "learning_rate": 5e-06, "loss": 0.7171, "step": 640 }, { "epoch": 2.338129496402878, "grad_norm": 0.6364079517115279, "learning_rate": 5e-06, "loss": 0.7179, "step": 650 }, { "epoch": 2.3741007194244603, "grad_norm": 0.6830216482631195, "learning_rate": 5e-06, "loss": 0.7208, "step": 660 }, { "epoch": 2.4100719424460433, "grad_norm": 0.580810416113199, "learning_rate": 5e-06, "loss": 0.7201, "step": 670 }, { "epoch": 2.446043165467626, "grad_norm": 0.7709663647446697, "learning_rate": 5e-06, "loss": 0.7165, "step": 680 }, { "epoch": 2.4820143884892087, "grad_norm": 0.6587806242655105, "learning_rate": 5e-06, "loss": 0.7199, "step": 690 }, { "epoch": 2.5179856115107913, "grad_norm": 0.6679031168226195, "learning_rate": 5e-06, "loss": 0.7228, "step": 700 }, { "epoch": 2.553956834532374, "grad_norm": 0.5802019851320436, "learning_rate": 5e-06, "loss": 0.7211, "step": 710 }, { "epoch": 2.5899280575539567, "grad_norm": 0.633360775543426, "learning_rate": 5e-06, "loss": 0.7192, "step": 720 }, { "epoch": 2.6258992805755397, "grad_norm": 0.7014721250700231, "learning_rate": 5e-06, "loss": 0.7208, "step": 730 }, { "epoch": 2.661870503597122, "grad_norm": 0.5972726636881343, "learning_rate": 5e-06, "loss": 0.7184, "step": 740 }, { "epoch": 2.697841726618705, "grad_norm": 0.5454556975289979, "learning_rate": 5e-06, "loss": 0.7139, "step": 750 }, { "epoch": 2.7338129496402876, "grad_norm": 0.5626224999737693, "learning_rate": 5e-06, "loss": 0.7207, "step": 760 }, { "epoch": 2.7697841726618706, "grad_norm": 0.5106193565014756, "learning_rate": 5e-06, "loss": 0.7193, "step": 770 }, { "epoch": 2.805755395683453, "grad_norm": 0.6138738602878809, "learning_rate": 5e-06, "loss": 0.7185, "step": 780 }, { "epoch": 2.841726618705036, "grad_norm": 0.6093685279993987, "learning_rate": 5e-06, "loss": 0.7217, "step": 790 }, { "epoch": 2.8776978417266186, "grad_norm": 0.5564883285882788, "learning_rate": 5e-06, "loss": 0.7213, "step": 800 }, { "epoch": 2.9136690647482015, "grad_norm": 0.5906548449538034, "learning_rate": 5e-06, "loss": 0.7183, "step": 810 }, { "epoch": 2.949640287769784, "grad_norm": 0.5460219561244413, "learning_rate": 5e-06, "loss": 0.7216, "step": 820 }, { "epoch": 2.985611510791367, "grad_norm": 0.6453368774762195, "learning_rate": 5e-06, "loss": 0.7198, "step": 830 }, { "epoch": 3.0, "eval_loss": 0.7752296328544617, "eval_runtime": 27.5746, "eval_samples_per_second": 271.409, "eval_steps_per_second": 1.088, "step": 834 }, { "epoch": 3.0, "step": 834, "total_flos": 1396981062696960.0, "train_loss": 0.7675551453368555, "train_runtime": 5571.5313, "train_samples_per_second": 76.563, "train_steps_per_second": 0.15 } ], "logging_steps": 10, "max_steps": 834, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1396981062696960.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }