{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5945303210463734, "eval_steps": 50, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023781212841854936, "grad_norm": 11.536224365234375, "learning_rate": 3e-05, "loss": 5.0938, "step": 10 }, { "epoch": 0.04756242568370987, "grad_norm": 3.4980711936950684, "learning_rate": 0.00013000000000000002, "loss": 3.1992, "step": 20 }, { "epoch": 0.0713436385255648, "grad_norm": 2.206911325454712, "learning_rate": 0.00019951729686242962, "loss": 2.0348, "step": 30 }, { "epoch": 0.09512485136741974, "grad_norm": 1.7694568634033203, "learning_rate": 0.00019790828640386162, "loss": 1.9145, "step": 40 }, { "epoch": 0.11890606420927467, "grad_norm": 1.6293410062789917, "learning_rate": 0.00019629927594529365, "loss": 1.7651, "step": 50 }, { "epoch": 0.1426872770511296, "grad_norm": 1.8220889568328857, "learning_rate": 0.00019469026548672567, "loss": 1.8897, "step": 60 }, { "epoch": 0.16646848989298454, "grad_norm": 1.842227578163147, "learning_rate": 0.0001930812550281577, "loss": 1.8391, "step": 70 }, { "epoch": 0.1902497027348395, "grad_norm": 1.6302684545516968, "learning_rate": 0.00019147224456958972, "loss": 1.7881, "step": 80 }, { "epoch": 0.2140309155766944, "grad_norm": 1.7633960247039795, "learning_rate": 0.00018986323411102174, "loss": 1.7444, "step": 90 }, { "epoch": 0.23781212841854935, "grad_norm": 1.9522314071655273, "learning_rate": 0.00018825422365245377, "loss": 1.6647, "step": 100 }, { "epoch": 0.2615933412604043, "grad_norm": 1.5591909885406494, "learning_rate": 0.00018664521319388576, "loss": 1.6839, "step": 110 }, { "epoch": 0.2853745541022592, "grad_norm": 1.9646824598312378, "learning_rate": 0.00018503620273531779, "loss": 1.7177, "step": 120 }, { "epoch": 0.3091557669441142, "grad_norm": 2.0045852661132812, "learning_rate": 0.0001834271922767498, "loss": 1.7352, "step": 130 }, { "epoch": 0.3329369797859691, "grad_norm": 1.6761493682861328, "learning_rate": 0.00018181818181818183, "loss": 1.5562, "step": 140 }, { "epoch": 0.356718192627824, "grad_norm": 1.720191478729248, "learning_rate": 0.00018020917135961383, "loss": 1.8305, "step": 150 }, { "epoch": 0.380499405469679, "grad_norm": 1.689537763595581, "learning_rate": 0.00017860016090104586, "loss": 1.6121, "step": 160 }, { "epoch": 0.4042806183115339, "grad_norm": 1.6469579935073853, "learning_rate": 0.0001769911504424779, "loss": 1.5827, "step": 170 }, { "epoch": 0.4280618311533888, "grad_norm": 1.637831449508667, "learning_rate": 0.0001753821399839099, "loss": 1.6039, "step": 180 }, { "epoch": 0.4518430439952438, "grad_norm": 2.1786320209503174, "learning_rate": 0.00017377312952534193, "loss": 1.7008, "step": 190 }, { "epoch": 0.4756242568370987, "grad_norm": 1.634881615638733, "learning_rate": 0.00017216411906677395, "loss": 1.5673, "step": 200 }, { "epoch": 0.4994054696789536, "grad_norm": 1.66987144947052, "learning_rate": 0.00017055510860820595, "loss": 1.5921, "step": 210 }, { "epoch": 0.5231866825208086, "grad_norm": 1.7795851230621338, "learning_rate": 0.00016894609814963797, "loss": 1.7279, "step": 220 }, { "epoch": 0.5469678953626635, "grad_norm": 1.574320673942566, "learning_rate": 0.00016733708769107, "loss": 1.5779, "step": 230 }, { "epoch": 0.5707491082045184, "grad_norm": 1.588630199432373, "learning_rate": 0.00016572807723250202, "loss": 1.6371, "step": 240 }, { "epoch": 0.5945303210463734, "grad_norm": 1.5262938737869263, 
"learning_rate": 0.00016411906677393404, "loss": 1.6228, "step": 250 } ], "logging_steps": 10, "max_steps": 1263, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.882807873959104e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }