{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7194244604316546, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007194244604316547, "eval_loss": 1.3505613803863525, "eval_runtime": 18.9626, "eval_samples_per_second": 12.34, "eval_steps_per_second": 1.582, "step": 1 }, { "epoch": 0.02158273381294964, "grad_norm": 7.126910209655762, "learning_rate": 1.5e-05, "loss": 5.4633, "step": 3 }, { "epoch": 0.04316546762589928, "grad_norm": 5.5504536628723145, "learning_rate": 3e-05, "loss": 5.4233, "step": 6 }, { "epoch": 0.06474820143884892, "grad_norm": 4.061417579650879, "learning_rate": 4.5e-05, "loss": 5.131, "step": 9 }, { "epoch": 0.06474820143884892, "eval_loss": 1.1897938251495361, "eval_runtime": 19.3146, "eval_samples_per_second": 12.115, "eval_steps_per_second": 1.553, "step": 9 }, { "epoch": 0.08633093525179857, "grad_norm": 2.9950151443481445, "learning_rate": 4.993910125649561e-05, "loss": 4.8732, "step": 12 }, { "epoch": 0.1079136690647482, "grad_norm": 3.6927051544189453, "learning_rate": 4.962019382530521e-05, "loss": 4.6017, "step": 15 }, { "epoch": 0.12949640287769784, "grad_norm": 3.0494635105133057, "learning_rate": 4.9031542398457974e-05, "loss": 4.6042, "step": 18 }, { "epoch": 0.12949640287769784, "eval_loss": 1.0919393301010132, "eval_runtime": 19.3623, "eval_samples_per_second": 12.085, "eval_steps_per_second": 1.549, "step": 18 }, { "epoch": 0.1510791366906475, "grad_norm": 2.4984405040740967, "learning_rate": 4.817959636416969e-05, "loss": 4.4278, "step": 21 }, { "epoch": 0.17266187050359713, "grad_norm": 2.3186981678009033, "learning_rate": 4.707368982147318e-05, "loss": 4.4248, "step": 24 }, { "epoch": 0.19424460431654678, "grad_norm": 2.286144733428955, "learning_rate": 4.572593931387604e-05, "loss": 4.4557, "step": 27 }, { "epoch": 0.19424460431654678, "eval_loss": 1.0573580265045166, "eval_runtime": 19.3833, "eval_samples_per_second": 12.072, "eval_steps_per_second": 1.548, "step": 27 }, { "epoch": 0.2158273381294964, "grad_norm": 2.0901005268096924, "learning_rate": 4.415111107797445e-05, "loss": 4.397, "step": 30 }, { "epoch": 0.23741007194244604, "grad_norm": 3.103538751602173, "learning_rate": 4.2366459261474933e-05, "loss": 4.2221, "step": 33 }, { "epoch": 0.2589928057553957, "grad_norm": 1.9494324922561646, "learning_rate": 4.039153688314145e-05, "loss": 4.3688, "step": 36 }, { "epoch": 0.2589928057553957, "eval_loss": 1.0416053533554077, "eval_runtime": 19.3673, "eval_samples_per_second": 12.082, "eval_steps_per_second": 1.549, "step": 36 }, { "epoch": 0.2805755395683453, "grad_norm": 2.292299270629883, "learning_rate": 3.824798160583012e-05, "loss": 4.446, "step": 39 }, { "epoch": 0.302158273381295, "grad_norm": 2.913973808288574, "learning_rate": 3.5959278669726935e-05, "loss": 4.0109, "step": 42 }, { "epoch": 0.3237410071942446, "grad_norm": 2.1814777851104736, "learning_rate": 3.355050358314172e-05, "loss": 4.2421, "step": 45 }, { "epoch": 0.3237410071942446, "eval_loss": 1.0296789407730103, "eval_runtime": 19.3399, "eval_samples_per_second": 12.099, "eval_steps_per_second": 1.551, "step": 45 }, { "epoch": 0.34532374100719426, "grad_norm": 2.3433644771575928, "learning_rate": 3.104804738999169e-05, "loss": 4.616, "step": 48 }, { "epoch": 0.3669064748201439, "grad_norm": 1.8417110443115234, "learning_rate": 2.8479327524001636e-05, "loss": 3.9965, "step": 51 }, { "epoch": 0.38848920863309355, "grad_norm": 2.258572578430176, "learning_rate": 2.587248741756253e-05, "loss": 4.2638, "step": 54 }, { "epoch": 0.38848920863309355, "eval_loss": 1.0222728252410889, "eval_runtime": 19.3615, "eval_samples_per_second": 12.086, "eval_steps_per_second": 1.549, "step": 54 }, { "epoch": 0.41007194244604317, "grad_norm": 2.020596504211426, "learning_rate": 2.3256088156396868e-05, "loss": 4.1487, "step": 57 }, { "epoch": 0.4316546762589928, "grad_norm": 1.7905324697494507, "learning_rate": 2.0658795558326743e-05, "loss": 4.1138, "step": 60 }, { "epoch": 0.45323741007194246, "grad_norm": 2.2661585807800293, "learning_rate": 1.8109066104575023e-05, "loss": 4.1022, "step": 63 }, { "epoch": 0.45323741007194246, "eval_loss": 1.0169346332550049, "eval_runtime": 19.3659, "eval_samples_per_second": 12.083, "eval_steps_per_second": 1.549, "step": 63 }, { "epoch": 0.4748201438848921, "grad_norm": 2.1625380516052246, "learning_rate": 1.56348351646022e-05, "loss": 4.3836, "step": 66 }, { "epoch": 0.49640287769784175, "grad_norm": 2.1454732418060303, "learning_rate": 1.3263210930352737e-05, "loss": 4.2899, "step": 69 }, { "epoch": 0.5179856115107914, "grad_norm": 2.406993865966797, "learning_rate": 1.1020177413231334e-05, "loss": 3.9958, "step": 72 }, { "epoch": 0.5179856115107914, "eval_loss": 1.0135488510131836, "eval_runtime": 19.3546, "eval_samples_per_second": 12.09, "eval_steps_per_second": 1.55, "step": 72 }, { "epoch": 0.539568345323741, "grad_norm": 2.6071054935455322, "learning_rate": 8.930309757836517e-06, "loss": 4.2177, "step": 75 }, { "epoch": 0.5611510791366906, "grad_norm": 2.0609230995178223, "learning_rate": 7.016504991533726e-06, "loss": 4.2697, "step": 78 }, { "epoch": 0.5827338129496403, "grad_norm": 2.143162250518799, "learning_rate": 5.299731159831953e-06, "loss": 4.122, "step": 81 }, { "epoch": 0.5827338129496403, "eval_loss": 1.011610507965088, "eval_runtime": 19.3757, "eval_samples_per_second": 12.077, "eval_steps_per_second": 1.548, "step": 81 }, { "epoch": 0.60431654676259, "grad_norm": 1.9092354774475098, "learning_rate": 3.798797596089351e-06, "loss": 3.9762, "step": 84 }, { "epoch": 0.6258992805755396, "grad_norm": 2.086115837097168, "learning_rate": 2.5301488425208296e-06, "loss": 4.1775, "step": 87 }, { "epoch": 0.6474820143884892, "grad_norm": 2.177617311477661, "learning_rate": 1.5076844803522922e-06, "loss": 4.2366, "step": 90 }, { "epoch": 0.6474820143884892, "eval_loss": 1.0106626749038696, "eval_runtime": 19.3765, "eval_samples_per_second": 12.076, "eval_steps_per_second": 1.548, "step": 90 }, { "epoch": 0.6690647482014388, "grad_norm": 2.176861047744751, "learning_rate": 7.426068431000882e-07, "loss": 4.1401, "step": 93 }, { "epoch": 0.6906474820143885, "grad_norm": 1.9706419706344604, "learning_rate": 2.4329828146074095e-07, "loss": 3.9663, "step": 96 }, { "epoch": 0.7122302158273381, "grad_norm": 2.0750808715820312, "learning_rate": 1.522932452260595e-08, "loss": 4.1725, "step": 99 }, { "epoch": 0.7122302158273381, "eval_loss": 1.0105128288269043, "eval_runtime": 19.3722, "eval_samples_per_second": 12.079, "eval_steps_per_second": 1.549, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.708604681795666e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }