dixedus's picture
Training in progress, step 100, checkpoint
d4e8a96 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7194244604316546,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007194244604316547,
"eval_loss": 1.3505613803863525,
"eval_runtime": 18.9626,
"eval_samples_per_second": 12.34,
"eval_steps_per_second": 1.582,
"step": 1
},
{
"epoch": 0.02158273381294964,
"grad_norm": 7.126910209655762,
"learning_rate": 1.5e-05,
"loss": 5.4633,
"step": 3
},
{
"epoch": 0.04316546762589928,
"grad_norm": 5.5504536628723145,
"learning_rate": 3e-05,
"loss": 5.4233,
"step": 6
},
{
"epoch": 0.06474820143884892,
"grad_norm": 4.061417579650879,
"learning_rate": 4.5e-05,
"loss": 5.131,
"step": 9
},
{
"epoch": 0.06474820143884892,
"eval_loss": 1.1897938251495361,
"eval_runtime": 19.3146,
"eval_samples_per_second": 12.115,
"eval_steps_per_second": 1.553,
"step": 9
},
{
"epoch": 0.08633093525179857,
"grad_norm": 2.9950151443481445,
"learning_rate": 4.993910125649561e-05,
"loss": 4.8732,
"step": 12
},
{
"epoch": 0.1079136690647482,
"grad_norm": 3.6927051544189453,
"learning_rate": 4.962019382530521e-05,
"loss": 4.6017,
"step": 15
},
{
"epoch": 0.12949640287769784,
"grad_norm": 3.0494635105133057,
"learning_rate": 4.9031542398457974e-05,
"loss": 4.6042,
"step": 18
},
{
"epoch": 0.12949640287769784,
"eval_loss": 1.0919393301010132,
"eval_runtime": 19.3623,
"eval_samples_per_second": 12.085,
"eval_steps_per_second": 1.549,
"step": 18
},
{
"epoch": 0.1510791366906475,
"grad_norm": 2.4984405040740967,
"learning_rate": 4.817959636416969e-05,
"loss": 4.4278,
"step": 21
},
{
"epoch": 0.17266187050359713,
"grad_norm": 2.3186981678009033,
"learning_rate": 4.707368982147318e-05,
"loss": 4.4248,
"step": 24
},
{
"epoch": 0.19424460431654678,
"grad_norm": 2.286144733428955,
"learning_rate": 4.572593931387604e-05,
"loss": 4.4557,
"step": 27
},
{
"epoch": 0.19424460431654678,
"eval_loss": 1.0573580265045166,
"eval_runtime": 19.3833,
"eval_samples_per_second": 12.072,
"eval_steps_per_second": 1.548,
"step": 27
},
{
"epoch": 0.2158273381294964,
"grad_norm": 2.0901005268096924,
"learning_rate": 4.415111107797445e-05,
"loss": 4.397,
"step": 30
},
{
"epoch": 0.23741007194244604,
"grad_norm": 3.103538751602173,
"learning_rate": 4.2366459261474933e-05,
"loss": 4.2221,
"step": 33
},
{
"epoch": 0.2589928057553957,
"grad_norm": 1.9494324922561646,
"learning_rate": 4.039153688314145e-05,
"loss": 4.3688,
"step": 36
},
{
"epoch": 0.2589928057553957,
"eval_loss": 1.0416053533554077,
"eval_runtime": 19.3673,
"eval_samples_per_second": 12.082,
"eval_steps_per_second": 1.549,
"step": 36
},
{
"epoch": 0.2805755395683453,
"grad_norm": 2.292299270629883,
"learning_rate": 3.824798160583012e-05,
"loss": 4.446,
"step": 39
},
{
"epoch": 0.302158273381295,
"grad_norm": 2.913973808288574,
"learning_rate": 3.5959278669726935e-05,
"loss": 4.0109,
"step": 42
},
{
"epoch": 0.3237410071942446,
"grad_norm": 2.1814777851104736,
"learning_rate": 3.355050358314172e-05,
"loss": 4.2421,
"step": 45
},
{
"epoch": 0.3237410071942446,
"eval_loss": 1.0296789407730103,
"eval_runtime": 19.3399,
"eval_samples_per_second": 12.099,
"eval_steps_per_second": 1.551,
"step": 45
},
{
"epoch": 0.34532374100719426,
"grad_norm": 2.3433644771575928,
"learning_rate": 3.104804738999169e-05,
"loss": 4.616,
"step": 48
},
{
"epoch": 0.3669064748201439,
"grad_norm": 1.8417110443115234,
"learning_rate": 2.8479327524001636e-05,
"loss": 3.9965,
"step": 51
},
{
"epoch": 0.38848920863309355,
"grad_norm": 2.258572578430176,
"learning_rate": 2.587248741756253e-05,
"loss": 4.2638,
"step": 54
},
{
"epoch": 0.38848920863309355,
"eval_loss": 1.0222728252410889,
"eval_runtime": 19.3615,
"eval_samples_per_second": 12.086,
"eval_steps_per_second": 1.549,
"step": 54
},
{
"epoch": 0.41007194244604317,
"grad_norm": 2.020596504211426,
"learning_rate": 2.3256088156396868e-05,
"loss": 4.1487,
"step": 57
},
{
"epoch": 0.4316546762589928,
"grad_norm": 1.7905324697494507,
"learning_rate": 2.0658795558326743e-05,
"loss": 4.1138,
"step": 60
},
{
"epoch": 0.45323741007194246,
"grad_norm": 2.2661585807800293,
"learning_rate": 1.8109066104575023e-05,
"loss": 4.1022,
"step": 63
},
{
"epoch": 0.45323741007194246,
"eval_loss": 1.0169346332550049,
"eval_runtime": 19.3659,
"eval_samples_per_second": 12.083,
"eval_steps_per_second": 1.549,
"step": 63
},
{
"epoch": 0.4748201438848921,
"grad_norm": 2.1625380516052246,
"learning_rate": 1.56348351646022e-05,
"loss": 4.3836,
"step": 66
},
{
"epoch": 0.49640287769784175,
"grad_norm": 2.1454732418060303,
"learning_rate": 1.3263210930352737e-05,
"loss": 4.2899,
"step": 69
},
{
"epoch": 0.5179856115107914,
"grad_norm": 2.406993865966797,
"learning_rate": 1.1020177413231334e-05,
"loss": 3.9958,
"step": 72
},
{
"epoch": 0.5179856115107914,
"eval_loss": 1.0135488510131836,
"eval_runtime": 19.3546,
"eval_samples_per_second": 12.09,
"eval_steps_per_second": 1.55,
"step": 72
},
{
"epoch": 0.539568345323741,
"grad_norm": 2.6071054935455322,
"learning_rate": 8.930309757836517e-06,
"loss": 4.2177,
"step": 75
},
{
"epoch": 0.5611510791366906,
"grad_norm": 2.0609230995178223,
"learning_rate": 7.016504991533726e-06,
"loss": 4.2697,
"step": 78
},
{
"epoch": 0.5827338129496403,
"grad_norm": 2.143162250518799,
"learning_rate": 5.299731159831953e-06,
"loss": 4.122,
"step": 81
},
{
"epoch": 0.5827338129496403,
"eval_loss": 1.011610507965088,
"eval_runtime": 19.3757,
"eval_samples_per_second": 12.077,
"eval_steps_per_second": 1.548,
"step": 81
},
{
"epoch": 0.60431654676259,
"grad_norm": 1.9092354774475098,
"learning_rate": 3.798797596089351e-06,
"loss": 3.9762,
"step": 84
},
{
"epoch": 0.6258992805755396,
"grad_norm": 2.086115837097168,
"learning_rate": 2.5301488425208296e-06,
"loss": 4.1775,
"step": 87
},
{
"epoch": 0.6474820143884892,
"grad_norm": 2.177617311477661,
"learning_rate": 1.5076844803522922e-06,
"loss": 4.2366,
"step": 90
},
{
"epoch": 0.6474820143884892,
"eval_loss": 1.0106626749038696,
"eval_runtime": 19.3765,
"eval_samples_per_second": 12.076,
"eval_steps_per_second": 1.548,
"step": 90
},
{
"epoch": 0.6690647482014388,
"grad_norm": 2.176861047744751,
"learning_rate": 7.426068431000882e-07,
"loss": 4.1401,
"step": 93
},
{
"epoch": 0.6906474820143885,
"grad_norm": 1.9706419706344604,
"learning_rate": 2.4329828146074095e-07,
"loss": 3.9663,
"step": 96
},
{
"epoch": 0.7122302158273381,
"grad_norm": 2.0750808715820312,
"learning_rate": 1.522932452260595e-08,
"loss": 4.1725,
"step": 99
},
{
"epoch": 0.7122302158273381,
"eval_loss": 1.0105128288269043,
"eval_runtime": 19.3722,
"eval_samples_per_second": 12.079,
"eval_steps_per_second": 1.549,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.708604681795666e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}