|
{ |
|
"best_metric": 0.7502281069755554, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.6-34b-chatml_direct-anyres/checkpoint-115", |
|
"epoch": 3.59375, |
|
"eval_steps": 1.0, |
|
"global_step": 115, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 0.3345325833952841, |
|
"learning_rate": 0.0, |
|
"loss": 1.2677, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.3042412996292114, |
|
"eval_runtime": 195.8656, |
|
"eval_samples_per_second": 1.021, |
|
"eval_steps_per_second": 0.128, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.35890614982474045, |
|
"learning_rate": 8.613531161467863e-06, |
|
"loss": 1.3505, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.3042412996292114, |
|
"eval_runtime": 190.4145, |
|
"eval_samples_per_second": 1.05, |
|
"eval_steps_per_second": 0.131, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.3316701046946783, |
|
"learning_rate": 1.3652123889719709e-05, |
|
"loss": 1.2569, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.2942205667495728, |
|
"eval_runtime": 190.6475, |
|
"eval_samples_per_second": 1.049, |
|
"eval_steps_per_second": 0.131, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.32188096642751235, |
|
"learning_rate": 1.7227062322935725e-05, |
|
"loss": 1.2323, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.2789555788040161, |
|
"eval_runtime": 189.7666, |
|
"eval_samples_per_second": 1.054, |
|
"eval_steps_per_second": 0.132, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.3767527705004001, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2785, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.258152723312378, |
|
"eval_runtime": 189.5935, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.132, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.3287126070774628, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2151, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.2347216606140137, |
|
"eval_runtime": 190.4111, |
|
"eval_samples_per_second": 1.05, |
|
"eval_steps_per_second": 0.131, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.34451127286331007, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2968, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.210167646408081, |
|
"eval_runtime": 190.9799, |
|
"eval_samples_per_second": 1.047, |
|
"eval_steps_per_second": 0.131, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.36105870692958336, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2277, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.1862907409667969, |
|
"eval_runtime": 190.685, |
|
"eval_samples_per_second": 1.049, |
|
"eval_steps_per_second": 0.131, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.35460549637546845, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2101, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.1649302244186401, |
|
"eval_runtime": 190.1569, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.131, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.3134923556618721, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1163, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.144965410232544, |
|
"eval_runtime": 190.0982, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.132, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.3069481492118633, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1483, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.124668002128601, |
|
"eval_runtime": 192.0572, |
|
"eval_samples_per_second": 1.041, |
|
"eval_steps_per_second": 0.13, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.2801324709168811, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1172, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.1061824560165405, |
|
"eval_runtime": 192.6406, |
|
"eval_samples_per_second": 1.038, |
|
"eval_steps_per_second": 0.13, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.33156251919932406, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1902, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.0897018909454346, |
|
"eval_runtime": 192.8064, |
|
"eval_samples_per_second": 1.037, |
|
"eval_steps_per_second": 0.13, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.3307149375898363, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1014, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.075058937072754, |
|
"eval_runtime": 192.1353, |
|
"eval_samples_per_second": 1.041, |
|
"eval_steps_per_second": 0.13, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.31999611930431227, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0847, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.0613062381744385, |
|
"eval_runtime": 192.0586, |
|
"eval_samples_per_second": 1.041, |
|
"eval_steps_per_second": 0.13, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2494159223446848, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0428, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0484405755996704, |
|
"eval_runtime": 192.52, |
|
"eval_samples_per_second": 1.039, |
|
"eval_steps_per_second": 0.13, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.2899303168196212, |
|
"learning_rate": 2e-05, |
|
"loss": 1.122, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.036120891571045, |
|
"eval_runtime": 192.5716, |
|
"eval_samples_per_second": 1.039, |
|
"eval_steps_per_second": 0.13, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.2995776829874209, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0425, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 1.0226774215698242, |
|
"eval_runtime": 192.5256, |
|
"eval_samples_per_second": 1.039, |
|
"eval_steps_per_second": 0.13, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.28709859243892955, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0098, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 1.0081837177276611, |
|
"eval_runtime": 192.3486, |
|
"eval_samples_per_second": 1.04, |
|
"eval_steps_per_second": 0.13, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.27612474678791227, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0563, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 0.994163990020752, |
|
"eval_runtime": 191.9782, |
|
"eval_samples_per_second": 1.042, |
|
"eval_steps_per_second": 0.13, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.24260720679126926, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0355, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9819543361663818, |
|
"eval_runtime": 191.9306, |
|
"eval_samples_per_second": 1.042, |
|
"eval_steps_per_second": 0.13, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.25336536603884946, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0525, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.9709838032722473, |
|
"eval_runtime": 192.9913, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.13, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.24820839136364292, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1392, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.9616628885269165, |
|
"eval_runtime": 192.6673, |
|
"eval_samples_per_second": 1.038, |
|
"eval_steps_per_second": 0.13, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.24589291203527217, |
|
"learning_rate": 2e-05, |
|
"loss": 1.058, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9531083703041077, |
|
"eval_runtime": 193.0994, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.129, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.249532642718915, |
|
"learning_rate": 2e-05, |
|
"loss": 0.938, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.9455437660217285, |
|
"eval_runtime": 191.9941, |
|
"eval_samples_per_second": 1.042, |
|
"eval_steps_per_second": 0.13, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.28034242585086017, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9387, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.93752121925354, |
|
"eval_runtime": 195.4083, |
|
"eval_samples_per_second": 1.023, |
|
"eval_steps_per_second": 0.128, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.2692565070546352, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0474, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9300512075424194, |
|
"eval_runtime": 195.3651, |
|
"eval_samples_per_second": 1.024, |
|
"eval_steps_per_second": 0.128, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.24705041646949316, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9596, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9226720929145813, |
|
"eval_runtime": 195.4848, |
|
"eval_samples_per_second": 1.023, |
|
"eval_steps_per_second": 0.128, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.24799871352606165, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0172, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.9159422516822815, |
|
"eval_runtime": 196.157, |
|
"eval_samples_per_second": 1.02, |
|
"eval_steps_per_second": 0.127, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.29755264040904106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9324, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.9090733528137207, |
|
"eval_runtime": 196.5295, |
|
"eval_samples_per_second": 1.018, |
|
"eval_steps_per_second": 0.127, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.2629221961008751, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9265, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.9027940630912781, |
|
"eval_runtime": 196.281, |
|
"eval_samples_per_second": 1.019, |
|
"eval_steps_per_second": 0.127, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2901110704218056, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9933, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8970211148262024, |
|
"eval_runtime": 190.2702, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.27746608883483487, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9339, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.8916085958480835, |
|
"eval_runtime": 189.4543, |
|
"eval_samples_per_second": 1.056, |
|
"eval_steps_per_second": 0.132, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.26134437145600353, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9438, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.8867039680480957, |
|
"eval_runtime": 189.6926, |
|
"eval_samples_per_second": 1.054, |
|
"eval_steps_per_second": 0.132, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.252882507519195, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8979, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.8824067711830139, |
|
"eval_runtime": 189.9217, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.25443025949474585, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9411, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.8788293600082397, |
|
"eval_runtime": 191.4083, |
|
"eval_samples_per_second": 1.045, |
|
"eval_steps_per_second": 0.131, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.2559343621244427, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9827, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.8760793805122375, |
|
"eval_runtime": 191.2732, |
|
"eval_samples_per_second": 1.046, |
|
"eval_steps_per_second": 0.131, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.25403189851366254, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8658, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8727380633354187, |
|
"eval_runtime": 190.4281, |
|
"eval_samples_per_second": 1.05, |
|
"eval_steps_per_second": 0.131, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.2493777578005398, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0053, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.869698703289032, |
|
"eval_runtime": 190.3431, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.24823573574563138, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8967, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8664910793304443, |
|
"eval_runtime": 189.9802, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.25462243237743476, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0064, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.8638657927513123, |
|
"eval_runtime": 195.3373, |
|
"eval_samples_per_second": 1.024, |
|
"eval_steps_per_second": 0.128, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.2604089386111215, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9898, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8607734441757202, |
|
"eval_runtime": 195.219, |
|
"eval_samples_per_second": 1.024, |
|
"eval_steps_per_second": 0.128, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.27139202440805793, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0539, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8573687672615051, |
|
"eval_runtime": 195.8828, |
|
"eval_samples_per_second": 1.021, |
|
"eval_steps_per_second": 0.128, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.27474433057157854, |
|
"learning_rate": 2e-05, |
|
"loss": 0.86, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.8537396192550659, |
|
"eval_runtime": 194.9741, |
|
"eval_samples_per_second": 1.026, |
|
"eval_steps_per_second": 0.128, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.2537208760747199, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9562, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.8497809767723083, |
|
"eval_runtime": 194.9162, |
|
"eval_samples_per_second": 1.026, |
|
"eval_steps_per_second": 0.128, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.27560461131090846, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8767, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8458660244941711, |
|
"eval_runtime": 195.1157, |
|
"eval_samples_per_second": 1.025, |
|
"eval_steps_per_second": 0.128, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.2594536794112662, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9256, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8429936766624451, |
|
"eval_runtime": 195.048, |
|
"eval_samples_per_second": 1.025, |
|
"eval_steps_per_second": 0.128, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.28583207453838866, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9858, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.84013831615448, |
|
"eval_runtime": 194.3046, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.28118976636788506, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9158, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8369531035423279, |
|
"eval_runtime": 194.521, |
|
"eval_samples_per_second": 1.028, |
|
"eval_steps_per_second": 0.129, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.29276573776696546, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8745, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.8341982960700989, |
|
"eval_runtime": 194.1114, |
|
"eval_samples_per_second": 1.03, |
|
"eval_steps_per_second": 0.129, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.2860638141439372, |
|
"learning_rate": 2e-05, |
|
"loss": 0.854, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8317239284515381, |
|
"eval_runtime": 198.1029, |
|
"eval_samples_per_second": 1.01, |
|
"eval_steps_per_second": 0.126, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.29960349722496704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8399, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.8290513753890991, |
|
"eval_runtime": 198.0764, |
|
"eval_samples_per_second": 1.01, |
|
"eval_steps_per_second": 0.126, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.2964234305808419, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9694, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8267760276794434, |
|
"eval_runtime": 197.8284, |
|
"eval_samples_per_second": 1.011, |
|
"eval_steps_per_second": 0.126, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.26183932644077784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8153, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.824044942855835, |
|
"eval_runtime": 198.1694, |
|
"eval_samples_per_second": 1.009, |
|
"eval_steps_per_second": 0.126, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.3067024314453144, |
|
"learning_rate": 2e-05, |
|
"loss": 0.883, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.8216392397880554, |
|
"eval_runtime": 198.0249, |
|
"eval_samples_per_second": 1.01, |
|
"eval_steps_per_second": 0.126, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.27888658705355013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8771, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8194215297698975, |
|
"eval_runtime": 195.4688, |
|
"eval_samples_per_second": 1.023, |
|
"eval_steps_per_second": 0.128, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.32571765544245934, |
|
"learning_rate": 2e-05, |
|
"loss": 0.897, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.8167170882225037, |
|
"eval_runtime": 189.6243, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.132, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.292216058855145, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9277, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.8145509958267212, |
|
"eval_runtime": 190.2429, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.29002612820437024, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8971, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.8122230768203735, |
|
"eval_runtime": 189.9403, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.2926088029288858, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9225, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.8100479245185852, |
|
"eval_runtime": 190.2569, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.30068993111077397, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9134, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.808087944984436, |
|
"eval_runtime": 192.4896, |
|
"eval_samples_per_second": 1.039, |
|
"eval_steps_per_second": 0.13, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.3157573686768343, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8965, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.8057371377944946, |
|
"eval_runtime": 190.0158, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.31215592754506605, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7828, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.8031384944915771, |
|
"eval_runtime": 189.5204, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.132, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.29422828766227993, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8196, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.8012601733207703, |
|
"eval_runtime": 189.7041, |
|
"eval_samples_per_second": 1.054, |
|
"eval_steps_per_second": 0.132, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.2885449518895793, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9715, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.8001161813735962, |
|
"eval_runtime": 189.57, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.132, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.30260184063348483, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7912, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.7989436388015747, |
|
"eval_runtime": 193.0193, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.13, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.32650294605024255, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8176, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.7972333431243896, |
|
"eval_runtime": 193.2225, |
|
"eval_samples_per_second": 1.035, |
|
"eval_steps_per_second": 0.129, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.3382679480741134, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8141, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7950598001480103, |
|
"eval_runtime": 193.2781, |
|
"eval_samples_per_second": 1.035, |
|
"eval_steps_per_second": 0.129, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.3094090784935889, |
|
"learning_rate": 2e-05, |
|
"loss": 0.796, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7932476997375488, |
|
"eval_runtime": 193.0282, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.13, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.30209558834780514, |
|
"learning_rate": 2e-05, |
|
"loss": 0.927, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7922118902206421, |
|
"eval_runtime": 192.8284, |
|
"eval_samples_per_second": 1.037, |
|
"eval_steps_per_second": 0.13, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.35958652905266686, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8224, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7909810543060303, |
|
"eval_runtime": 202.3128, |
|
"eval_samples_per_second": 0.989, |
|
"eval_steps_per_second": 0.124, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.356338004067507, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8376, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7894656658172607, |
|
"eval_runtime": 195.1481, |
|
"eval_samples_per_second": 1.025, |
|
"eval_steps_per_second": 0.128, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.31886905989727465, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8688, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7889463901519775, |
|
"eval_runtime": 194.8418, |
|
"eval_samples_per_second": 1.026, |
|
"eval_steps_per_second": 0.128, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.35606342918056466, |
|
"learning_rate": 2e-05, |
|
"loss": 0.835, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7875587344169617, |
|
"eval_runtime": 194.7701, |
|
"eval_samples_per_second": 1.027, |
|
"eval_steps_per_second": 0.128, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.3161858862696026, |
|
"learning_rate": 2e-05, |
|
"loss": 0.873, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7863460779190063, |
|
"eval_runtime": 195.4811, |
|
"eval_samples_per_second": 1.023, |
|
"eval_steps_per_second": 0.128, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.35771781884741477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9021, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.7847577929496765, |
|
"eval_runtime": 195.3724, |
|
"eval_samples_per_second": 1.024, |
|
"eval_steps_per_second": 0.128, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.3549789155823785, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9195, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.783415675163269, |
|
"eval_runtime": 190.226, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.34734314309709374, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8386, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7814657688140869, |
|
"eval_runtime": 190.3177, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.35540762574897183, |
|
"learning_rate": 2e-05, |
|
"loss": 0.851, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7798058390617371, |
|
"eval_runtime": 190.7447, |
|
"eval_samples_per_second": 1.049, |
|
"eval_steps_per_second": 0.131, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.3844458514717174, |
|
"learning_rate": 2e-05, |
|
"loss": 0.767, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7777827978134155, |
|
"eval_runtime": 190.8548, |
|
"eval_samples_per_second": 1.048, |
|
"eval_steps_per_second": 0.131, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.36232344175264375, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8508, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7763205170631409, |
|
"eval_runtime": 190.335, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.36279843147857743, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8331, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.7757676839828491, |
|
"eval_runtime": 190.9559, |
|
"eval_samples_per_second": 1.047, |
|
"eval_steps_per_second": 0.131, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.395360566032837, |
|
"learning_rate": 2e-05, |
|
"loss": 0.847, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.7743326425552368, |
|
"eval_runtime": 190.6372, |
|
"eval_samples_per_second": 1.049, |
|
"eval_steps_per_second": 0.131, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.4268568783791123, |
|
"learning_rate": 2e-05, |
|
"loss": 0.869, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.772053062915802, |
|
"eval_runtime": 190.2072, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.131, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.3581495538253167, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8591, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7711917757987976, |
|
"eval_runtime": 190.4392, |
|
"eval_samples_per_second": 1.05, |
|
"eval_steps_per_second": 0.131, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.3952841797586726, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8167, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7714033722877502, |
|
"eval_runtime": 193.8038, |
|
"eval_samples_per_second": 1.032, |
|
"eval_steps_per_second": 0.129, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.41820009905687616, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8165, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.771486759185791, |
|
"eval_runtime": 194.4791, |
|
"eval_samples_per_second": 1.028, |
|
"eval_steps_per_second": 0.129, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.3852566717747202, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8459, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7710732817649841, |
|
"eval_runtime": 194.3404, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.39909292055831935, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8945, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.7708308696746826, |
|
"eval_runtime": 194.4483, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.3916487629667217, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8029, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7713395953178406, |
|
"eval_runtime": 194.6045, |
|
"eval_samples_per_second": 1.028, |
|
"eval_steps_per_second": 0.128, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.36969072235715195, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7704, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.7713618278503418, |
|
"eval_runtime": 194.3895, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.3853248559868725, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8247, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7703633308410645, |
|
"eval_runtime": 194.0457, |
|
"eval_samples_per_second": 1.031, |
|
"eval_steps_per_second": 0.129, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.38111471762069055, |
|
"learning_rate": 2e-05, |
|
"loss": 0.855, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7690189480781555, |
|
"eval_runtime": 194.3506, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.3701270310997752, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7518, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7675644159317017, |
|
"eval_runtime": 194.3756, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.40489524752286055, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8559, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.766002357006073, |
|
"eval_runtime": 193.9472, |
|
"eval_samples_per_second": 1.031, |
|
"eval_steps_per_second": 0.129, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.39220887464051457, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8629, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7644355893135071, |
|
"eval_runtime": 196.1686, |
|
"eval_samples_per_second": 1.02, |
|
"eval_steps_per_second": 0.127, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.3644925708419195, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7434, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.7628399133682251, |
|
"eval_runtime": 196.1515, |
|
"eval_samples_per_second": 1.02, |
|
"eval_steps_per_second": 0.127, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.407089942317534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8038, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.7609645128250122, |
|
"eval_runtime": 196.8662, |
|
"eval_samples_per_second": 1.016, |
|
"eval_steps_per_second": 0.127, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.38849177572880716, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8106, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7598288059234619, |
|
"eval_runtime": 196.1846, |
|
"eval_samples_per_second": 1.019, |
|
"eval_steps_per_second": 0.127, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.41885563528617265, |
|
"learning_rate": 2e-05, |
|
"loss": 0.808, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.7587143778800964, |
|
"eval_runtime": 195.7296, |
|
"eval_samples_per_second": 1.022, |
|
"eval_steps_per_second": 0.128, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.4003909227323588, |
|
"learning_rate": 2e-05, |
|
"loss": 0.791, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7578326463699341, |
|
"eval_runtime": 195.2831, |
|
"eval_samples_per_second": 1.024, |
|
"eval_steps_per_second": 0.128, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.4014550365826672, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7402, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7573958039283752, |
|
"eval_runtime": 189.5234, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.132, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.4018554316691014, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8165, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.7571737766265869, |
|
"eval_runtime": 190.0146, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.39691385018938347, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7806, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7581367492675781, |
|
"eval_runtime": 190.1851, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.131, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.390373263306042, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7454, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.7590533494949341, |
|
"eval_runtime": 190.1255, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.131, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.45093404603350434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8598, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7584137916564941, |
|
"eval_runtime": 193.5956, |
|
"eval_samples_per_second": 1.033, |
|
"eval_steps_per_second": 0.129, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.4112664411035318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8612, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.757759690284729, |
|
"eval_runtime": 191.7864, |
|
"eval_samples_per_second": 1.043, |
|
"eval_steps_per_second": 0.13, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.4158875890717671, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7916, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.756908655166626, |
|
"eval_runtime": 190.1833, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.131, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.4234644828447959, |
|
"learning_rate": 2e-05, |
|
"loss": 0.798, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.7559736371040344, |
|
"eval_runtime": 190.0515, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.132, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.480506693884699, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7964, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7547717094421387, |
|
"eval_runtime": 189.9579, |
|
"eval_samples_per_second": 1.053, |
|
"eval_steps_per_second": 0.132, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.4540166631930203, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8172, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7538467645645142, |
|
"eval_runtime": 194.2136, |
|
"eval_samples_per_second": 1.03, |
|
"eval_steps_per_second": 0.129, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.46782505650509537, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7826, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.7537503838539124, |
|
"eval_runtime": 194.396, |
|
"eval_samples_per_second": 1.029, |
|
"eval_steps_per_second": 0.129, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.5212465522615498, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7715, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.7527998089790344, |
|
"eval_runtime": 193.1123, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.129, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.4869709286188453, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8007, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.7510444521903992, |
|
"eval_runtime": 193.3613, |
|
"eval_samples_per_second": 1.034, |
|
"eval_steps_per_second": 0.129, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.45218808224844204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7183, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7502281069755554, |
|
"eval_runtime": 192.9756, |
|
"eval_samples_per_second": 1.036, |
|
"eval_steps_per_second": 0.13, |
|
"step": 115 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 128, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 254953078521856.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|