{
  "best_global_step": 4500,
  "best_metric": 4.731945037841797,
  "best_model_checkpoint": "checkpoints/checkpoint-4500",
  "epoch": 4.999113362887904,
  "eval_steps": 500,
  "global_step": 4930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1013299556681444,
      "grad_norm": 4.671788692474365,
      "learning_rate": 9.600000000000001e-06,
      "loss": 9.818,
      "step": 100
    },
    {
      "epoch": 0.2026599113362888,
      "grad_norm": 2.0833730697631836,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 6.0125,
      "step": 200
    },
    {
      "epoch": 0.3039898670044332,
      "grad_norm": 2.2255606651306152,
      "learning_rate": 2.96e-05,
      "loss": 5.5478,
      "step": 300
    },
    {
      "epoch": 0.4053198226725776,
      "grad_norm": 2.375220537185669,
      "learning_rate": 3.960000000000001e-05,
      "loss": 5.3821,
      "step": 400
    },
    {
      "epoch": 0.506649778340722,
      "grad_norm": 2.1432723999023438,
      "learning_rate": 4.96e-05,
      "loss": 5.2798,
      "step": 500
    },
    {
      "epoch": 0.506649778340722,
      "eval_loss": 5.099383354187012,
      "eval_runtime": 61.4753,
      "eval_samples_per_second": 57.08,
      "eval_steps_per_second": 14.282,
      "step": 500
    },
    {
      "epoch": 0.6079797340088664,
      "grad_norm": 2.5531654357910156,
      "learning_rate": 4.891647855530474e-05,
      "loss": 5.2792,
      "step": 600
    },
    {
      "epoch": 0.7093096896770108,
      "grad_norm": 1.8546665906906128,
      "learning_rate": 4.7787810383747176e-05,
      "loss": 5.1646,
      "step": 700
    },
    {
      "epoch": 0.8106396453451552,
      "grad_norm": 2.103972911834717,
      "learning_rate": 4.665914221218962e-05,
      "loss": 5.1678,
      "step": 800
    },
    {
      "epoch": 0.9119696010132996,
      "grad_norm": 2.0656638145446777,
      "learning_rate": 4.553047404063205e-05,
      "loss": 5.1145,
      "step": 900
    },
    {
      "epoch": 1.0141861937935401,
      "grad_norm": 2.003814697265625,
      "learning_rate": 4.440180586907449e-05,
      "loss": 5.1537,
      "step": 1000
    },
    {
      "epoch": 1.0141861937935401,
      "eval_loss": 4.938778877258301,
      "eval_runtime": 61.4905,
      "eval_samples_per_second": 57.066,
      "eval_steps_per_second": 14.279,
      "step": 1000
    },
    {
      "epoch": 1.1155161494616845,
      "grad_norm": 2.0927860736846924,
      "learning_rate": 4.327313769751693e-05,
      "loss": 5.0454,
      "step": 1100
    },
    {
      "epoch": 1.216846105129829,
      "grad_norm": 2.080012321472168,
      "learning_rate": 4.214446952595937e-05,
      "loss": 5.0665,
      "step": 1200
    },
    {
      "epoch": 1.3181760607979733,
      "grad_norm": 1.8747535943984985,
      "learning_rate": 4.101580135440181e-05,
      "loss": 4.9706,
      "step": 1300
    },
    {
      "epoch": 1.4195060164661177,
      "grad_norm": 1.901370882987976,
      "learning_rate": 3.988713318284424e-05,
      "loss": 4.9639,
      "step": 1400
    },
    {
      "epoch": 1.5208359721342621,
      "grad_norm": 1.9110698699951172,
      "learning_rate": 3.875846501128668e-05,
      "loss": 4.9622,
      "step": 1500
    },
    {
      "epoch": 1.5208359721342621,
      "eval_loss": 4.870736598968506,
      "eval_runtime": 61.3393,
      "eval_samples_per_second": 57.206,
      "eval_steps_per_second": 14.314,
      "step": 1500
    },
    {
      "epoch": 1.6221659278024065,
      "grad_norm": 1.8562686443328857,
      "learning_rate": 3.762979683972912e-05,
      "loss": 4.9633,
      "step": 1600
    },
    {
      "epoch": 1.723495883470551,
      "grad_norm": 1.970841884613037,
      "learning_rate": 3.650112866817156e-05,
      "loss": 4.9279,
      "step": 1700
    },
    {
      "epoch": 1.8248258391386953,
      "grad_norm": 1.9571095705032349,
      "learning_rate": 3.5372460496614e-05,
      "loss": 4.952,
      "step": 1800
    },
    {
      "epoch": 1.9261557948068397,
      "grad_norm": 2.0035080909729004,
      "learning_rate": 3.424379232505643e-05,
      "loss": 4.9448,
      "step": 1900
    },
    {
      "epoch": 2.0283723875870803,
      "grad_norm": 1.978408694267273,
      "learning_rate": 3.3115124153498873e-05,
      "loss": 4.9862,
      "step": 2000
    },
    {
      "epoch": 2.0283723875870803,
      "eval_loss": 4.826657295227051,
      "eval_runtime": 61.364,
      "eval_samples_per_second": 57.183,
      "eval_steps_per_second": 14.308,
      "step": 2000
    },
    {
      "epoch": 2.1297023432552247,
      "grad_norm": 2.2265381813049316,
      "learning_rate": 3.198645598194131e-05,
      "loss": 4.8837,
      "step": 2100
    },
    {
      "epoch": 2.231032298923369,
      "grad_norm": 1.8263903856277466,
      "learning_rate": 3.085778781038375e-05,
      "loss": 4.893,
      "step": 2200
    },
    {
      "epoch": 2.3323622545915135,
      "grad_norm": 1.8423362970352173,
      "learning_rate": 2.9729119638826186e-05,
      "loss": 4.9038,
      "step": 2300
    },
    {
      "epoch": 2.433692210259658,
      "grad_norm": 1.9064007997512817,
      "learning_rate": 2.8600451467268623e-05,
      "loss": 4.8971,
      "step": 2400
    },
    {
      "epoch": 2.5350221659278023,
      "grad_norm": 1.8199445009231567,
      "learning_rate": 2.747178329571106e-05,
      "loss": 4.8563,
      "step": 2500
    },
    {
      "epoch": 2.5350221659278023,
      "eval_loss": 4.789968013763428,
      "eval_runtime": 61.2645,
      "eval_samples_per_second": 57.276,
      "eval_steps_per_second": 14.331,
      "step": 2500
    },
    {
      "epoch": 2.6363521215959467,
      "grad_norm": 2.2458302974700928,
      "learning_rate": 2.63431151241535e-05,
      "loss": 4.8214,
      "step": 2600
    },
    {
      "epoch": 2.737682077264091,
      "grad_norm": 1.9292908906936646,
      "learning_rate": 2.521444695259594e-05,
      "loss": 4.8538,
      "step": 2700
    },
    {
      "epoch": 2.8390120329322355,
      "grad_norm": 2.03075909614563,
      "learning_rate": 2.4085778781038376e-05,
      "loss": 4.8307,
      "step": 2800
    },
    {
      "epoch": 2.94034198860038,
      "grad_norm": 1.909643530845642,
      "learning_rate": 2.2957110609480814e-05,
      "loss": 4.8282,
      "step": 2900
    },
    {
      "epoch": 3.0425585813806206,
      "grad_norm": 1.8887925148010254,
      "learning_rate": 2.182844243792325e-05,
      "loss": 4.8738,
      "step": 3000
    },
    {
      "epoch": 3.0425585813806206,
      "eval_loss": 4.765661716461182,
      "eval_runtime": 61.3376,
      "eval_samples_per_second": 57.208,
      "eval_steps_per_second": 14.314,
      "step": 3000
    },
    {
      "epoch": 3.143888537048765,
      "grad_norm": 1.8953306674957275,
      "learning_rate": 2.069977426636569e-05,
      "loss": 4.7451,
      "step": 3100
    },
    {
      "epoch": 3.2452184927169094,
      "grad_norm": 2.1468937397003174,
      "learning_rate": 1.957110609480813e-05,
      "loss": 4.7813,
      "step": 3200
    },
    {
      "epoch": 3.346548448385054,
      "grad_norm": 1.9347341060638428,
      "learning_rate": 1.8442437923250567e-05,
      "loss": 4.792,
      "step": 3300
    },
    {
      "epoch": 3.4478784040531982,
      "grad_norm": 1.8998669385910034,
      "learning_rate": 1.7313769751693004e-05,
      "loss": 4.7867,
      "step": 3400
    },
    {
      "epoch": 3.5492083597213426,
      "grad_norm": 1.899141788482666,
      "learning_rate": 1.6185101580135442e-05,
      "loss": 4.8095,
      "step": 3500
    },
    {
      "epoch": 3.5492083597213426,
      "eval_loss": 4.752286434173584,
      "eval_runtime": 61.3687,
      "eval_samples_per_second": 57.179,
      "eval_steps_per_second": 14.307,
      "step": 3500
    },
    {
      "epoch": 3.650538315389487,
      "grad_norm": 1.9524105787277222,
      "learning_rate": 1.5056433408577881e-05,
      "loss": 4.8354,
      "step": 3600
    },
    {
      "epoch": 3.7518682710576314,
      "grad_norm": 2.0022027492523193,
      "learning_rate": 1.3927765237020315e-05,
      "loss": 4.8232,
      "step": 3700
    },
    {
      "epoch": 3.853198226725776,
      "grad_norm": 1.9039005041122437,
      "learning_rate": 1.2799097065462754e-05,
      "loss": 4.7974,
      "step": 3800
    },
    {
      "epoch": 3.9545281823939202,
      "grad_norm": 1.902718186378479,
      "learning_rate": 1.1670428893905193e-05,
      "loss": 4.8301,
      "step": 3900
    },
    {
      "epoch": 4.0567447751741605,
      "grad_norm": 2.1200180053710938,
      "learning_rate": 1.054176072234763e-05,
      "loss": 4.8426,
      "step": 4000
    },
    {
      "epoch": 4.0567447751741605,
      "eval_loss": 4.738107681274414,
      "eval_runtime": 61.3616,
      "eval_samples_per_second": 57.186,
      "eval_steps_per_second": 14.309,
      "step": 4000
    },
    {
      "epoch": 4.158074730842305,
      "grad_norm": 2.008924961090088,
      "learning_rate": 9.413092550790068e-06,
      "loss": 4.7473,
      "step": 4100
    },
    {
      "epoch": 4.259404686510449,
      "grad_norm": 1.881294846534729,
      "learning_rate": 8.284424379232506e-06,
      "loss": 4.7348,
      "step": 4200
    },
    {
      "epoch": 4.360734642178594,
      "grad_norm": 1.8294726610183716,
      "learning_rate": 7.155756207674943e-06,
      "loss": 4.7728,
      "step": 4300
    },
    {
      "epoch": 4.462064597846738,
      "grad_norm": 1.9369500875473022,
      "learning_rate": 6.0270880361173815e-06,
      "loss": 4.7811,
      "step": 4400
    },
    {
      "epoch": 4.5633945535148825,
      "grad_norm": 1.9159400463104248,
      "learning_rate": 4.89841986455982e-06,
      "loss": 4.7478,
      "step": 4500
    },
    {
      "epoch": 4.5633945535148825,
      "eval_loss": 4.731945037841797,
      "eval_runtime": 61.3768,
      "eval_samples_per_second": 57.171,
      "eval_steps_per_second": 14.305,
      "step": 4500
    },
    {
      "epoch": 4.664724509183027,
      "grad_norm": 1.9592151641845703,
      "learning_rate": 3.7697516930022577e-06,
      "loss": 4.7561,
      "step": 4600
    },
    {
      "epoch": 4.766054464851171,
      "grad_norm": 1.8825119733810425,
      "learning_rate": 2.6410835214446955e-06,
      "loss": 4.8118,
      "step": 4700
    },
    {
      "epoch": 4.867384420519316,
      "grad_norm": 2.025451898574829,
      "learning_rate": 1.5124153498871334e-06,
      "loss": 4.7582,
      "step": 4800
    },
    {
      "epoch": 4.96871437618746,
      "grad_norm": 2.0353872776031494,
      "learning_rate": 3.837471783295711e-07,
      "loss": 4.7274,
      "step": 4900
    }
  ],
  "logging_steps": 100,
  "max_steps": 4930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.1592623788544e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}