|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9100676482292083, |
|
"eval_steps": 200, |
|
"global_step": 4800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03979307600477517, |
|
"grad_norm": 4.196262359619141, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7988, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07958615200955034, |
|
"grad_norm": 1.3701090812683105, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6726, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07958615200955034, |
|
"eval_accuracy": 0.571913002806361, |
|
"eval_loss": 0.6743200421333313, |
|
"eval_runtime": 70.7503, |
|
"eval_samples_per_second": 120.876, |
|
"eval_steps_per_second": 1.894, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1193792280143255, |
|
"grad_norm": 1.1724385023117065, |
|
"learning_rate": 4.994704807509721e-05, |
|
"loss": 0.6713, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15917230401910068, |
|
"grad_norm": 2.7935872077941895, |
|
"learning_rate": 4.9788416612896896e-05, |
|
"loss": 0.665, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15917230401910068, |
|
"eval_accuracy": 0.5989242282507016, |
|
"eval_loss": 0.6570606231689453, |
|
"eval_runtime": 70.7356, |
|
"eval_samples_per_second": 120.901, |
|
"eval_steps_per_second": 1.894, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19896538002387584, |
|
"grad_norm": 2.3964438438415527, |
|
"learning_rate": 4.952477760070096e-05, |
|
"loss": 0.6614, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.238758456028651, |
|
"grad_norm": 2.5537328720092773, |
|
"learning_rate": 4.9157247853963424e-05, |
|
"loss": 0.6644, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.238758456028651, |
|
"eval_accuracy": 0.5892188961646398, |
|
"eval_loss": 0.6891445517539978, |
|
"eval_runtime": 70.7814, |
|
"eval_samples_per_second": 120.823, |
|
"eval_steps_per_second": 1.893, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2785515320334262, |
|
"grad_norm": 6.364818572998047, |
|
"learning_rate": 4.8687384285288185e-05, |
|
"loss": 0.6698, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.31834460803820136, |
|
"grad_norm": 5.387712478637695, |
|
"learning_rate": 4.811717730910749e-05, |
|
"loss": 0.6665, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.31834460803820136, |
|
"eval_accuracy": 0.5938961646398503, |
|
"eval_loss": 0.66575688123703, |
|
"eval_runtime": 70.8533, |
|
"eval_samples_per_second": 120.7, |
|
"eval_steps_per_second": 1.891, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3581376840429765, |
|
"grad_norm": 2.7228779792785645, |
|
"learning_rate": 4.744904240997987e-05, |
|
"loss": 0.674, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3979307600477517, |
|
"grad_norm": 5.516107082366943, |
|
"learning_rate": 4.668580991022563e-05, |
|
"loss": 0.6545, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3979307600477517, |
|
"eval_accuracy": 0.5989242282507016, |
|
"eval_loss": 0.663463294506073, |
|
"eval_runtime": 70.5961, |
|
"eval_samples_per_second": 121.14, |
|
"eval_steps_per_second": 1.898, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.43772383605252685, |
|
"grad_norm": 1.8297033309936523, |
|
"learning_rate": 4.5830712980245576e-05, |
|
"loss": 0.6549, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.477516912057302, |
|
"grad_norm": 1.0240168571472168, |
|
"learning_rate": 4.48873739423134e-05, |
|
"loss": 0.6797, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.477516912057302, |
|
"eval_accuracy": 0.5445509822263798, |
|
"eval_loss": 0.688525915145874, |
|
"eval_runtime": 70.7499, |
|
"eval_samples_per_second": 120.876, |
|
"eval_steps_per_second": 1.894, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5173099880620772, |
|
"grad_norm": 1.1778732538223267, |
|
"learning_rate": 4.385978892586064e-05, |
|
"loss": 0.6805, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5571030640668524, |
|
"grad_norm": 1.2296079397201538, |
|
"learning_rate": 4.275231093925711e-05, |
|
"loss": 0.6722, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5571030640668524, |
|
"eval_accuracy": 0.5915575304022451, |
|
"eval_loss": 0.6639552116394043, |
|
"eval_runtime": 71.55, |
|
"eval_samples_per_second": 119.525, |
|
"eval_steps_per_second": 1.873, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5968961400716275, |
|
"grad_norm": 5.959015369415283, |
|
"learning_rate": 4.1569631429797054e-05, |
|
"loss": 0.6624, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6366892160764027, |
|
"grad_norm": 4.830468654632568, |
|
"learning_rate": 4.031676041000599e-05, |
|
"loss": 0.6536, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6366892160764027, |
|
"eval_accuracy": 0.6013797942001871, |
|
"eval_loss": 0.6594148278236389, |
|
"eval_runtime": 71.4819, |
|
"eval_samples_per_second": 119.639, |
|
"eval_steps_per_second": 1.875, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6764822920811778, |
|
"grad_norm": 5.347845077514648, |
|
"learning_rate": 3.899900523445614e-05, |
|
"loss": 0.6496, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.716275368085953, |
|
"grad_norm": 2.6406002044677734, |
|
"learning_rate": 3.76219481169952e-05, |
|
"loss": 0.6514, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.716275368085953, |
|
"eval_accuracy": 0.6093311506080449, |
|
"eval_loss": 0.6483238935470581, |
|
"eval_runtime": 71.7594, |
|
"eval_samples_per_second": 119.176, |
|
"eval_steps_per_second": 1.867, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7560684440907283, |
|
"grad_norm": 2.9834582805633545, |
|
"learning_rate": 3.61914224836288e-05, |
|
"loss": 0.6433, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7958615200955034, |
|
"grad_norm": 1.781412124633789, |
|
"learning_rate": 3.4713488261229724e-05, |
|
"loss": 0.6481, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7958615200955034, |
|
"eval_accuracy": 0.6115528531337698, |
|
"eval_loss": 0.6509259939193726, |
|
"eval_runtime": 71.5666, |
|
"eval_samples_per_second": 119.497, |
|
"eval_steps_per_second": 1.872, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8356545961002786, |
|
"grad_norm": 2.0181474685668945, |
|
"learning_rate": 3.319440620675442e-05, |
|
"loss": 0.6479, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8754476721050537, |
|
"grad_norm": 1.780383586883545, |
|
"learning_rate": 3.164061138571247e-05, |
|
"loss": 0.6455, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8754476721050537, |
|
"eval_accuracy": 0.6241814780168382, |
|
"eval_loss": 0.6406447887420654, |
|
"eval_runtime": 71.0764, |
|
"eval_samples_per_second": 120.321, |
|
"eval_steps_per_second": 1.885, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9152407481098289, |
|
"grad_norm": 1.8478803634643555, |
|
"learning_rate": 3.005868591223814e-05, |
|
"loss": 0.6414, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.955033824114604, |
|
"grad_norm": 2.130547046661377, |
|
"learning_rate": 2.8455331066241263e-05, |
|
"loss": 0.6345, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.955033824114604, |
|
"eval_accuracy": 0.6162301216089804, |
|
"eval_loss": 0.6427032351493835, |
|
"eval_runtime": 71.4565, |
|
"eval_samples_per_second": 119.681, |
|
"eval_steps_per_second": 1.875, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9948269001193792, |
|
"grad_norm": 1.3212426900863647, |
|
"learning_rate": 2.6837338905753685e-05, |
|
"loss": 0.6455, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0346199761241544, |
|
"grad_norm": 2.934187412261963, |
|
"learning_rate": 2.5211563494725422e-05, |
|
"loss": 0.6022, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.0346199761241544, |
|
"eval_accuracy": 0.6224275023386342, |
|
"eval_loss": 0.6543847322463989, |
|
"eval_runtime": 71.1065, |
|
"eval_samples_per_second": 120.27, |
|
"eval_steps_per_second": 1.884, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.0744130521289295, |
|
"grad_norm": 2.43818736076355, |
|
"learning_rate": 2.3584891868154375e-05, |
|
"loss": 0.5924, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.1142061281337048, |
|
"grad_norm": 3.3583297729492188, |
|
"learning_rate": 2.196421485754547e-05, |
|
"loss": 0.5914, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.1142061281337048, |
|
"eval_accuracy": 0.6216089803554724, |
|
"eval_loss": 0.653032124042511, |
|
"eval_runtime": 71.6366, |
|
"eval_samples_per_second": 119.38, |
|
"eval_steps_per_second": 1.871, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.15399920413848, |
|
"grad_norm": 3.130229949951172, |
|
"learning_rate": 2.0356397900287307e-05, |
|
"loss": 0.5859, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.193792280143255, |
|
"grad_norm": 3.7583000659942627, |
|
"learning_rate": 1.8768251956602138e-05, |
|
"loss": 0.5798, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.193792280143255, |
|
"eval_accuracy": 0.6282740879326474, |
|
"eval_loss": 0.6515339612960815, |
|
"eval_runtime": 71.5158, |
|
"eval_samples_per_second": 119.582, |
|
"eval_steps_per_second": 1.874, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.2335853561480303, |
|
"grad_norm": 3.0673882961273193, |
|
"learning_rate": 1.7206504657269547e-05, |
|
"loss": 0.5836, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.2733784321528054, |
|
"grad_norm": 3.4926602840423584, |
|
"learning_rate": 1.5677771804346432e-05, |
|
"loss": 0.5757, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.2733784321528054, |
|
"eval_accuracy": 0.6352899906454631, |
|
"eval_loss": 0.6421129107475281, |
|
"eval_runtime": 71.3733, |
|
"eval_samples_per_second": 119.821, |
|
"eval_steps_per_second": 1.877, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.3131715081575805, |
|
"grad_norm": 4.403800964355469, |
|
"learning_rate": 1.4188529345610756e-05, |
|
"loss": 0.5776, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.3529645841623559, |
|
"grad_norm": 2.677273750305176, |
|
"learning_rate": 1.2745085941449484e-05, |
|
"loss": 0.5843, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.3529645841623559, |
|
"eval_accuracy": 0.6383302151543498, |
|
"eval_loss": 0.6391754150390625, |
|
"eval_runtime": 71.8797, |
|
"eval_samples_per_second": 118.977, |
|
"eval_steps_per_second": 1.864, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.392757660167131, |
|
"grad_norm": 2.8799660205841064, |
|
"learning_rate": 1.1353556240401692e-05, |
|
"loss": 0.5758, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.432550736171906, |
|
"grad_norm": 3.6387939453125, |
|
"learning_rate": 1.0019834976565779e-05, |
|
"loss": 0.5758, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.432550736171906, |
|
"eval_accuracy": 0.6308465855940131, |
|
"eval_loss": 0.642611563205719, |
|
"eval_runtime": 71.6295, |
|
"eval_samples_per_second": 119.392, |
|
"eval_steps_per_second": 1.871, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.4723438121766812, |
|
"grad_norm": 3.1025497913360596, |
|
"learning_rate": 8.749571998598052e-06, |
|
"loss": 0.5826, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.5121368881814563, |
|
"grad_norm": 3.0935654640197754, |
|
"learning_rate": 7.548148336083807e-06, |
|
"loss": 0.5739, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.5121368881814563, |
|
"eval_accuracy": 0.6361085126286249, |
|
"eval_loss": 0.6375930309295654, |
|
"eval_runtime": 71.3926, |
|
"eval_samples_per_second": 119.788, |
|
"eval_steps_per_second": 1.877, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.5519299641862316, |
|
"grad_norm": 2.953458786010742, |
|
"learning_rate": 6.420653404667356e-06, |
|
"loss": 0.5688, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.5917230401910067, |
|
"grad_norm": 3.1475229263305664, |
|
"learning_rate": 5.371863446503628e-06, |
|
"loss": 0.5717, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.5917230401910067, |
|
"eval_accuracy": 0.6387979420018709, |
|
"eval_loss": 0.6375735402107239, |
|
"eval_runtime": 71.6947, |
|
"eval_samples_per_second": 119.284, |
|
"eval_steps_per_second": 1.869, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.6315161161957819, |
|
"grad_norm": 3.241312265396118, |
|
"learning_rate": 4.406221297360902e-06, |
|
"loss": 0.564, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.6713091922005572, |
|
"grad_norm": 3.462087392807007, |
|
"learning_rate": 3.5278175660843267e-06, |
|
"loss": 0.5679, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.6713091922005572, |
|
"eval_accuracy": 0.6421889616463985, |
|
"eval_loss": 0.6346476674079895, |
|
"eval_runtime": 71.4517, |
|
"eval_samples_per_second": 119.689, |
|
"eval_steps_per_second": 1.875, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.7111022682053323, |
|
"grad_norm": 3.355526924133301, |
|
"learning_rate": 2.7403733061469454e-06, |
|
"loss": 0.5736, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.7508953442101074, |
|
"grad_norm": 4.239299297332764, |
|
"learning_rate": 2.047224252694127e-06, |
|
"loss": 0.5525, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.7508953442101074, |
|
"eval_accuracy": 0.6434752104770813, |
|
"eval_loss": 0.638582706451416, |
|
"eval_runtime": 71.4737, |
|
"eval_samples_per_second": 119.652, |
|
"eval_steps_per_second": 1.875, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.7906884202148827, |
|
"grad_norm": 4.165896415710449, |
|
"learning_rate": 1.4513066918558722e-06, |
|
"loss": 0.5634, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.8304814962196578, |
|
"grad_norm": 3.2473928928375244, |
|
"learning_rate": 9.551450221865599e-07, |
|
"loss": 0.5702, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.8304814962196578, |
|
"eval_accuracy": 0.6438260056127222, |
|
"eval_loss": 0.6338381767272949, |
|
"eval_runtime": 71.8808, |
|
"eval_samples_per_second": 118.975, |
|
"eval_steps_per_second": 1.864, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.870274572224433, |
|
"grad_norm": 3.846665620803833, |
|
"learning_rate": 5.608410609239501e-07, |
|
"loss": 0.5714, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.9100676482292083, |
|
"grad_norm": 3.0735666751861572, |
|
"learning_rate": 2.7006514036768957e-07, |
|
"loss": 0.5652, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.9100676482292083, |
|
"eval_accuracy": 0.6452291861552854, |
|
"eval_loss": 0.6332165002822876, |
|
"eval_runtime": 71.8255, |
|
"eval_samples_per_second": 119.066, |
|
"eval_steps_per_second": 1.866, |
|
"step": 4800 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5026, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|