{
  "best_metric": 0.05326759070158005,
  "best_model_checkpoint": "./output/checkpoint-750",
  "epoch": 0.7560483870967742,
  "eval_steps": 150,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010080645161290322,
      "grad_norm": 4.718156814575195,
      "learning_rate": 1.25e-05,
      "loss": 1.4603,
      "step": 10
    },
    {
      "epoch": 0.020161290322580645,
      "grad_norm": 2.660947561264038,
      "learning_rate": 2.5e-05,
      "loss": 0.9242,
      "step": 20
    },
    {
      "epoch": 0.03024193548387097,
      "grad_norm": 1.879041075706482,
      "learning_rate": 3.75e-05,
      "loss": 0.448,
      "step": 30
    },
    {
      "epoch": 0.04032258064516129,
      "grad_norm": 1.869635820388794,
      "learning_rate": 5e-05,
      "loss": 0.3737,
      "step": 40
    },
    {
      "epoch": 0.05040322580645161,
      "grad_norm": 1.2459924221038818,
      "learning_rate": 6.25e-05,
      "loss": 0.2163,
      "step": 50
    },
    {
      "epoch": 0.06048387096774194,
      "grad_norm": 1.4758652448654175,
      "learning_rate": 7.5e-05,
      "loss": 0.2292,
      "step": 60
    },
    {
      "epoch": 0.07056451612903226,
      "grad_norm": 2.0161190032958984,
      "learning_rate": 8.75e-05,
      "loss": 0.1981,
      "step": 70
    },
    {
      "epoch": 0.08064516129032258,
      "grad_norm": 1.2931406497955322,
      "learning_rate": 0.0001,
      "loss": 0.1827,
      "step": 80
    },
    {
      "epoch": 0.0907258064516129,
      "grad_norm": 1.0741955041885376,
      "learning_rate": 0.00011250000000000001,
      "loss": 0.1407,
      "step": 90
    },
    {
      "epoch": 0.10080645161290322,
      "grad_norm": 1.0442911386489868,
      "learning_rate": 0.000125,
      "loss": 0.1546,
      "step": 100
    },
    {
      "epoch": 0.11088709677419355,
      "grad_norm": 1.0597628355026245,
      "learning_rate": 0.00012499871543489787,
      "loss": 0.102,
      "step": 110
    },
    {
      "epoch": 0.12096774193548387,
      "grad_norm": 0.9368671774864197,
      "learning_rate": 0.00012499486179239495,
      "loss": 0.1541,
      "step": 120
    },
    {
      "epoch": 0.1310483870967742,
      "grad_norm": 0.7519365549087524,
      "learning_rate": 0.00012498843923089938,
      "loss": 0.1643,
      "step": 130
    },
    {
      "epoch": 0.14112903225806453,
      "grad_norm": 0.4844861328601837,
      "learning_rate": 0.0001249794480144175,
      "loss": 0.1243,
      "step": 140
    },
    {
      "epoch": 0.15120967741935484,
      "grad_norm": 0.7643828392028809,
      "learning_rate": 0.000124967888512543,
      "loss": 0.1229,
      "step": 150
    },
    {
      "epoch": 0.15120967741935484,
      "eval_loss": 0.08507546782493591,
      "eval_runtime": 57.5316,
      "eval_samples_per_second": 8.691,
      "eval_steps_per_second": 8.691,
      "step": 150
    },
    {
      "epoch": 0.16129032258064516,
      "grad_norm": 0.47043290734291077,
      "learning_rate": 0.00012495376120044173,
      "loss": 0.1681,
      "step": 160
    },
    {
      "epoch": 0.17137096774193547,
      "grad_norm": 0.6022606492042542,
      "learning_rate": 0.00012493706665883217,
      "loss": 0.1328,
      "step": 170
    },
    {
      "epoch": 0.1814516129032258,
      "grad_norm": 0.5551249980926514,
      "learning_rate": 0.00012491780557396154,
      "loss": 0.1344,
      "step": 180
    },
    {
      "epoch": 0.19153225806451613,
      "grad_norm": 0.8580924868583679,
      "learning_rate": 0.00012489597873757756,
      "loss": 0.1418,
      "step": 190
    },
    {
      "epoch": 0.20161290322580644,
      "grad_norm": 0.5060119032859802,
      "learning_rate": 0.00012487158704689602,
      "loss": 0.1207,
      "step": 200
    },
    {
      "epoch": 0.21169354838709678,
      "grad_norm": 0.5322176218032837,
      "learning_rate": 0.0001248446315045638,
      "loss": 0.0723,
      "step": 210
    },
    {
      "epoch": 0.2217741935483871,
      "grad_norm": 1.1198161840438843,
      "learning_rate": 0.00012481511321861763,
      "loss": 0.1267,
      "step": 220
    },
    {
      "epoch": 0.2318548387096774,
      "grad_norm": 0.317910760641098,
      "learning_rate": 0.00012478303340243864,
      "loss": 0.0905,
      "step": 230
    },
    {
      "epoch": 0.24193548387096775,
      "grad_norm": 0.5275834202766418,
      "learning_rate": 0.00012474839337470246,
      "loss": 0.1228,
      "step": 240
    },
    {
      "epoch": 0.25201612903225806,
      "grad_norm": 0.2738340497016907,
      "learning_rate": 0.0001247111945593249,
      "loss": 0.1433,
      "step": 250
    },
    {
      "epoch": 0.2620967741935484,
      "grad_norm": 0.718310534954071,
      "learning_rate": 0.00012467143848540359,
      "loss": 0.1075,
      "step": 260
    },
    {
      "epoch": 0.2721774193548387,
      "grad_norm": 0.18305560946464539,
      "learning_rate": 0.000124629126787155,
      "loss": 0.1053,
      "step": 270
    },
    {
      "epoch": 0.28225806451612906,
      "grad_norm": 0.48630911111831665,
      "learning_rate": 0.00012458426120384738,
      "loss": 0.107,
      "step": 280
    },
    {
      "epoch": 0.2923387096774194,
      "grad_norm": 0.24452829360961914,
      "learning_rate": 0.00012453684357972906,
      "loss": 0.117,
      "step": 290
    },
    {
      "epoch": 0.3024193548387097,
      "grad_norm": 0.7329663634300232,
      "learning_rate": 0.00012448687586395289,
      "loss": 0.0766,
      "step": 300
    },
    {
      "epoch": 0.3024193548387097,
      "eval_loss": 0.061199020594358444,
      "eval_runtime": 60.0112,
      "eval_samples_per_second": 8.332,
      "eval_steps_per_second": 8.332,
      "step": 300
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.9389004111289978,
      "learning_rate": 0.00012443436011049593,
      "loss": 0.129,
      "step": 310
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 0.8787228465080261,
      "learning_rate": 0.0001243792984780751,
      "loss": 0.1333,
      "step": 320
    },
    {
      "epoch": 0.3326612903225806,
      "grad_norm": 0.3159072697162628,
      "learning_rate": 0.00012432169323005853,
      "loss": 0.0931,
      "step": 330
    },
    {
      "epoch": 0.34274193548387094,
      "grad_norm": 0.7588217258453369,
      "learning_rate": 0.00012426154673437223,
      "loss": 0.1053,
      "step": 340
    },
    {
      "epoch": 0.3528225806451613,
      "grad_norm": 1.1616908311843872,
      "learning_rate": 0.00012419886146340314,
      "loss": 0.1468,
      "step": 350
    },
    {
      "epoch": 0.3629032258064516,
      "grad_norm": 0.8137270212173462,
      "learning_rate": 0.0001241336399938972,
      "loss": 0.1196,
      "step": 360
    },
    {
      "epoch": 0.37298387096774194,
      "grad_norm": 0.27941054105758667,
      "learning_rate": 0.00012406588500685355,
      "loss": 0.0915,
      "step": 370
    },
    {
      "epoch": 0.38306451612903225,
      "grad_norm": 0.22469285130500793,
      "learning_rate": 0.00012399559928741435,
      "loss": 0.0607,
      "step": 380
    },
    {
      "epoch": 0.39314516129032256,
      "grad_norm": 0.20622070133686066,
      "learning_rate": 0.00012392278572475023,
      "loss": 0.0657,
      "step": 390
    },
    {
      "epoch": 0.4032258064516129,
      "grad_norm": 0.1868823766708374,
      "learning_rate": 0.0001238474473119416,
      "loss": 0.0873,
      "step": 400
    },
    {
      "epoch": 0.41330645161290325,
      "grad_norm": 0.262215793132782,
      "learning_rate": 0.00012376958714585545,
      "loss": 0.0899,
      "step": 410
    },
    {
      "epoch": 0.42338709677419356,
      "grad_norm": 0.8614699840545654,
      "learning_rate": 0.0001236892084270183,
      "loss": 0.0724,
      "step": 420
    },
    {
      "epoch": 0.4334677419354839,
      "grad_norm": 0.917412281036377,
      "learning_rate": 0.00012360631445948448,
      "loss": 0.1351,
      "step": 430
    },
    {
      "epoch": 0.4435483870967742,
      "grad_norm": 0.8552457094192505,
      "learning_rate": 0.00012352090865070026,
      "loss": 0.1108,
      "step": 440
    },
    {
      "epoch": 0.4536290322580645,
      "grad_norm": 0.6661000847816467,
      "learning_rate": 0.00012343299451136397,
      "loss": 0.0681,
      "step": 450
    },
    {
      "epoch": 0.4536290322580645,
      "eval_loss": 0.06194188818335533,
      "eval_runtime": 57.5815,
      "eval_samples_per_second": 8.683,
      "eval_steps_per_second": 8.683,
      "step": 450
    },
    {
      "epoch": 0.4637096774193548,
      "grad_norm": 0.17224998772144318,
      "learning_rate": 0.00012334257565528155,
      "loss": 0.0752,
      "step": 460
    },
    {
      "epoch": 0.4737903225806452,
      "grad_norm": 0.6695602536201477,
      "learning_rate": 0.000123249655799218,
      "loss": 0.1084,
      "step": 470
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 0.256228506565094,
      "learning_rate": 0.00012315423876274468,
      "loss": 0.0635,
      "step": 480
    },
    {
      "epoch": 0.4939516129032258,
      "grad_norm": 0.25890034437179565,
      "learning_rate": 0.0001230563284680822,
      "loss": 0.0857,
      "step": 490
    },
    {
      "epoch": 0.5040322580645161,
      "grad_norm": 0.20878875255584717,
      "learning_rate": 0.00012295592893993935,
      "loss": 0.0967,
      "step": 500
    },
    {
      "epoch": 0.5141129032258065,
      "grad_norm": 0.23766882717609406,
      "learning_rate": 0.00012285304430534745,
      "loss": 0.1212,
      "step": 510
    },
    {
      "epoch": 0.5241935483870968,
      "grad_norm": 0.18952979147434235,
      "learning_rate": 0.00012274767879349083,
      "loss": 0.0889,
      "step": 520
    },
    {
      "epoch": 0.5342741935483871,
      "grad_norm": 0.4890676736831665,
      "learning_rate": 0.00012263983673553306,
      "loss": 0.09,
      "step": 530
    },
    {
      "epoch": 0.5443548387096774,
      "grad_norm": 0.6612870097160339,
      "learning_rate": 0.0001225295225644387,
      "loss": 0.1209,
      "step": 540
    },
    {
      "epoch": 0.5544354838709677,
      "grad_norm": 0.3861521780490875,
      "learning_rate": 0.0001224167408147913,
      "loss": 0.085,
      "step": 550
    },
    {
      "epoch": 0.5645161290322581,
      "grad_norm": 0.22604888677597046,
      "learning_rate": 0.0001223014961226068,
      "loss": 0.0877,
      "step": 560
    },
    {
      "epoch": 0.5745967741935484,
      "grad_norm": 0.4841513931751251,
      "learning_rate": 0.00012218379322514317,
      "loss": 0.0861,
      "step": 570
    },
    {
      "epoch": 0.5846774193548387,
      "grad_norm": 0.16400082409381866,
      "learning_rate": 0.00012206363696070545,
      "loss": 0.1509,
      "step": 580
    },
    {
      "epoch": 0.594758064516129,
      "grad_norm": 0.14709672331809998,
      "learning_rate": 0.0001219410322684471,
      "loss": 0.0619,
      "step": 590
    },
    {
      "epoch": 0.6048387096774194,
      "grad_norm": 0.17841552197933197,
      "learning_rate": 0.0001218159841881668,
      "loss": 0.0782,
      "step": 600
    },
    {
      "epoch": 0.6048387096774194,
      "eval_loss": 0.053511977195739746,
      "eval_runtime": 58.4751,
      "eval_samples_per_second": 8.551,
      "eval_steps_per_second": 8.551,
      "step": 600
    },
    {
      "epoch": 0.6149193548387096,
      "grad_norm": 0.6630131006240845,
      "learning_rate": 0.00012168849786010133,
      "loss": 0.077,
      "step": 610
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.16494181752204895,
      "learning_rate": 0.00012155857852471433,
      "loss": 0.1101,
      "step": 620
    },
    {
      "epoch": 0.6350806451612904,
      "grad_norm": 0.9052111506462097,
      "learning_rate": 0.0001214262315224808,
      "loss": 0.1151,
      "step": 630
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 0.7651068568229675,
      "learning_rate": 0.00012129146229366766,
      "loss": 0.1093,
      "step": 640
    },
    {
      "epoch": 0.655241935483871,
      "grad_norm": 0.16763581335544586,
      "learning_rate": 0.00012115427637811003,
      "loss": 0.0711,
      "step": 650
    },
    {
      "epoch": 0.6653225806451613,
      "grad_norm": 0.6529646515846252,
      "learning_rate": 0.00012101467941498357,
      "loss": 0.1042,
      "step": 660
    },
    {
      "epoch": 0.6754032258064516,
      "grad_norm": 0.21442104876041412,
      "learning_rate": 0.0001208726771425727,
      "loss": 0.0624,
      "step": 670
    },
    {
      "epoch": 0.6854838709677419,
      "grad_norm": 0.27069932222366333,
      "learning_rate": 0.00012072827539803463,
      "loss": 0.0808,
      "step": 680
    },
    {
      "epoch": 0.6955645161290323,
      "grad_norm": 0.203572615981102,
      "learning_rate": 0.00012058148011715949,
      "loss": 0.0861,
      "step": 690
    },
    {
      "epoch": 0.7056451612903226,
      "grad_norm": 0.18072012066841125,
      "learning_rate": 0.00012043229733412636,
      "loss": 0.053,
      "step": 700
    },
    {
      "epoch": 0.7157258064516129,
      "grad_norm": 0.62689608335495,
      "learning_rate": 0.0001202807331812551,
      "loss": 0.0998,
      "step": 710
    },
    {
      "epoch": 0.7258064516129032,
      "grad_norm": 0.14409402012825012,
      "learning_rate": 0.00012012679388875441,
      "loss": 0.0709,
      "step": 720
    },
    {
      "epoch": 0.7358870967741935,
      "grad_norm": 0.17387409508228302,
      "learning_rate": 0.00011997048578446568,
      "loss": 0.1087,
      "step": 730
    },
    {
      "epoch": 0.7459677419354839,
      "grad_norm": 0.6958820223808289,
      "learning_rate": 0.00011981181529360282,
      "loss": 0.1266,
      "step": 740
    },
    {
      "epoch": 0.7560483870967742,
      "grad_norm": 0.6624193787574768,
      "learning_rate": 0.00011965078893848828,
      "loss": 0.1003,
      "step": 750
    },
    {
      "epoch": 0.7560483870967742,
      "eval_loss": 0.05326759070158005,
      "eval_runtime": 57.7375,
      "eval_samples_per_second": 8.66,
      "eval_steps_per_second": 8.66,
      "step": 750
    }
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 150,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.769083236135731e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}