|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.015641769143410204, |
|
"eval_steps": 334, |
|
"global_step": 501, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.1221096094631145e-05, |
|
"eval_loss": 2.5151805877685547, |
|
"eval_runtime": 12858.7571, |
|
"eval_samples_per_second": 2.214, |
|
"eval_steps_per_second": 2.214, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0003122109609463114, |
|
"grad_norm": 1.6301891803741455, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2603, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0006244219218926228, |
|
"grad_norm": 2.2189297676086426, |
|
"learning_rate": 0.00019994965423831854, |
|
"loss": 0.8201, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0009366328828389343, |
|
"grad_norm": 1.6551645994186401, |
|
"learning_rate": 0.00019979866764718843, |
|
"loss": 0.7913, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0012488438437852456, |
|
"grad_norm": 4.437395095825195, |
|
"learning_rate": 0.00019954719225730847, |
|
"loss": 1.2831, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001561054804731557, |
|
"grad_norm": 7.653807163238525, |
|
"learning_rate": 0.00019919548128307954, |
|
"loss": 1.6268, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0018732657656778686, |
|
"grad_norm": 1.2926839590072632, |
|
"learning_rate": 0.00019874388886763944, |
|
"loss": 1.1779, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00218547672662418, |
|
"grad_norm": 1.5756969451904297, |
|
"learning_rate": 0.00019819286972627066, |
|
"loss": 0.6204, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.002497687687570491, |
|
"grad_norm": 1.8183478116989136, |
|
"learning_rate": 0.00019754297868854073, |
|
"loss": 0.8666, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0028098986485168027, |
|
"grad_norm": 4.023663520812988, |
|
"learning_rate": 0.00019679487013963564, |
|
"loss": 1.1012, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.003122109609463114, |
|
"grad_norm": 3.675027847290039, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 1.3794, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0034343205704094257, |
|
"grad_norm": 1.1626209020614624, |
|
"learning_rate": 0.00019500711177409454, |
|
"loss": 1.1787, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.003746531531355737, |
|
"grad_norm": 1.4152737855911255, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.471, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.004058742492302049, |
|
"grad_norm": 2.0098798274993896, |
|
"learning_rate": 0.00019283679330160726, |
|
"loss": 0.9037, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00437095345324836, |
|
"grad_norm": 2.845428943634033, |
|
"learning_rate": 0.00019161084574320696, |
|
"loss": 1.2114, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.004683164414194672, |
|
"grad_norm": 5.3034749031066895, |
|
"learning_rate": 0.00019029265382866214, |
|
"loss": 1.397, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004995375375140982, |
|
"grad_norm": 1.5945558547973633, |
|
"learning_rate": 0.00018888354486549237, |
|
"loss": 1.145, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.005307586336087294, |
|
"grad_norm": 1.1813420057296753, |
|
"learning_rate": 0.00018738493770697852, |
|
"loss": 0.5603, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.005619797297033605, |
|
"grad_norm": 2.35960054397583, |
|
"learning_rate": 0.00018579834132349772, |
|
"loss": 0.8248, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.005932008257979917, |
|
"grad_norm": 2.443915605545044, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 1.1141, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.006244219218926228, |
|
"grad_norm": 5.689703941345215, |
|
"learning_rate": 0.0001823676581429833, |
|
"loss": 1.2009, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00655643017987254, |
|
"grad_norm": 1.4314906597137451, |
|
"learning_rate": 0.00018052702575310588, |
|
"loss": 1.1061, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.006868641140818851, |
|
"grad_norm": 0.6448104977607727, |
|
"learning_rate": 0.00017860530947427875, |
|
"loss": 0.4016, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.007180852101765163, |
|
"grad_norm": 2.0396196842193604, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.8709, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.007493063062711474, |
|
"grad_norm": 2.5875227451324463, |
|
"learning_rate": 0.0001745264449675755, |
|
"loss": 1.1121, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.007805274023657786, |
|
"grad_norm": 3.9609525203704834, |
|
"learning_rate": 0.00017237340381050703, |
|
"loss": 1.251, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.008117484984604097, |
|
"grad_norm": 1.2032607793807983, |
|
"learning_rate": 0.00017014748877063214, |
|
"loss": 1.1823, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.008429695945550408, |
|
"grad_norm": 1.186848521232605, |
|
"learning_rate": 0.00016785094115571322, |
|
"loss": 0.6219, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.00874190690649672, |
|
"grad_norm": 1.85453462600708, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 0.5809, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.009054117867443031, |
|
"grad_norm": 2.0443332195281982, |
|
"learning_rate": 0.00016305526670845226, |
|
"loss": 1.2146, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.009366328828389343, |
|
"grad_norm": 7.1448516845703125, |
|
"learning_rate": 0.00016056096871376667, |
|
"loss": 1.2524, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.009678539789335654, |
|
"grad_norm": 1.334848165512085, |
|
"learning_rate": 0.00015800569095711982, |
|
"loss": 1.1966, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.009990750750281965, |
|
"grad_norm": 0.4558267295360565, |
|
"learning_rate": 0.00015539200638661104, |
|
"loss": 0.589, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.010302961711228277, |
|
"grad_norm": 1.8344190120697021, |
|
"learning_rate": 0.00015272254676105025, |
|
"loss": 0.5806, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.010427846095606801, |
|
"eval_loss": 0.9566133618354797, |
|
"eval_runtime": 13592.0734, |
|
"eval_samples_per_second": 2.095, |
|
"eval_steps_per_second": 2.095, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.010615172672174588, |
|
"grad_norm": 1.9632649421691895, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.0551, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0109273836331209, |
|
"grad_norm": 4.136826992034912, |
|
"learning_rate": 0.0001472271074772683, |
|
"loss": 1.0784, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01123959459406721, |
|
"grad_norm": 1.1779104471206665, |
|
"learning_rate": 0.00014440666126057744, |
|
"loss": 1.1613, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.011551805555013523, |
|
"grad_norm": 0.8325644731521606, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 0.5119, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.011864016515959834, |
|
"grad_norm": 1.6711801290512085, |
|
"learning_rate": 0.00013863451256931287, |
|
"loss": 0.6156, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.012176227476906146, |
|
"grad_norm": 2.293975353240967, |
|
"learning_rate": 0.00013568862215918717, |
|
"loss": 1.0706, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.012488438437852457, |
|
"grad_norm": 2.2785656452178955, |
|
"learning_rate": 0.00013270679633174218, |
|
"loss": 1.2872, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.012800649398798769, |
|
"grad_norm": 1.2502048015594482, |
|
"learning_rate": 0.0001296920375328275, |
|
"loss": 1.0768, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.01311286035974508, |
|
"grad_norm": 0.7812928557395935, |
|
"learning_rate": 0.00012664738136900348, |
|
"loss": 0.5199, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.01342507132069139, |
|
"grad_norm": 2.0176918506622314, |
|
"learning_rate": 0.00012357589355094275, |
|
"loss": 0.8125, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.013737282281637703, |
|
"grad_norm": 2.014697313308716, |
|
"learning_rate": 0.00012048066680651908, |
|
"loss": 1.0261, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.014049493242584013, |
|
"grad_norm": 3.0161404609680176, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.1352, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.014361704203530326, |
|
"grad_norm": 1.1186920404434204, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 1.1374, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.014673915164476636, |
|
"grad_norm": 0.9820886850357056, |
|
"learning_rate": 0.00011108381999010111, |
|
"loss": 0.5135, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.014986126125422949, |
|
"grad_norm": 2.8473262786865234, |
|
"learning_rate": 0.00010792499568567884, |
|
"loss": 0.8812, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.01529833708636926, |
|
"grad_norm": 2.1481053829193115, |
|
"learning_rate": 0.00010475819158237425, |
|
"loss": 1.0178, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.015610548047315572, |
|
"grad_norm": 1.20015287399292, |
|
"learning_rate": 0.00010158659638348081, |
|
"loss": 1.0468, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 167, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.824081922120745e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|