|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.662921348314606, |
|
"eval_steps": 500, |
|
"global_step": 593, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11235955056179775, |
|
"grad_norm": 3.5541882514953613, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.9261, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2247191011235955, |
|
"grad_norm": 1.3109055757522583, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.3754, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.33707865168539325, |
|
"grad_norm": 1.4723390340805054, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2649, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 0.7010077238082886, |
|
"learning_rate": 0.000199844353174683, |
|
"loss": 0.2166, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5617977528089888, |
|
"grad_norm": 1.1943029165267944, |
|
"learning_rate": 0.00019937789721741653, |
|
"loss": 0.1767, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6741573033707865, |
|
"grad_norm": 0.5831332206726074, |
|
"learning_rate": 0.00019860208417597864, |
|
"loss": 0.1481, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7865168539325843, |
|
"grad_norm": 1.139088749885559, |
|
"learning_rate": 0.00019751932910710805, |
|
"loss": 0.1318, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 1.130523681640625, |
|
"learning_rate": 0.00019613300255858616, |
|
"loss": 0.1277, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0112359550561798, |
|
"grad_norm": 0.8881850242614746, |
|
"learning_rate": 0.0001944474200769355, |
|
"loss": 0.1202, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1235955056179776, |
|
"grad_norm": 0.6445964574813843, |
|
"learning_rate": 0.00019246782877339766, |
|
"loss": 0.101, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.2359550561797752, |
|
"grad_norm": 0.45679864287376404, |
|
"learning_rate": 0.00019020039099000907, |
|
"loss": 0.1078, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.348314606741573, |
|
"grad_norm": 0.5455164909362793, |
|
"learning_rate": 0.0001876521651166215, |
|
"loss": 0.0931, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4606741573033708, |
|
"grad_norm": 0.5435442924499512, |
|
"learning_rate": 0.00018483108361858262, |
|
"loss": 0.0866, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.5730337078651684, |
|
"grad_norm": 0.4356015622615814, |
|
"learning_rate": 0.00018174592834347504, |
|
"loss": 0.0899, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.6853932584269664, |
|
"grad_norm": 0.6584963798522949, |
|
"learning_rate": 0.00017840630318378232, |
|
"loss": 0.0905, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.797752808988764, |
|
"grad_norm": 0.500027060508728, |
|
"learning_rate": 0.00017482260418058164, |
|
"loss": 0.091, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9101123595505618, |
|
"grad_norm": 0.720208466053009, |
|
"learning_rate": 0.00017100598716132773, |
|
"loss": 0.0805, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0224719101123596, |
|
"grad_norm": 0.7830264568328857, |
|
"learning_rate": 0.0001669683330124706, |
|
"loss": 0.0779, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.134831460674157, |
|
"grad_norm": 0.6720086932182312, |
|
"learning_rate": 0.0001627222106950102, |
|
"loss": 0.0731, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.247191011235955, |
|
"grad_norm": 0.4705169200897217, |
|
"learning_rate": 0.0001582808381181189, |
|
"loss": 0.0705, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.359550561797753, |
|
"grad_norm": 0.40476611256599426, |
|
"learning_rate": 0.0001536580409926296, |
|
"loss": 0.0715, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.4719101123595504, |
|
"grad_norm": 0.5198217034339905, |
|
"learning_rate": 0.0001488682097924756, |
|
"loss": 0.0658, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.5842696629213484, |
|
"grad_norm": 0.3450973331928253, |
|
"learning_rate": 0.00014392625495805912, |
|
"loss": 0.0613, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.696629213483146, |
|
"grad_norm": 0.4224022626876831, |
|
"learning_rate": 0.00013884756048099687, |
|
"loss": 0.0663, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.808988764044944, |
|
"grad_norm": 0.3117457926273346, |
|
"learning_rate": 0.00013364793601473106, |
|
"loss": 0.0577, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.9213483146067416, |
|
"grad_norm": 0.2872294783592224, |
|
"learning_rate": 0.00012834356766008197, |
|
"loss": 0.0626, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.033707865168539, |
|
"grad_norm": 0.4661742150783539, |
|
"learning_rate": 0.0001229509675789439, |
|
"loss": 0.0619, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.146067415730337, |
|
"grad_norm": 0.3543541729450226, |
|
"learning_rate": 0.00011748692259297347, |
|
"loss": 0.0494, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.258426966292135, |
|
"grad_norm": 0.3537474572658539, |
|
"learning_rate": 0.00011196844192727984, |
|
"loss": 0.052, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.370786516853933, |
|
"grad_norm": 0.3915044367313385, |
|
"learning_rate": 0.00010641270426178676, |
|
"loss": 0.0547, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.4831460674157304, |
|
"grad_norm": 0.3162959814071655, |
|
"learning_rate": 0.00010083700425509279, |
|
"loss": 0.0524, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.595505617977528, |
|
"grad_norm": 0.38653671741485596, |
|
"learning_rate": 9.52586987072972e-05, |
|
"loss": 0.0465, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.7078651685393256, |
|
"grad_norm": 0.3417101800441742, |
|
"learning_rate": 8.969515252938322e-05, |
|
"loss": 0.0482, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.8202247191011236, |
|
"grad_norm": 0.28841620683670044, |
|
"learning_rate": 8.41636846873528e-05, |
|
"loss": 0.0564, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.932584269662921, |
|
"grad_norm": 0.3317897915840149, |
|
"learning_rate": 7.868151428938502e-05, |
|
"loss": 0.0458, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.044943820224719, |
|
"grad_norm": 0.1963573545217514, |
|
"learning_rate": 7.326570698384568e-05, |
|
"loss": 0.0498, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.157303370786517, |
|
"grad_norm": 0.2925940454006195, |
|
"learning_rate": 6.793312183500759e-05, |
|
"loss": 0.0411, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.269662921348314, |
|
"grad_norm": 0.2857739329338074, |
|
"learning_rate": 6.270035884185367e-05, |
|
"loss": 0.0391, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.382022471910112, |
|
"grad_norm": 0.3383257985115051, |
|
"learning_rate": 5.758370726333434e-05, |
|
"loss": 0.0448, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.49438202247191, |
|
"grad_norm": 0.2550479471683502, |
|
"learning_rate": 5.2599094910938594e-05, |
|
"loss": 0.0382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.606741573033708, |
|
"grad_norm": 0.21575377881526947, |
|
"learning_rate": 4.7762038566428155e-05, |
|
"loss": 0.043, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.719101123595506, |
|
"grad_norm": 0.24605822563171387, |
|
"learning_rate": 4.3087595679081096e-05, |
|
"loss": 0.0417, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.831460674157303, |
|
"grad_norm": 0.2827017903327942, |
|
"learning_rate": 3.8590317492808236e-05, |
|
"loss": 0.0421, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.943820224719101, |
|
"grad_norm": 0.2163945734500885, |
|
"learning_rate": 3.428420374905483e-05, |
|
"loss": 0.0412, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.056179775280899, |
|
"grad_norm": 0.22983698546886444, |
|
"learning_rate": 3.0182659106494192e-05, |
|
"loss": 0.0393, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.168539325842697, |
|
"grad_norm": 0.25033506751060486, |
|
"learning_rate": 2.629845141317656e-05, |
|
"loss": 0.0358, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.280898876404494, |
|
"grad_norm": 0.21685783565044403, |
|
"learning_rate": 2.264367196102869e-05, |
|
"loss": 0.0332, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.393258426966292, |
|
"grad_norm": 0.2170724868774414, |
|
"learning_rate": 1.9229697846429773e-05, |
|
"loss": 0.0379, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.50561797752809, |
|
"grad_norm": 0.17588071525096893, |
|
"learning_rate": 1.606715655403289e-05, |
|
"loss": 0.033, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.617977528089888, |
|
"grad_norm": 0.3224557936191559, |
|
"learning_rate": 1.3165892874079899e-05, |
|
"loss": 0.0379, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.730337078651686, |
|
"grad_norm": 0.443591445684433, |
|
"learning_rate": 1.0534938256194671e-05, |
|
"loss": 0.0352, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.842696629213483, |
|
"grad_norm": 0.23971615731716156, |
|
"learning_rate": 8.182482695053728e-06, |
|
"loss": 0.0343, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 5.955056179775281, |
|
"grad_norm": 0.33371302485466003, |
|
"learning_rate": 6.1158492354529195e-06, |
|
"loss": 0.0346, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.067415730337078, |
|
"grad_norm": 0.2301403433084488, |
|
"learning_rate": 4.3414711761338375e-06, |
|
"loss": 0.0343, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.179775280898877, |
|
"grad_norm": 0.16674034297466278, |
|
"learning_rate": 2.8648720433333996e-06, |
|
"loss": 0.03, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.292134831460674, |
|
"grad_norm": 0.2350272834300995, |
|
"learning_rate": 1.6906483963973207e-06, |
|
"loss": 0.0344, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.404494382022472, |
|
"grad_norm": 0.1408814638853073, |
|
"learning_rate": 8.224555189827565e-07, |
|
"loss": 0.0387, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.51685393258427, |
|
"grad_norm": 0.2586834132671356, |
|
"learning_rate": 2.629960403923715e-07, |
|
"loss": 0.0329, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.629213483146067, |
|
"grad_norm": 0.21313035488128662, |
|
"learning_rate": 1.4011522460866122e-08, |
|
"loss": 0.0324, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.662921348314606, |
|
"step": 593, |
|
"total_flos": 8.40154581763943e+16, |
|
"train_loss": 0.0879759074865666, |
|
"train_runtime": 704.4365, |
|
"train_samples_per_second": 53.876, |
|
"train_steps_per_second": 0.842 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 593, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.40154581763943e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|