{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.25460956377173916,
  "eval_steps": 500,
  "global_step": 3800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 6.700251678203663e-05,
      "grad_norm": 5.96875,
      "learning_rate": 0.0002,
      "loss": 2.3034,
      "step": 1
    },
    {
      "epoch": 0.0067002516782036625,
      "grad_norm": 0.8203125,
      "learning_rate": 0.00019867336683417085,
      "loss": 1.0562,
      "step": 100
    },
    {
      "epoch": 0.013400503356407325,
      "grad_norm": 0.7734375,
      "learning_rate": 0.00019733333333333335,
      "loss": 0.9729,
      "step": 200
    },
    {
      "epoch": 0.02010075503461099,
      "grad_norm": 0.75,
      "learning_rate": 0.00019599329983249582,
      "loss": 0.9693,
      "step": 300
    },
    {
      "epoch": 0.02680100671281465,
      "grad_norm": 0.78125,
      "learning_rate": 0.00019465326633165831,
      "loss": 0.9563,
      "step": 400
    },
    {
      "epoch": 0.033501258391018314,
      "grad_norm": 0.72265625,
      "learning_rate": 0.00019331323283082078,
      "loss": 0.9593,
      "step": 500
    },
    {
      "epoch": 0.04020151006922198,
      "grad_norm": 0.73828125,
      "learning_rate": 0.00019197319932998325,
      "loss": 0.9478,
      "step": 600
    },
    {
      "epoch": 0.046901761747425635,
      "grad_norm": 0.703125,
      "learning_rate": 0.00019063316582914575,
      "loss": 0.9474,
      "step": 700
    },
    {
      "epoch": 0.0536020134256293,
      "grad_norm": 0.7734375,
      "learning_rate": 0.00018929313232830821,
      "loss": 0.9438,
      "step": 800
    },
    {
      "epoch": 0.060302265103832964,
      "grad_norm": 0.70703125,
      "learning_rate": 0.0001879530988274707,
      "loss": 0.9297,
      "step": 900
    },
    {
      "epoch": 0.06700251678203663,
      "grad_norm": 0.73828125,
      "learning_rate": 0.00018661306532663318,
      "loss": 0.9297,
      "step": 1000
    },
    {
      "epoch": 0.07370276846024029,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00018527303182579565,
      "loss": 0.9218,
      "step": 1100
    },
    {
      "epoch": 0.08040302013844396,
      "grad_norm": 0.73046875,
      "learning_rate": 0.00018393299832495814,
      "loss": 0.9129,
      "step": 1200
    },
    {
      "epoch": 0.0871032718166476,
      "grad_norm": 0.70703125,
      "learning_rate": 0.0001825929648241206,
      "loss": 0.9156,
      "step": 1300
    },
    {
      "epoch": 0.09380352349485127,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00018125293132328308,
      "loss": 0.9151,
      "step": 1400
    },
    {
      "epoch": 0.10050377517305494,
      "grad_norm": 1.0625,
      "learning_rate": 0.00017991289782244557,
      "loss": 0.9115,
      "step": 1500
    },
    {
      "epoch": 0.1072040268512586,
      "grad_norm": 0.796875,
      "learning_rate": 0.00017857286432160804,
      "loss": 0.9114,
      "step": 1600
    },
    {
      "epoch": 0.11390427852946226,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00017723283082077054,
      "loss": 0.9064,
      "step": 1700
    },
    {
      "epoch": 0.12060453020766593,
      "grad_norm": 0.69921875,
      "learning_rate": 0.000175892797319933,
      "loss": 0.9039,
      "step": 1800
    },
    {
      "epoch": 0.12730478188586958,
      "grad_norm": 0.68359375,
      "learning_rate": 0.00017455276381909548,
      "loss": 0.8975,
      "step": 1900
    },
    {
      "epoch": 0.13400503356407326,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00017321273031825794,
      "loss": 0.8919,
      "step": 2000
    },
    {
      "epoch": 0.1407052852422769,
      "grad_norm": 0.80078125,
      "learning_rate": 0.00017187269681742044,
      "loss": 0.8902,
      "step": 2100
    },
    {
      "epoch": 0.14740553692048058,
      "grad_norm": 0.67578125,
      "learning_rate": 0.00017053266331658293,
      "loss": 0.8941,
      "step": 2200
    },
    {
      "epoch": 0.15410578859868423,
      "grad_norm": 0.6875,
      "learning_rate": 0.0001691926298157454,
      "loss": 0.8928,
      "step": 2300
    },
    {
      "epoch": 0.1608060402768879,
      "grad_norm": 0.7578125,
      "learning_rate": 0.00016785259631490787,
      "loss": 0.8889,
      "step": 2400
    },
    {
      "epoch": 0.16750629195509156,
      "grad_norm": 0.69921875,
      "learning_rate": 0.00016651256281407034,
      "loss": 0.8825,
      "step": 2500
    },
    {
      "epoch": 0.1742065436332952,
      "grad_norm": 0.703125,
      "learning_rate": 0.00016517252931323284,
      "loss": 0.8742,
      "step": 2600
    },
    {
      "epoch": 0.1809067953114989,
      "grad_norm": 0.67578125,
      "learning_rate": 0.00016383249581239533,
      "loss": 0.878,
      "step": 2700
    },
    {
      "epoch": 0.18760704698970254,
      "grad_norm": 0.66796875,
      "learning_rate": 0.0001624924623115578,
      "loss": 0.8679,
      "step": 2800
    },
    {
      "epoch": 0.19430729866790622,
      "grad_norm": 0.6796875,
      "learning_rate": 0.00016115242881072027,
      "loss": 0.8618,
      "step": 2900
    },
    {
      "epoch": 0.20100755034610987,
      "grad_norm": 0.67578125,
      "learning_rate": 0.00015981239530988274,
      "loss": 0.8695,
      "step": 3000
    },
    {
      "epoch": 0.20770780202431355,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00015847236180904523,
      "loss": 0.8769,
      "step": 3100
    },
    {
      "epoch": 0.2144080537025172,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00015713232830820773,
      "loss": 0.8665,
      "step": 3200
    },
    {
      "epoch": 0.22110830538072088,
      "grad_norm": 0.7109375,
      "learning_rate": 0.0001557922948073702,
      "loss": 0.8572,
      "step": 3300
    },
    {
      "epoch": 0.22780855705892453,
      "grad_norm": 0.65625,
      "learning_rate": 0.00015445226130653266,
      "loss": 0.8696,
      "step": 3400
    },
    {
      "epoch": 0.23450880873712818,
      "grad_norm": 0.7578125,
      "learning_rate": 0.00015311222780569513,
      "loss": 0.8543,
      "step": 3500
    },
    {
      "epoch": 0.24120906041533186,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00015177219430485763,
      "loss": 0.8618,
      "step": 3600
    },
    {
      "epoch": 0.2479093120935355,
      "grad_norm": 0.640625,
      "learning_rate": 0.00015043216080402012,
      "loss": 0.8553,
      "step": 3700
    },
    {
      "epoch": 0.25460956377173916,
      "grad_norm": 0.640625,
      "learning_rate": 0.0001490921273031826,
      "loss": 0.8541,
      "step": 3800
    }
  ],
  "logging_steps": 100,
  "max_steps": 14925,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.0541408970364707e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}