{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.629948364888124,
  "eval_steps": 5000,
  "global_step": 8500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1721170395869191,
      "grad_norm": 134.16664123535156,
      "learning_rate": 2.224770642201835e-06,
      "loss": 10.8697,
      "step": 100
    },
    {
      "epoch": 0.3442340791738382,
      "grad_norm": 69.6041259765625,
      "learning_rate": 4.5183486238532115e-06,
      "loss": 9.1125,
      "step": 200
    },
    {
      "epoch": 0.5163511187607573,
      "grad_norm": 103.4394760131836,
      "learning_rate": 6.8119266055045875e-06,
      "loss": 6.8873,
      "step": 300
    },
    {
      "epoch": 0.6884681583476764,
      "grad_norm": 32.237342834472656,
      "learning_rate": 9.08256880733945e-06,
      "loss": 3.1124,
      "step": 400
    },
    {
      "epoch": 0.8605851979345955,
      "grad_norm": 13.523033142089844,
      "learning_rate": 1.1376146788990828e-05,
      "loss": 1.0882,
      "step": 500
    },
    {
      "epoch": 1.0327022375215147,
      "grad_norm": 56.13002395629883,
      "learning_rate": 1.3669724770642203e-05,
      "loss": 0.869,
      "step": 600
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 4.41276741027832,
      "learning_rate": 1.5963302752293578e-05,
      "loss": 0.6952,
      "step": 700
    },
    {
      "epoch": 1.3769363166953528,
      "grad_norm": 1.8771318197250366,
      "learning_rate": 1.8256880733944955e-05,
      "loss": 0.5522,
      "step": 800
    },
    {
      "epoch": 1.549053356282272,
      "grad_norm": 5.131401062011719,
      "learning_rate": 1.9938798928981258e-05,
      "loss": 0.5184,
      "step": 900
    },
    {
      "epoch": 1.721170395869191,
      "grad_norm": 3.2520999908447266,
      "learning_rate": 1.9683794466403164e-05,
      "loss": 0.3996,
      "step": 1000
    },
    {
      "epoch": 1.8932874354561102,
      "grad_norm": 1.1916402578353882,
      "learning_rate": 1.9428790003825067e-05,
      "loss": 0.6316,
      "step": 1100
    },
    {
      "epoch": 2.0654044750430294,
      "grad_norm": 3.3565962314605713,
      "learning_rate": 1.9173785541246974e-05,
      "loss": 0.5352,
      "step": 1200
    },
    {
      "epoch": 2.2375215146299485,
      "grad_norm": 2.182133436203003,
      "learning_rate": 1.8918781078668877e-05,
      "loss": 0.3731,
      "step": 1300
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 3.537748098373413,
      "learning_rate": 1.8663776616090783e-05,
      "loss": 0.3376,
      "step": 1400
    },
    {
      "epoch": 2.581755593803787,
      "grad_norm": 2.066549777984619,
      "learning_rate": 1.840877215351269e-05,
      "loss": 0.597,
      "step": 1500
    },
    {
      "epoch": 2.7538726333907055,
      "grad_norm": 2.867453098297119,
      "learning_rate": 1.8153767690934592e-05,
      "loss": 0.5737,
      "step": 1600
    },
    {
      "epoch": 2.9259896729776247,
      "grad_norm": 0.8096536993980408,
      "learning_rate": 1.7898763228356495e-05,
      "loss": 0.7107,
      "step": 1700
    },
    {
      "epoch": 3.098106712564544,
      "grad_norm": 5.293230056762695,
      "learning_rate": 1.7643758765778402e-05,
      "loss": 0.4356,
      "step": 1800
    },
    {
      "epoch": 3.270223752151463,
      "grad_norm": 2.1939845085144043,
      "learning_rate": 1.7388754303200308e-05,
      "loss": 0.5581,
      "step": 1900
    },
    {
      "epoch": 3.442340791738382,
      "grad_norm": 2.1973116397857666,
      "learning_rate": 1.713374984062221e-05,
      "loss": 0.2012,
      "step": 2000
    },
    {
      "epoch": 3.6144578313253013,
      "grad_norm": 1.3364547491073608,
      "learning_rate": 1.6878745378044118e-05,
      "loss": 0.3906,
      "step": 2100
    },
    {
      "epoch": 3.7865748709122204,
      "grad_norm": 3.2359094619750977,
      "learning_rate": 1.662374091546602e-05,
      "loss": 0.5386,
      "step": 2200
    },
    {
      "epoch": 3.958691910499139,
      "grad_norm": 31.699663162231445,
      "learning_rate": 1.6368736452887927e-05,
      "loss": 0.2624,
      "step": 2300
    },
    {
      "epoch": 4.130808950086059,
      "grad_norm": 92.98713684082031,
      "learning_rate": 1.611373199030983e-05,
      "loss": 0.3573,
      "step": 2400
    },
    {
      "epoch": 4.3029259896729775,
      "grad_norm": 2.056157350540161,
      "learning_rate": 1.5858727527731736e-05,
      "loss": 0.4798,
      "step": 2500
    },
    {
      "epoch": 4.475043029259897,
      "grad_norm": 7.822810649871826,
      "learning_rate": 1.5606273109779423e-05,
      "loss": 0.2465,
      "step": 2600
    },
    {
      "epoch": 4.647160068846816,
      "grad_norm": 1.6002038717269897,
      "learning_rate": 1.5351268647201326e-05,
      "loss": 0.3482,
      "step": 2700
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 2.061086416244507,
      "learning_rate": 1.5096264184623233e-05,
      "loss": 0.1915,
      "step": 2800
    },
    {
      "epoch": 4.991394148020654,
      "grad_norm": 1.1744683980941772,
      "learning_rate": 1.4841259722045136e-05,
      "loss": 0.4617,
      "step": 2900
    },
    {
      "epoch": 5.163511187607573,
      "grad_norm": 2.5757875442504883,
      "learning_rate": 1.4586255259467042e-05,
      "loss": 0.2874,
      "step": 3000
    },
    {
      "epoch": 5.335628227194492,
      "grad_norm": 8.106232643127441,
      "learning_rate": 1.4331250796888947e-05,
      "loss": 0.4636,
      "step": 3100
    },
    {
      "epoch": 5.507745266781411,
      "grad_norm": 2.139594316482544,
      "learning_rate": 1.4076246334310853e-05,
      "loss": 0.1344,
      "step": 3200
    },
    {
      "epoch": 5.679862306368331,
      "grad_norm": 8.198427200317383,
      "learning_rate": 1.3821241871732756e-05,
      "loss": 0.3615,
      "step": 3300
    },
    {
      "epoch": 5.851979345955249,
      "grad_norm": 0.706113338470459,
      "learning_rate": 1.3566237409154661e-05,
      "loss": 0.309,
      "step": 3400
    },
    {
      "epoch": 6.024096385542169,
      "grad_norm": 1.0154913663864136,
      "learning_rate": 1.3311232946576567e-05,
      "loss": 0.1883,
      "step": 3500
    },
    {
      "epoch": 6.196213425129088,
      "grad_norm": 27.715837478637695,
      "learning_rate": 1.3056228483998472e-05,
      "loss": 0.4029,
      "step": 3600
    },
    {
      "epoch": 6.368330464716007,
      "grad_norm": 3.0514609813690186,
      "learning_rate": 1.2801224021420375e-05,
      "loss": 0.2082,
      "step": 3700
    },
    {
      "epoch": 6.540447504302926,
      "grad_norm": 3.8193249702453613,
      "learning_rate": 1.2546219558842281e-05,
      "loss": 0.1333,
      "step": 3800
    },
    {
      "epoch": 6.712564543889846,
      "grad_norm": 1.4768047332763672,
      "learning_rate": 1.2291215096264186e-05,
      "loss": 0.1509,
      "step": 3900
    },
    {
      "epoch": 6.884681583476764,
      "grad_norm": 1.5106594562530518,
      "learning_rate": 1.2036210633686089e-05,
      "loss": 0.6264,
      "step": 4000
    },
    {
      "epoch": 7.056798623063683,
      "grad_norm": 1.1024622917175293,
      "learning_rate": 1.1781206171107995e-05,
      "loss": 0.2177,
      "step": 4100
    },
    {
      "epoch": 7.228915662650603,
      "grad_norm": 0.900026798248291,
      "learning_rate": 1.15262017085299e-05,
      "loss": 0.1957,
      "step": 4200
    },
    {
      "epoch": 7.401032702237521,
      "grad_norm": 144.5244140625,
      "learning_rate": 1.1271197245951807e-05,
      "loss": 0.2887,
      "step": 4300
    },
    {
      "epoch": 7.573149741824441,
      "grad_norm": 4.466265678405762,
      "learning_rate": 1.101619278337371e-05,
      "loss": 0.2271,
      "step": 4400
    },
    {
      "epoch": 7.74526678141136,
      "grad_norm": 2.862029790878296,
      "learning_rate": 1.0761188320795614e-05,
      "loss": 0.3486,
      "step": 4500
    },
    {
      "epoch": 7.917383820998279,
      "grad_norm": 1.178603172302246,
      "learning_rate": 1.050618385821752e-05,
      "loss": 0.4429,
      "step": 4600
    },
    {
      "epoch": 8.089500860585199,
      "grad_norm": 6.430075645446777,
      "learning_rate": 1.0251179395639424e-05,
      "loss": 0.4398,
      "step": 4700
    },
    {
      "epoch": 8.261617900172118,
      "grad_norm": 6.42482852935791,
      "learning_rate": 9.996174933061328e-06,
      "loss": 0.31,
      "step": 4800
    },
    {
      "epoch": 8.433734939759036,
      "grad_norm": 6.2779622077941895,
      "learning_rate": 9.743720515109015e-06,
      "loss": 0.2045,
      "step": 4900
    },
    {
      "epoch": 8.605851979345955,
      "grad_norm": 4.175030708312988,
      "learning_rate": 9.48871605253092e-06,
      "loss": 0.2583,
      "step": 5000
    },
    {
      "epoch": 8.605851979345955,
      "eval_loss": 0.23712533712387085,
      "eval_runtime": 13.9021,
      "eval_samples_per_second": 1335.195,
      "eval_steps_per_second": 10.502,
      "step": 5000
    },
    {
      "epoch": 8.777969018932874,
      "grad_norm": 1.0522035360336304,
      "learning_rate": 9.233711589952825e-06,
      "loss": 0.2774,
      "step": 5100
    },
    {
      "epoch": 8.950086058519794,
      "grad_norm": 0.9467515349388123,
      "learning_rate": 8.98125717200051e-06,
      "loss": 0.1902,
      "step": 5200
    },
    {
      "epoch": 9.122203098106713,
      "grad_norm": 9.148195266723633,
      "learning_rate": 8.726252709422416e-06,
      "loss": 0.3058,
      "step": 5300
    },
    {
      "epoch": 9.294320137693632,
      "grad_norm": 9.301542282104492,
      "learning_rate": 8.471248246844321e-06,
      "loss": 0.3742,
      "step": 5400
    },
    {
      "epoch": 9.46643717728055,
      "grad_norm": 17.278079986572266,
      "learning_rate": 8.216243784266226e-06,
      "loss": 0.2972,
      "step": 5500
    },
    {
      "epoch": 9.638554216867469,
      "grad_norm": 4.043286323547363,
      "learning_rate": 7.96123932168813e-06,
      "loss": 0.3084,
      "step": 5600
    },
    {
      "epoch": 9.81067125645439,
      "grad_norm": 2.227259874343872,
      "learning_rate": 7.706234859110035e-06,
      "loss": 0.1215,
      "step": 5700
    },
    {
      "epoch": 9.982788296041308,
      "grad_norm": 0.9134290218353271,
      "learning_rate": 7.45123039653194e-06,
      "loss": 0.1876,
      "step": 5800
    },
    {
      "epoch": 10.154905335628227,
      "grad_norm": 1.4163002967834473,
      "learning_rate": 7.196225933953844e-06,
      "loss": 0.1702,
      "step": 5900
    },
    {
      "epoch": 10.327022375215146,
      "grad_norm": 1.564228892326355,
      "learning_rate": 6.94122147137575e-06,
      "loss": 0.2506,
      "step": 6000
    },
    {
      "epoch": 10.499139414802066,
      "grad_norm": 5.47558069229126,
      "learning_rate": 6.686217008797654e-06,
      "loss": 0.2852,
      "step": 6100
    },
    {
      "epoch": 10.671256454388985,
      "grad_norm": 10.801889419555664,
      "learning_rate": 6.431212546219559e-06,
      "loss": 0.2354,
      "step": 6200
    },
    {
      "epoch": 10.843373493975903,
      "grad_norm": 1.8754569292068481,
      "learning_rate": 6.176208083641464e-06,
      "loss": 0.214,
      "step": 6300
    },
    {
      "epoch": 11.015490533562822,
      "grad_norm": 2.237508773803711,
      "learning_rate": 5.9212036210633696e-06,
      "loss": 0.3815,
      "step": 6400
    },
    {
      "epoch": 11.187607573149743,
      "grad_norm": 13.412964820861816,
      "learning_rate": 5.666199158485273e-06,
      "loss": 0.0803,
      "step": 6500
    },
    {
      "epoch": 11.359724612736661,
      "grad_norm": 0.996343195438385,
      "learning_rate": 5.411194695907179e-06,
      "loss": 0.1941,
      "step": 6600
    },
    {
      "epoch": 11.53184165232358,
      "grad_norm": 64.30641174316406,
      "learning_rate": 5.156190233329084e-06,
      "loss": 0.1576,
      "step": 6700
    },
    {
      "epoch": 11.703958691910499,
      "grad_norm": 0.9045078158378601,
      "learning_rate": 4.901185770750988e-06,
      "loss": 0.2911,
      "step": 6800
    },
    {
      "epoch": 11.876075731497417,
      "grad_norm": 1.798627495765686,
      "learning_rate": 4.646181308172894e-06,
      "loss": 0.4913,
      "step": 6900
    },
    {
      "epoch": 12.048192771084338,
      "grad_norm": 6.165831565856934,
      "learning_rate": 4.3911768455947986e-06,
      "loss": 0.2759,
      "step": 7000
    },
    {
      "epoch": 12.220309810671257,
      "grad_norm": 3.460507392883301,
      "learning_rate": 4.136172383016703e-06,
      "loss": 0.2928,
      "step": 7100
    },
    {
      "epoch": 12.392426850258175,
      "grad_norm": 5.5960187911987305,
      "learning_rate": 3.881167920438608e-06,
      "loss": 0.2181,
      "step": 7200
    },
    {
      "epoch": 12.564543889845094,
      "grad_norm": 1.473883032798767,
      "learning_rate": 3.6261634578605126e-06,
      "loss": 0.1286,
      "step": 7300
    },
    {
      "epoch": 12.736660929432015,
      "grad_norm": 10.179828643798828,
      "learning_rate": 3.3711589952824173e-06,
      "loss": 0.3342,
      "step": 7400
    },
    {
      "epoch": 12.908777969018933,
      "grad_norm": 3.2593960762023926,
      "learning_rate": 3.1187045773301034e-06,
      "loss": 0.1577,
      "step": 7500
    },
    {
      "epoch": 13.080895008605852,
      "grad_norm": 1.169028878211975,
      "learning_rate": 2.863700114752008e-06,
      "loss": 0.2578,
      "step": 7600
    },
    {
      "epoch": 13.25301204819277,
      "grad_norm": 1.2204866409301758,
      "learning_rate": 2.6086956521739132e-06,
      "loss": 0.2844,
      "step": 7700
    },
    {
      "epoch": 13.42512908777969,
      "grad_norm": 1.1637088060379028,
      "learning_rate": 2.353691189595818e-06,
      "loss": 0.0917,
      "step": 7800
    },
    {
      "epoch": 13.59724612736661,
      "grad_norm": 0.6639829277992249,
      "learning_rate": 2.098686727017723e-06,
      "loss": 0.2617,
      "step": 7900
    },
    {
      "epoch": 13.769363166953529,
      "grad_norm": 4.155405044555664,
      "learning_rate": 1.843682264439628e-06,
      "loss": 0.3021,
      "step": 8000
    },
    {
      "epoch": 13.941480206540447,
      "grad_norm": 1.9663244485855103,
      "learning_rate": 1.5886778018615326e-06,
      "loss": 0.1036,
      "step": 8100
    },
    {
      "epoch": 14.113597246127366,
      "grad_norm": 32.85494613647461,
      "learning_rate": 1.3336733392834375e-06,
      "loss": 0.5471,
      "step": 8200
    },
    {
      "epoch": 14.285714285714286,
      "grad_norm": 0.6067169904708862,
      "learning_rate": 1.0786688767053424e-06,
      "loss": 0.2395,
      "step": 8300
    },
    {
      "epoch": 14.457831325301205,
      "grad_norm": 0.9747382998466492,
      "learning_rate": 8.236644141272474e-07,
      "loss": 0.2664,
      "step": 8400
    },
    {
      "epoch": 14.629948364888124,
      "grad_norm": 21.624757766723633,
      "learning_rate": 5.686599515491522e-07,
      "loss": 0.2697,
      "step": 8500
    }
  ],
  "logging_steps": 100,
  "max_steps": 8715,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}