|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.030517578125, |
|
"eval_steps": 500, |
|
"global_step": 8000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003814697265625, |
|
"grad_norm": 1.8202160596847534, |
|
"learning_rate": 4e-05, |
|
"loss": 1.7732, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.000762939453125, |
|
"grad_norm": 1.0439223051071167, |
|
"learning_rate": 8e-05, |
|
"loss": 0.3055, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0011444091796875, |
|
"grad_norm": 0.49549755454063416, |
|
"learning_rate": 0.00012, |
|
"loss": 0.2596, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.00152587890625, |
|
"grad_norm": 0.9390599727630615, |
|
"learning_rate": 0.00016, |
|
"loss": 0.2547, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0019073486328125, |
|
"grad_norm": 0.4973730444908142, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2534, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.002288818359375, |
|
"grad_norm": 0.45400354266166687, |
|
"learning_rate": 0.000199748427672956, |
|
"loss": 0.2446, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0026702880859375, |
|
"grad_norm": 0.39165329933166504, |
|
"learning_rate": 0.00019949685534591195, |
|
"loss": 0.2411, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0030517578125, |
|
"grad_norm": 0.4248354434967041, |
|
"learning_rate": 0.00019924528301886794, |
|
"loss": 0.2392, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0034332275390625, |
|
"grad_norm": 0.35752373933792114, |
|
"learning_rate": 0.0001989937106918239, |
|
"loss": 0.236, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.003814697265625, |
|
"grad_norm": 0.39206448197364807, |
|
"learning_rate": 0.00019874213836477988, |
|
"loss": 0.2322, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0041961669921875, |
|
"grad_norm": 0.3509558439254761, |
|
"learning_rate": 0.00019849056603773587, |
|
"loss": 0.2312, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.00457763671875, |
|
"grad_norm": 0.3513820171356201, |
|
"learning_rate": 0.00019823899371069183, |
|
"loss": 0.2308, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.0049591064453125, |
|
"grad_norm": 0.434176504611969, |
|
"learning_rate": 0.0001979874213836478, |
|
"loss": 0.2284, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.005340576171875, |
|
"grad_norm": 0.37612399458885193, |
|
"learning_rate": 0.0001977358490566038, |
|
"loss": 0.2289, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.0057220458984375, |
|
"grad_norm": 0.3991953134536743, |
|
"learning_rate": 0.00019748427672955975, |
|
"loss": 0.23, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.006103515625, |
|
"grad_norm": 0.4121605157852173, |
|
"learning_rate": 0.00019723270440251574, |
|
"loss": 0.2284, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.0064849853515625, |
|
"grad_norm": 0.3937987983226776, |
|
"learning_rate": 0.0001969811320754717, |
|
"loss": 0.2249, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.006866455078125, |
|
"grad_norm": 0.2995181083679199, |
|
"learning_rate": 0.00019672955974842768, |
|
"loss": 0.2257, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.0072479248046875, |
|
"grad_norm": 0.5119357705116272, |
|
"learning_rate": 0.00019647798742138367, |
|
"loss": 0.2292, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.00762939453125, |
|
"grad_norm": 0.31295427680015564, |
|
"learning_rate": 0.00019622641509433963, |
|
"loss": 0.2289, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0080108642578125, |
|
"grad_norm": 0.2797456979751587, |
|
"learning_rate": 0.0001959748427672956, |
|
"loss": 0.2232, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.008392333984375, |
|
"grad_norm": 0.45458996295928955, |
|
"learning_rate": 0.00019572327044025157, |
|
"loss": 0.2244, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.0087738037109375, |
|
"grad_norm": 0.29631954431533813, |
|
"learning_rate": 0.00019547169811320755, |
|
"loss": 0.2234, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.0091552734375, |
|
"grad_norm": 0.5060445070266724, |
|
"learning_rate": 0.00019522012578616354, |
|
"loss": 0.2265, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.0095367431640625, |
|
"grad_norm": 0.28566980361938477, |
|
"learning_rate": 0.0001949685534591195, |
|
"loss": 0.2279, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.009918212890625, |
|
"grad_norm": 0.24325500428676605, |
|
"learning_rate": 0.00019471698113207548, |
|
"loss": 0.2306, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.0102996826171875, |
|
"grad_norm": 0.3140350878238678, |
|
"learning_rate": 0.00019446540880503147, |
|
"loss": 0.2234, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.01068115234375, |
|
"grad_norm": 0.4366394877433777, |
|
"learning_rate": 0.00019421383647798743, |
|
"loss": 0.2224, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.0110626220703125, |
|
"grad_norm": 0.27782708406448364, |
|
"learning_rate": 0.0001939622641509434, |
|
"loss": 0.2236, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.011444091796875, |
|
"grad_norm": 0.3332788944244385, |
|
"learning_rate": 0.00019371069182389937, |
|
"loss": 0.2241, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0118255615234375, |
|
"grad_norm": 0.3888827860355377, |
|
"learning_rate": 0.00019345911949685536, |
|
"loss": 0.2217, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.01220703125, |
|
"grad_norm": 0.24029745161533356, |
|
"learning_rate": 0.00019320754716981134, |
|
"loss": 0.2216, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.0125885009765625, |
|
"grad_norm": 1.8477509021759033, |
|
"learning_rate": 0.0001929559748427673, |
|
"loss": 0.2252, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.012969970703125, |
|
"grad_norm": 0.5924927592277527, |
|
"learning_rate": 0.00019270440251572328, |
|
"loss": 0.2352, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.0133514404296875, |
|
"grad_norm": 0.33940935134887695, |
|
"learning_rate": 0.00019245283018867927, |
|
"loss": 0.2253, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.01373291015625, |
|
"grad_norm": 0.3898316025733948, |
|
"learning_rate": 0.00019220125786163523, |
|
"loss": 0.2216, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.0141143798828125, |
|
"grad_norm": 0.2601265609264374, |
|
"learning_rate": 0.0001919496855345912, |
|
"loss": 0.2261, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.014495849609375, |
|
"grad_norm": 0.32615959644317627, |
|
"learning_rate": 0.00019169811320754717, |
|
"loss": 0.2225, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.0148773193359375, |
|
"grad_norm": 0.2891947627067566, |
|
"learning_rate": 0.00019144654088050316, |
|
"loss": 0.2216, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.0152587890625, |
|
"grad_norm": 0.2846430242061615, |
|
"learning_rate": 0.00019119496855345914, |
|
"loss": 0.2197, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0156402587890625, |
|
"grad_norm": 0.2938269078731537, |
|
"learning_rate": 0.0001909433962264151, |
|
"loss": 0.2212, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.016021728515625, |
|
"grad_norm": 0.2718958258628845, |
|
"learning_rate": 0.00019069182389937108, |
|
"loss": 0.2205, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.0164031982421875, |
|
"grad_norm": 0.3561397194862366, |
|
"learning_rate": 0.00019044025157232704, |
|
"loss": 0.2205, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.01678466796875, |
|
"grad_norm": 0.4546607732772827, |
|
"learning_rate": 0.00019018867924528303, |
|
"loss": 0.2234, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.0171661376953125, |
|
"grad_norm": 0.29250577092170715, |
|
"learning_rate": 0.00018993710691823901, |
|
"loss": 0.2197, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.017547607421875, |
|
"grad_norm": 1.6952908039093018, |
|
"learning_rate": 0.00018968553459119497, |
|
"loss": 0.2217, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.0179290771484375, |
|
"grad_norm": 0.3261864483356476, |
|
"learning_rate": 0.00018943396226415096, |
|
"loss": 0.2269, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.018310546875, |
|
"grad_norm": 0.2668060064315796, |
|
"learning_rate": 0.00018918238993710694, |
|
"loss": 0.2203, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.0186920166015625, |
|
"grad_norm": 0.31689000129699707, |
|
"learning_rate": 0.0001889308176100629, |
|
"loss": 0.2201, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.019073486328125, |
|
"grad_norm": 0.26320216059684753, |
|
"learning_rate": 0.00018867924528301889, |
|
"loss": 0.2214, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0194549560546875, |
|
"grad_norm": 0.26768413186073303, |
|
"learning_rate": 0.00018842767295597484, |
|
"loss": 0.2225, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.01983642578125, |
|
"grad_norm": 0.2808452248573303, |
|
"learning_rate": 0.00018817610062893083, |
|
"loss": 0.2208, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.0202178955078125, |
|
"grad_norm": 0.25958341360092163, |
|
"learning_rate": 0.00018792452830188681, |
|
"loss": 0.2207, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.020599365234375, |
|
"grad_norm": 0.22953402996063232, |
|
"learning_rate": 0.00018767295597484277, |
|
"loss": 0.2193, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.0209808349609375, |
|
"grad_norm": 0.9375737905502319, |
|
"learning_rate": 0.00018742138364779876, |
|
"loss": 0.2206, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0213623046875, |
|
"grad_norm": 0.2852359712123871, |
|
"learning_rate": 0.00018716981132075472, |
|
"loss": 0.2211, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.0217437744140625, |
|
"grad_norm": 0.25367122888565063, |
|
"learning_rate": 0.0001869182389937107, |
|
"loss": 0.2191, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.022125244140625, |
|
"grad_norm": 0.2215207815170288, |
|
"learning_rate": 0.0001866666666666667, |
|
"loss": 0.2218, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.0225067138671875, |
|
"grad_norm": 0.24178574979305267, |
|
"learning_rate": 0.00018641509433962264, |
|
"loss": 0.2283, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.02288818359375, |
|
"grad_norm": 0.3638046979904175, |
|
"learning_rate": 0.00018616352201257863, |
|
"loss": 0.2217, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0232696533203125, |
|
"grad_norm": 0.40834301710128784, |
|
"learning_rate": 0.00018591194968553462, |
|
"loss": 0.2204, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.023651123046875, |
|
"grad_norm": 0.24277737736701965, |
|
"learning_rate": 0.00018566037735849057, |
|
"loss": 0.2178, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.0240325927734375, |
|
"grad_norm": 0.3276098370552063, |
|
"learning_rate": 0.00018540880503144656, |
|
"loss": 0.2245, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.0244140625, |
|
"grad_norm": 0.40407466888427734, |
|
"learning_rate": 0.00018515723270440252, |
|
"loss": 0.2258, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.0247955322265625, |
|
"grad_norm": 0.31675395369529724, |
|
"learning_rate": 0.0001849056603773585, |
|
"loss": 0.223, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.025177001953125, |
|
"grad_norm": 0.2858389616012573, |
|
"learning_rate": 0.0001846540880503145, |
|
"loss": 0.2201, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.0255584716796875, |
|
"grad_norm": 0.2711004912853241, |
|
"learning_rate": 0.00018440251572327045, |
|
"loss": 0.2175, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.02593994140625, |
|
"grad_norm": 0.24398334324359894, |
|
"learning_rate": 0.00018415094339622643, |
|
"loss": 0.2195, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.0263214111328125, |
|
"grad_norm": 0.29580453038215637, |
|
"learning_rate": 0.0001838993710691824, |
|
"loss": 0.2198, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.026702880859375, |
|
"grad_norm": 0.2624952495098114, |
|
"learning_rate": 0.00018364779874213837, |
|
"loss": 0.217, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.0270843505859375, |
|
"grad_norm": 0.2129925936460495, |
|
"learning_rate": 0.00018339622641509436, |
|
"loss": 0.2188, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.0274658203125, |
|
"grad_norm": 0.27471479773521423, |
|
"learning_rate": 0.00018314465408805032, |
|
"loss": 0.2174, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.0278472900390625, |
|
"grad_norm": 1.0204274654388428, |
|
"learning_rate": 0.0001828930817610063, |
|
"loss": 0.2186, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.028228759765625, |
|
"grad_norm": 0.5174055695533752, |
|
"learning_rate": 0.0001826415094339623, |
|
"loss": 0.2198, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.0286102294921875, |
|
"grad_norm": 1.7667677402496338, |
|
"learning_rate": 0.00018238993710691825, |
|
"loss": 0.2221, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.02899169921875, |
|
"grad_norm": 0.34651100635528564, |
|
"learning_rate": 0.00018213836477987423, |
|
"loss": 0.2215, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.0293731689453125, |
|
"grad_norm": 0.2900320589542389, |
|
"learning_rate": 0.0001818867924528302, |
|
"loss": 0.2184, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.029754638671875, |
|
"grad_norm": 0.21523432433605194, |
|
"learning_rate": 0.00018163522012578617, |
|
"loss": 0.2171, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.0301361083984375, |
|
"grad_norm": 0.28846126794815063, |
|
"learning_rate": 0.00018138364779874216, |
|
"loss": 0.2175, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.030517578125, |
|
"grad_norm": 0.27318933606147766, |
|
"learning_rate": 0.00018113207547169812, |
|
"loss": 0.218, |
|
"step": 8000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 80000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.909320422780109e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|