{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.4339419978517722,
  "eval_steps": 500,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04296455424274973,
      "grad_norm": 1.6397182941436768,
      "learning_rate": 5e-06,
      "loss": 3.4918,
      "step": 10
    },
    {
      "epoch": 0.08592910848549946,
      "grad_norm": 1.1852494478225708,
      "learning_rate": 4.998023493068255e-06,
      "loss": 3.1439,
      "step": 20
    },
    {
      "epoch": 0.1288936627282492,
      "grad_norm": 0.813017725944519,
      "learning_rate": 4.99209709753674e-06,
      "loss": 2.7821,
      "step": 30
    },
    {
      "epoch": 0.17185821697099893,
      "grad_norm": 0.8324390053749084,
      "learning_rate": 4.982230184254934e-06,
      "loss": 2.5004,
      "step": 40
    },
    {
      "epoch": 0.21482277121374865,
      "grad_norm": 0.7486415505409241,
      "learning_rate": 4.968438354840834e-06,
      "loss": 2.2932,
      "step": 50
    },
    {
      "epoch": 0.2577873254564984,
      "grad_norm": 0.8176449537277222,
      "learning_rate": 4.950743417011591e-06,
      "loss": 2.0388,
      "step": 60
    },
    {
      "epoch": 0.3007518796992481,
      "grad_norm": 1.1593029499053955,
      "learning_rate": 4.929173350101025e-06,
      "loss": 1.798,
      "step": 70
    },
    {
      "epoch": 0.34371643394199786,
      "grad_norm": 0.8432386517524719,
      "learning_rate": 4.903762260818552e-06,
      "loss": 1.5069,
      "step": 80
    },
    {
      "epoch": 0.3866809881847476,
      "grad_norm": 0.5978631973266602,
      "learning_rate": 4.874550329319457e-06,
      "loss": 1.2985,
      "step": 90
    },
    {
      "epoch": 0.4296455424274973,
      "grad_norm": 0.43345287442207336,
      "learning_rate": 4.84158374567182e-06,
      "loss": 1.1899,
      "step": 100
    },
    {
      "epoch": 0.47261009667024706,
      "grad_norm": 0.41255077719688416,
      "learning_rate": 4.804914636820517e-06,
      "loss": 1.207,
      "step": 110
    },
    {
      "epoch": 0.5155746509129968,
      "grad_norm": 0.3531062602996826,
      "learning_rate": 4.764600984163809e-06,
      "loss": 1.097,
      "step": 120
    },
    {
      "epoch": 0.5585392051557465,
      "grad_norm": 0.3125358819961548,
      "learning_rate": 4.72070653187283e-06,
      "loss": 1.0538,
      "step": 130
    },
    {
      "epoch": 0.6015037593984962,
      "grad_norm": 0.3715002238750458,
      "learning_rate": 4.673300686098957e-06,
      "loss": 1.0238,
      "step": 140
    },
    {
      "epoch": 0.644468313641246,
      "grad_norm": 0.3236304819583893,
      "learning_rate": 4.622458405228411e-06,
      "loss": 1.0329,
      "step": 150
    },
    {
      "epoch": 0.6874328678839957,
      "grad_norm": 0.389879435300827,
      "learning_rate": 4.568260081357644e-06,
      "loss": 1.0452,
      "step": 160
    },
    {
      "epoch": 0.7303974221267454,
      "grad_norm": 0.3259812593460083,
      "learning_rate": 4.510791413176912e-06,
      "loss": 1.0407,
      "step": 170
    },
    {
      "epoch": 0.7733619763694952,
      "grad_norm": 0.3405851125717163,
      "learning_rate": 4.450143270463031e-06,
      "loss": 1.0219,
      "step": 180
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.35685139894485474,
      "learning_rate": 4.386411550395576e-06,
      "loss": 0.9988,
      "step": 190
    },
    {
      "epoch": 0.8592910848549946,
      "grad_norm": 0.3287046253681183,
      "learning_rate": 4.319697025923736e-06,
      "loss": 1.0164,
      "step": 200
    },
    {
      "epoch": 0.9022556390977443,
      "grad_norm": 0.33687785267829895,
      "learning_rate": 4.250105186423564e-06,
      "loss": 0.9938,
      "step": 210
    },
    {
      "epoch": 0.9452201933404941,
      "grad_norm": 0.40256965160369873,
      "learning_rate": 4.177746070897593e-06,
      "loss": 1.001,
      "step": 220
    },
    {
      "epoch": 0.9881847475832438,
      "grad_norm": 0.48085835576057434,
      "learning_rate": 4.10273409398055e-06,
      "loss": 1.0145,
      "step": 230
    },
    {
      "epoch": 1.0300751879699248,
      "grad_norm": 0.4048563241958618,
      "learning_rate": 4.025187865026311e-06,
      "loss": 0.9861,
      "step": 240
    },
    {
      "epoch": 1.0730397422126745,
      "grad_norm": 0.34213146567344666,
      "learning_rate": 3.945230000562121e-06,
      "loss": 0.9843,
      "step": 250
    },
    {
      "epoch": 1.1160042964554242,
      "grad_norm": 0.3930899500846863,
      "learning_rate": 3.862986930406669e-06,
      "loss": 0.9539,
      "step": 260
    },
    {
      "epoch": 1.158968850698174,
      "grad_norm": 0.41636350750923157,
      "learning_rate": 3.7785886977585562e-06,
      "loss": 0.9629,
      "step": 270
    },
    {
      "epoch": 1.2019334049409238,
      "grad_norm": 0.3695228397846222,
      "learning_rate": 3.6921687535712657e-06,
      "loss": 0.9778,
      "step": 280
    },
    {
      "epoch": 1.2448979591836735,
      "grad_norm": 0.3623664677143097,
      "learning_rate": 3.6038637455397802e-06,
      "loss": 0.9613,
      "step": 290
    },
    {
      "epoch": 1.2878625134264232,
      "grad_norm": 0.4789970815181732,
      "learning_rate": 3.513813302032485e-06,
      "loss": 0.9671,
      "step": 300
    },
    {
      "epoch": 1.330827067669173,
      "grad_norm": 0.4214249849319458,
      "learning_rate": 3.4221598113100196e-06,
      "loss": 0.9597,
      "step": 310
    },
    {
      "epoch": 1.3737916219119226,
      "grad_norm": 0.4314541518688202,
      "learning_rate": 3.32904819638017e-06,
      "loss": 0.9872,
      "step": 320
    },
    {
      "epoch": 1.4167561761546725,
      "grad_norm": 0.45763522386550903,
      "learning_rate": 3.234625685844803e-06,
      "loss": 1.006,
      "step": 330
    },
    {
      "epoch": 1.459720730397422,
      "grad_norm": 0.41263076663017273,
      "learning_rate": 3.139041581101187e-06,
      "loss": 0.973,
      "step": 340
    },
    {
      "epoch": 1.502685284640172,
      "grad_norm": 0.5277674198150635,
      "learning_rate": 3.0424470202657953e-06,
      "loss": 0.9525,
      "step": 350
    },
    {
      "epoch": 1.5456498388829216,
      "grad_norm": 0.5219724178314209,
      "learning_rate": 2.9449947391938768e-06,
      "loss": 0.9516,
      "step": 360
    },
    {
      "epoch": 1.5886143931256713,
      "grad_norm": 0.47409552335739136,
      "learning_rate": 2.8468388299726714e-06,
      "loss": 0.9599,
      "step": 370
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.4466871917247772,
      "learning_rate": 2.7481344972701545e-06,
      "loss": 0.952,
      "step": 380
    },
    {
      "epoch": 1.6745435016111707,
      "grad_norm": 0.4826906621456146,
      "learning_rate": 2.64903781292455e-06,
      "loss": 0.9492,
      "step": 390
    },
    {
      "epoch": 1.7175080558539206,
      "grad_norm": 0.4338424801826477,
      "learning_rate": 2.5497054691626754e-06,
      "loss": 0.9584,
      "step": 400
    },
    {
      "epoch": 1.76047261009667,
      "grad_norm": 0.6052074432373047,
      "learning_rate": 2.4502945308373246e-06,
      "loss": 0.9446,
      "step": 410
    },
    {
      "epoch": 1.80343716433942,
      "grad_norm": 0.5061154961585999,
      "learning_rate": 2.3509621870754505e-06,
      "loss": 0.9369,
      "step": 420
    },
    {
      "epoch": 1.8464017185821697,
      "grad_norm": 0.47927358746528625,
      "learning_rate": 2.2518655027298468e-06,
      "loss": 1.0038,
      "step": 430
    },
    {
      "epoch": 1.8893662728249194,
      "grad_norm": 0.4831957221031189,
      "learning_rate": 2.15316117002733e-06,
      "loss": 0.9501,
      "step": 440
    },
    {
      "epoch": 1.9323308270676691,
      "grad_norm": 0.4502115249633789,
      "learning_rate": 2.055005260806125e-06,
      "loss": 0.9333,
      "step": 450
    },
    {
      "epoch": 1.9752953813104188,
      "grad_norm": 0.6315404772758484,
      "learning_rate": 1.957552979734205e-06,
      "loss": 0.9279,
      "step": 460
    },
    {
      "epoch": 2.0171858216970997,
      "grad_norm": 0.5396577715873718,
      "learning_rate": 1.8609584188988135e-06,
      "loss": 0.9254,
      "step": 470
    },
    {
      "epoch": 2.0601503759398496,
      "grad_norm": 0.49860578775405884,
      "learning_rate": 1.7653743141551983e-06,
      "loss": 0.9307,
      "step": 480
    },
    {
      "epoch": 2.1031149301825995,
      "grad_norm": 0.4839128255844116,
      "learning_rate": 1.6709518036198307e-06,
      "loss": 0.9491,
      "step": 490
    },
    {
      "epoch": 2.146079484425349,
      "grad_norm": 0.4922753572463989,
      "learning_rate": 1.5778401886899808e-06,
      "loss": 0.9209,
      "step": 500
    },
    {
      "epoch": 2.189044038668099,
      "grad_norm": 0.6326885223388672,
      "learning_rate": 1.4861866979675155e-06,
      "loss": 0.9163,
      "step": 510
    },
    {
      "epoch": 2.2320085929108484,
      "grad_norm": 0.5038188099861145,
      "learning_rate": 1.3961362544602215e-06,
      "loss": 0.934,
      "step": 520
    },
    {
      "epoch": 2.2749731471535983,
      "grad_norm": 0.5759682059288025,
      "learning_rate": 1.3078312464287355e-06,
      "loss": 0.9335,
      "step": 530
    },
    {
      "epoch": 2.317937701396348,
      "grad_norm": 0.5074816942214966,
      "learning_rate": 1.2214113022414448e-06,
      "loss": 0.9194,
      "step": 540
    },
    {
      "epoch": 2.3609022556390977,
      "grad_norm": 0.5167147517204285,
      "learning_rate": 1.1370130695933317e-06,
      "loss": 0.9442,
      "step": 550
    },
    {
      "epoch": 2.4038668098818476,
      "grad_norm": 0.4864475131034851,
      "learning_rate": 1.0547699994378787e-06,
      "loss": 0.9248,
      "step": 560
    },
    {
      "epoch": 2.446831364124597,
      "grad_norm": 0.5640864968299866,
      "learning_rate": 9.74812134973689e-07,
      "loss": 0.9063,
      "step": 570
    },
    {
      "epoch": 2.489795918367347,
      "grad_norm": 0.5532556772232056,
      "learning_rate": 8.972659060194505e-07,
      "loss": 0.9045,
      "step": 580
    },
    {
      "epoch": 2.5327604726100965,
      "grad_norm": 0.539094090461731,
      "learning_rate": 8.222539291024079e-07,
      "loss": 0.9115,
      "step": 590
    },
    {
      "epoch": 2.5757250268528464,
      "grad_norm": 0.5470451712608337,
      "learning_rate": 7.49894813576437e-07,
      "loss": 0.9261,
      "step": 600
    },
    {
      "epoch": 2.6186895810955964,
      "grad_norm": 0.6289984583854675,
      "learning_rate": 6.803029740762648e-07,
      "loss": 0.9183,
      "step": 610
    },
    {
      "epoch": 2.661654135338346,
      "grad_norm": 0.6060165166854858,
      "learning_rate": 6.135884496044245e-07,
      "loss": 0.9308,
      "step": 620
    },
    {
      "epoch": 2.7046186895810957,
      "grad_norm": 0.5452494025230408,
      "learning_rate": 5.4985672953697e-07,
      "loss": 0.9702,
      "step": 630
    },
    {
      "epoch": 2.7475832438238452,
      "grad_norm": 0.577720582485199,
      "learning_rate": 4.892085868230881e-07,
      "loss": 0.9176,
      "step": 640
    },
    {
      "epoch": 2.790547798066595,
      "grad_norm": 0.5641165375709534,
      "learning_rate": 4.317399186423574e-07,
      "loss": 0.9452,
      "step": 650
    },
    {
      "epoch": 2.833512352309345,
      "grad_norm": 0.5322738289833069,
      "learning_rate": 3.7754159477158994e-07,
      "loss": 0.9271,
      "step": 660
    },
    {
      "epoch": 2.8764769065520945,
      "grad_norm": 0.6401134133338928,
      "learning_rate": 3.266993139010438e-07,
      "loss": 0.9151,
      "step": 670
    },
    {
      "epoch": 2.919441460794844,
      "grad_norm": 0.653149425983429,
      "learning_rate": 2.792934681271708e-07,
      "loss": 0.8951,
      "step": 680
    },
    {
      "epoch": 2.962406015037594,
      "grad_norm": 0.5724528431892395,
      "learning_rate": 2.3539901583619186e-07,
      "loss": 0.9102,
      "step": 690
    },
    {
      "epoch": 3.004296455424275,
      "grad_norm": 0.5610048174858093,
      "learning_rate": 1.9508536317948358e-07,
      "loss": 0.8989,
      "step": 700
    },
    {
      "epoch": 3.0472610096670247,
      "grad_norm": 0.6965809464454651,
      "learning_rate": 1.584162543281806e-07,
      "loss": 0.9079,
      "step": 710
    },
    {
      "epoch": 3.090225563909774,
      "grad_norm": 0.594599723815918,
      "learning_rate": 1.2544967068054332e-07,
      "loss": 0.9214,
      "step": 720
    },
    {
      "epoch": 3.133190118152524,
      "grad_norm": 0.5906569361686707,
      "learning_rate": 9.623773918144896e-08,
      "loss": 0.9054,
      "step": 730
    },
    {
      "epoch": 3.176154672395274,
      "grad_norm": 0.5923225283622742,
      "learning_rate": 7.082664989897486e-08,
      "loss": 0.9305,
      "step": 740
    },
    {
      "epoch": 3.2191192266380235,
      "grad_norm": 0.5577116012573242,
      "learning_rate": 4.9256582988409795e-08,
      "loss": 0.9231,
      "step": 750
    },
    {
      "epoch": 3.2620837808807734,
      "grad_norm": 0.5889368057250977,
      "learning_rate": 3.15616451591666e-08,
      "loss": 0.9072,
      "step": 760
    },
    {
      "epoch": 3.305048335123523,
      "grad_norm": 0.5589332580566406,
      "learning_rate": 1.7769815745066476e-08,
      "loss": 0.9138,
      "step": 770
    },
    {
      "epoch": 3.348012889366273,
      "grad_norm": 0.646337628364563,
      "learning_rate": 7.90290246326042e-09,
      "loss": 0.9175,
      "step": 780
    },
    {
      "epoch": 3.3909774436090228,
      "grad_norm": 0.6335355043411255,
      "learning_rate": 1.976506931745392e-09,
      "loss": 0.9135,
      "step": 790
    },
    {
      "epoch": 3.4339419978517722,
      "grad_norm": 0.7010214924812317,
      "learning_rate": 0.0,
      "loss": 0.9279,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9266685807744512e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}