{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.4339419978517722, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04296455424274973, "grad_norm": 1.6397182941436768, "learning_rate": 5e-06, "loss": 3.4918, "step": 10 }, { "epoch": 0.08592910848549946, "grad_norm": 1.1852494478225708, "learning_rate": 4.998023493068255e-06, "loss": 3.1439, "step": 20 }, { "epoch": 0.1288936627282492, "grad_norm": 0.813017725944519, "learning_rate": 4.99209709753674e-06, "loss": 2.7821, "step": 30 }, { "epoch": 0.17185821697099893, "grad_norm": 0.8324390053749084, "learning_rate": 4.982230184254934e-06, "loss": 2.5004, "step": 40 }, { "epoch": 0.21482277121374865, "grad_norm": 0.7486415505409241, "learning_rate": 4.968438354840834e-06, "loss": 2.2932, "step": 50 }, { "epoch": 0.2577873254564984, "grad_norm": 0.8176449537277222, "learning_rate": 4.950743417011591e-06, "loss": 2.0388, "step": 60 }, { "epoch": 0.3007518796992481, "grad_norm": 1.1593029499053955, "learning_rate": 4.929173350101025e-06, "loss": 1.798, "step": 70 }, { "epoch": 0.34371643394199786, "grad_norm": 0.8432386517524719, "learning_rate": 4.903762260818552e-06, "loss": 1.5069, "step": 80 }, { "epoch": 0.3866809881847476, "grad_norm": 0.5978631973266602, "learning_rate": 4.874550329319457e-06, "loss": 1.2985, "step": 90 }, { "epoch": 0.4296455424274973, "grad_norm": 0.43345287442207336, "learning_rate": 4.84158374567182e-06, "loss": 1.1899, "step": 100 }, { "epoch": 0.47261009667024706, "grad_norm": 0.41255077719688416, "learning_rate": 4.804914636820517e-06, "loss": 1.207, "step": 110 }, { "epoch": 0.5155746509129968, "grad_norm": 0.3531062602996826, "learning_rate": 4.764600984163809e-06, "loss": 1.097, "step": 120 }, { "epoch": 0.5585392051557465, "grad_norm": 0.3125358819961548, "learning_rate": 4.72070653187283e-06, "loss": 1.0538, "step": 130 }, { "epoch": 0.6015037593984962, "grad_norm": 0.3715002238750458, "learning_rate": 4.673300686098957e-06, "loss": 1.0238, "step": 140 }, { "epoch": 0.644468313641246, "grad_norm": 0.3236304819583893, "learning_rate": 4.622458405228411e-06, "loss": 1.0329, "step": 150 }, { "epoch": 0.6874328678839957, "grad_norm": 0.389879435300827, "learning_rate": 4.568260081357644e-06, "loss": 1.0452, "step": 160 }, { "epoch": 0.7303974221267454, "grad_norm": 0.3259812593460083, "learning_rate": 4.510791413176912e-06, "loss": 1.0407, "step": 170 }, { "epoch": 0.7733619763694952, "grad_norm": 0.3405851125717163, "learning_rate": 4.450143270463031e-06, "loss": 1.0219, "step": 180 }, { "epoch": 0.8163265306122449, "grad_norm": 0.35685139894485474, "learning_rate": 4.386411550395576e-06, "loss": 0.9988, "step": 190 }, { "epoch": 0.8592910848549946, "grad_norm": 0.3287046253681183, "learning_rate": 4.319697025923736e-06, "loss": 1.0164, "step": 200 }, { "epoch": 0.9022556390977443, "grad_norm": 0.33687785267829895, "learning_rate": 4.250105186423564e-06, "loss": 0.9938, "step": 210 }, { "epoch": 0.9452201933404941, "grad_norm": 0.40256965160369873, "learning_rate": 4.177746070897593e-06, "loss": 1.001, "step": 220 }, { "epoch": 0.9881847475832438, "grad_norm": 0.48085835576057434, "learning_rate": 4.10273409398055e-06, "loss": 1.0145, "step": 230 }, { "epoch": 1.0300751879699248, "grad_norm": 0.4048563241958618, "learning_rate": 4.025187865026311e-06, "loss": 0.9861, "step": 240 }, { "epoch": 1.0730397422126745, "grad_norm": 0.34213146567344666, "learning_rate": 3.945230000562121e-06, 
"loss": 0.9843, "step": 250 }, { "epoch": 1.1160042964554242, "grad_norm": 0.3930899500846863, "learning_rate": 3.862986930406669e-06, "loss": 0.9539, "step": 260 }, { "epoch": 1.158968850698174, "grad_norm": 0.41636350750923157, "learning_rate": 3.7785886977585562e-06, "loss": 0.9629, "step": 270 }, { "epoch": 1.2019334049409238, "grad_norm": 0.3695228397846222, "learning_rate": 3.6921687535712657e-06, "loss": 0.9778, "step": 280 }, { "epoch": 1.2448979591836735, "grad_norm": 0.3623664677143097, "learning_rate": 3.6038637455397802e-06, "loss": 0.9613, "step": 290 }, { "epoch": 1.2878625134264232, "grad_norm": 0.4789970815181732, "learning_rate": 3.513813302032485e-06, "loss": 0.9671, "step": 300 }, { "epoch": 1.330827067669173, "grad_norm": 0.4214249849319458, "learning_rate": 3.4221598113100196e-06, "loss": 0.9597, "step": 310 }, { "epoch": 1.3737916219119226, "grad_norm": 0.4314541518688202, "learning_rate": 3.32904819638017e-06, "loss": 0.9872, "step": 320 }, { "epoch": 1.4167561761546725, "grad_norm": 0.45763522386550903, "learning_rate": 3.234625685844803e-06, "loss": 1.006, "step": 330 }, { "epoch": 1.459720730397422, "grad_norm": 0.41263076663017273, "learning_rate": 3.139041581101187e-06, "loss": 0.973, "step": 340 }, { "epoch": 1.502685284640172, "grad_norm": 0.5277674198150635, "learning_rate": 3.0424470202657953e-06, "loss": 0.9525, "step": 350 }, { "epoch": 1.5456498388829216, "grad_norm": 0.5219724178314209, "learning_rate": 2.9449947391938768e-06, "loss": 0.9516, "step": 360 }, { "epoch": 1.5886143931256713, "grad_norm": 0.47409552335739136, "learning_rate": 2.8468388299726714e-06, "loss": 0.9599, "step": 370 }, { "epoch": 1.631578947368421, "grad_norm": 0.4466871917247772, "learning_rate": 2.7481344972701545e-06, "loss": 0.952, "step": 380 }, { "epoch": 1.6745435016111707, "grad_norm": 0.4826906621456146, "learning_rate": 2.64903781292455e-06, "loss": 0.9492, "step": 390 }, { "epoch": 1.7175080558539206, "grad_norm": 0.4338424801826477, "learning_rate": 2.5497054691626754e-06, "loss": 0.9584, "step": 400 }, { "epoch": 1.76047261009667, "grad_norm": 0.6052074432373047, "learning_rate": 2.4502945308373246e-06, "loss": 0.9446, "step": 410 }, { "epoch": 1.80343716433942, "grad_norm": 0.5061154961585999, "learning_rate": 2.3509621870754505e-06, "loss": 0.9369, "step": 420 }, { "epoch": 1.8464017185821697, "grad_norm": 0.47927358746528625, "learning_rate": 2.2518655027298468e-06, "loss": 1.0038, "step": 430 }, { "epoch": 1.8893662728249194, "grad_norm": 0.4831957221031189, "learning_rate": 2.15316117002733e-06, "loss": 0.9501, "step": 440 }, { "epoch": 1.9323308270676691, "grad_norm": 0.4502115249633789, "learning_rate": 2.055005260806125e-06, "loss": 0.9333, "step": 450 }, { "epoch": 1.9752953813104188, "grad_norm": 0.6315404772758484, "learning_rate": 1.957552979734205e-06, "loss": 0.9279, "step": 460 }, { "epoch": 2.0171858216970997, "grad_norm": 0.5396577715873718, "learning_rate": 1.8609584188988135e-06, "loss": 0.9254, "step": 470 }, { "epoch": 2.0601503759398496, "grad_norm": 0.49860578775405884, "learning_rate": 1.7653743141551983e-06, "loss": 0.9307, "step": 480 }, { "epoch": 2.1031149301825995, "grad_norm": 0.4839128255844116, "learning_rate": 1.6709518036198307e-06, "loss": 0.9491, "step": 490 }, { "epoch": 2.146079484425349, "grad_norm": 0.4922753572463989, "learning_rate": 1.5778401886899808e-06, "loss": 0.9209, "step": 500 }, { "epoch": 2.189044038668099, "grad_norm": 0.6326885223388672, "learning_rate": 1.4861866979675155e-06, "loss": 0.9163, "step": 510 }, { 
"epoch": 2.2320085929108484, "grad_norm": 0.5038188099861145, "learning_rate": 1.3961362544602215e-06, "loss": 0.934, "step": 520 }, { "epoch": 2.2749731471535983, "grad_norm": 0.5759682059288025, "learning_rate": 1.3078312464287355e-06, "loss": 0.9335, "step": 530 }, { "epoch": 2.317937701396348, "grad_norm": 0.5074816942214966, "learning_rate": 1.2214113022414448e-06, "loss": 0.9194, "step": 540 }, { "epoch": 2.3609022556390977, "grad_norm": 0.5167147517204285, "learning_rate": 1.1370130695933317e-06, "loss": 0.9442, "step": 550 }, { "epoch": 2.4038668098818476, "grad_norm": 0.4864475131034851, "learning_rate": 1.0547699994378787e-06, "loss": 0.9248, "step": 560 }, { "epoch": 2.446831364124597, "grad_norm": 0.5640864968299866, "learning_rate": 9.74812134973689e-07, "loss": 0.9063, "step": 570 }, { "epoch": 2.489795918367347, "grad_norm": 0.5532556772232056, "learning_rate": 8.972659060194505e-07, "loss": 0.9045, "step": 580 }, { "epoch": 2.5327604726100965, "grad_norm": 0.539094090461731, "learning_rate": 8.222539291024079e-07, "loss": 0.9115, "step": 590 }, { "epoch": 2.5757250268528464, "grad_norm": 0.5470451712608337, "learning_rate": 7.49894813576437e-07, "loss": 0.9261, "step": 600 }, { "epoch": 2.6186895810955964, "grad_norm": 0.6289984583854675, "learning_rate": 6.803029740762648e-07, "loss": 0.9183, "step": 610 }, { "epoch": 2.661654135338346, "grad_norm": 0.6060165166854858, "learning_rate": 6.135884496044245e-07, "loss": 0.9308, "step": 620 }, { "epoch": 2.7046186895810957, "grad_norm": 0.5452494025230408, "learning_rate": 5.4985672953697e-07, "loss": 0.9702, "step": 630 }, { "epoch": 2.7475832438238452, "grad_norm": 0.577720582485199, "learning_rate": 4.892085868230881e-07, "loss": 0.9176, "step": 640 }, { "epoch": 2.790547798066595, "grad_norm": 0.5641165375709534, "learning_rate": 4.317399186423574e-07, "loss": 0.9452, "step": 650 }, { "epoch": 2.833512352309345, "grad_norm": 0.5322738289833069, "learning_rate": 3.7754159477158994e-07, "loss": 0.9271, "step": 660 }, { "epoch": 2.8764769065520945, "grad_norm": 0.6401134133338928, "learning_rate": 3.266993139010438e-07, "loss": 0.9151, "step": 670 }, { "epoch": 2.919441460794844, "grad_norm": 0.653149425983429, "learning_rate": 2.792934681271708e-07, "loss": 0.8951, "step": 680 }, { "epoch": 2.962406015037594, "grad_norm": 0.5724528431892395, "learning_rate": 2.3539901583619186e-07, "loss": 0.9102, "step": 690 }, { "epoch": 3.004296455424275, "grad_norm": 0.5610048174858093, "learning_rate": 1.9508536317948358e-07, "loss": 0.8989, "step": 700 }, { "epoch": 3.0472610096670247, "grad_norm": 0.6965809464454651, "learning_rate": 1.584162543281806e-07, "loss": 0.9079, "step": 710 }, { "epoch": 3.090225563909774, "grad_norm": 0.594599723815918, "learning_rate": 1.2544967068054332e-07, "loss": 0.9214, "step": 720 }, { "epoch": 3.133190118152524, "grad_norm": 0.5906569361686707, "learning_rate": 9.623773918144896e-08, "loss": 0.9054, "step": 730 }, { "epoch": 3.176154672395274, "grad_norm": 0.5923225283622742, "learning_rate": 7.082664989897486e-08, "loss": 0.9305, "step": 740 }, { "epoch": 3.2191192266380235, "grad_norm": 0.5577116012573242, "learning_rate": 4.9256582988409795e-08, "loss": 0.9231, "step": 750 }, { "epoch": 3.2620837808807734, "grad_norm": 0.5889368057250977, "learning_rate": 3.15616451591666e-08, "loss": 0.9072, "step": 760 }, { "epoch": 3.305048335123523, "grad_norm": 0.5589332580566406, "learning_rate": 1.7769815745066476e-08, "loss": 0.9138, "step": 770 }, { "epoch": 3.348012889366273, "grad_norm": 
0.646337628364563, "learning_rate": 7.90290246326042e-09, "loss": 0.9175, "step": 780 }, { "epoch": 3.3909774436090228, "grad_norm": 0.6335355043411255, "learning_rate": 1.976506931745392e-09, "loss": 0.9135, "step": 790 }, { "epoch": 3.4339419978517722, "grad_norm": 0.7010214924812317, "learning_rate": 0.0, "loss": 0.9279, "step": 800 } ], "logging_steps": 10, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9266685807744512e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }