{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 10000, "global_step": 735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13646055437100213, "grad_norm": 0.14500385522842407, "learning_rate": 0.00011399999999999999, "loss": 0.1268, "step": 20 }, { "epoch": 0.27292110874200426, "grad_norm": 0.06265528500080109, "learning_rate": 0.000234, "loss": 0.0282, "step": 40 }, { "epoch": 0.4093816631130064, "grad_norm": 0.03309512510895729, "learning_rate": 0.00029987223755234907, "loss": 0.0157, "step": 60 }, { "epoch": 0.5458422174840085, "grad_norm": 0.03388677537441254, "learning_rate": 0.00029867524500941253, "loss": 0.0125, "step": 80 }, { "epoch": 0.6823027718550106, "grad_norm": 0.038252197206020355, "learning_rate": 0.00029622824461983995, "loss": 0.0128, "step": 100 }, { "epoch": 0.8187633262260128, "grad_norm": 0.06330293416976929, "learning_rate": 0.00029255180988050044, "loss": 0.0112, "step": 120 }, { "epoch": 0.9552238805970149, "grad_norm": 0.03660163655877113, "learning_rate": 0.0002876768509289324, "loss": 0.0106, "step": 140 }, { "epoch": 1.0886993603411514, "grad_norm": 0.028105631470680237, "learning_rate": 0.0002816443546620542, "loss": 0.0085, "step": 160 }, { "epoch": 1.2251599147121535, "grad_norm": 0.04313601925969124, "learning_rate": 0.00027450504013311436, "loss": 0.007, "step": 180 }, { "epoch": 1.3616204690831557, "grad_norm": 0.03731823340058327, "learning_rate": 0.00026631893212418224, "loss": 0.0056, "step": 200 }, { "epoch": 1.4980810234541577, "grad_norm": 0.02299409918487072, "learning_rate": 0.00025715485647942525, "loss": 0.0075, "step": 220 }, { "epoch": 1.63454157782516, "grad_norm": 0.03759332001209259, "learning_rate": 0.00024708986144223035, "loss": 0.0063, "step": 240 }, { "epoch": 1.7710021321961622, "grad_norm": 0.013533813878893852, "learning_rate": 0.00023620856986135804, "loss": 0.0052, "step": 260 }, { "epoch": 1.9074626865671642, "grad_norm": 0.0647633746266365, "learning_rate": 0.00022460246771254522, "loss": 0.0045, "step": 280 }, { "epoch": 2.0409381663113004, "grad_norm": 0.022289317101240158, "learning_rate": 0.0002123691349174121, "loss": 0.003, "step": 300 }, { "epoch": 2.177398720682303, "grad_norm": 0.040852464735507965, "learning_rate": 0.00019961142492666903, "loss": 0.0028, "step": 320 }, { "epoch": 2.313859275053305, "grad_norm": 0.04412081092596054, "learning_rate": 0.00018643659996539272, "loss": 0.0029, "step": 340 }, { "epoch": 2.450319829424307, "grad_norm": 0.047590937465429306, "learning_rate": 0.00017295542921091727, "loss": 0.0025, "step": 360 }, { "epoch": 2.5867803837953094, "grad_norm": 0.032162390649318695, "learning_rate": 0.00015928125748553563, "loss": 0.002, "step": 380 }, { "epoch": 2.7232409381663114, "grad_norm": 0.02574550174176693, "learning_rate": 0.00014552905229410626, "loss": 0.0014, "step": 400 }, { "epoch": 2.8597014925373134, "grad_norm": 0.014707539230585098, "learning_rate": 0.000131814437218731, "loss": 0.0012, "step": 420 }, { "epoch": 2.9961620469083154, "grad_norm": 0.04477572441101074, "learning_rate": 0.0001182527197973709, "loss": 0.0011, "step": 440 }, { "epoch": 3.129637526652452, "grad_norm": 0.012851215898990631, "learning_rate": 0.00010495792205964832, "loss": 0.0008, "step": 460 }, { "epoch": 3.266098081023454, "grad_norm": 0.02783357724547386, "learning_rate": 9.204182187073868e-05, "loss": 0.0007, "step": 480 }, { "epoch": 3.402558635394456, "grad_norm": 0.004735818598419428, "learning_rate": 7.961301314338808e-05, "loss": 0.0004, "step": 500 }, { "epoch": 3.539019189765458, "grad_norm": 0.003670594422146678, "learning_rate": 6.777599281945507e-05, "loss": 0.0004, "step": 520 }, { "epoch": 3.6754797441364606, "grad_norm": 0.013839378952980042, "learning_rate": 5.66302822973053e-05, "loss": 0.0004, "step": 540 }, { "epoch": 3.8119402985074626, "grad_norm": 0.00302703189663589, "learning_rate": 4.626959069178253e-05, "loss": 0.0004, "step": 560 }, { "epoch": 3.948400852878465, "grad_norm": 0.026532089337706566, "learning_rate": 3.6781026961763353e-05, "loss": 0.0004, "step": 580 }, { "epoch": 4.081876332622601, "grad_norm": 0.001038851565681398, "learning_rate": 2.8244367529442822e-05, "loss": 0.0002, "step": 600 }, { "epoch": 4.218336886993604, "grad_norm": 0.0008404534310102463, "learning_rate": 2.0731385548944725e-05, "loss": 0.0002, "step": 620 }, { "epoch": 4.354797441364606, "grad_norm": 0.0016892015701159835, "learning_rate": 1.4305247463523778e-05, "loss": 0.0002, "step": 640 }, { "epoch": 4.491257995735608, "grad_norm": 0.002141030738130212, "learning_rate": 9.019981924888797e-06, "loss": 0.0001, "step": 660 }, { "epoch": 4.62771855010661, "grad_norm": 0.0017192725790664554, "learning_rate": 4.920025539782397e-06, "loss": 0.0002, "step": 680 }, { "epoch": 4.764179104477612, "grad_norm": 0.002778939437121153, "learning_rate": 2.0398492630157303e-06, "loss": 0.0002, "step": 700 }, { "epoch": 4.900639658848614, "grad_norm": 0.002974987495690584, "learning_rate": 4.036685781107329e-07, "loss": 0.0001, "step": 720 } ], "logging_steps": 20, "max_steps": 735, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8319121075920896e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }