{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17805475183618963, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035610950367237926, "grad_norm": 4.179213523864746, "learning_rate": 1e-05, "loss": 4.8864, "step": 1 }, { "epoch": 0.0035610950367237926, "eval_loss": 5.30247163772583, "eval_runtime": 126.7922, "eval_samples_per_second": 3.731, "eval_steps_per_second": 1.869, "step": 1 }, { "epoch": 0.007122190073447585, "grad_norm": 3.5513782501220703, "learning_rate": 2e-05, "loss": 5.1702, "step": 2 }, { "epoch": 0.010683285110171378, "grad_norm": 4.266094207763672, "learning_rate": 3e-05, "loss": 5.3837, "step": 3 }, { "epoch": 0.01424438014689517, "grad_norm": 4.963681697845459, "learning_rate": 4e-05, "loss": 4.7062, "step": 4 }, { "epoch": 0.01780547518361896, "grad_norm": 5.114255905151367, "learning_rate": 5e-05, "loss": 5.045, "step": 5 }, { "epoch": 0.021366570220342756, "grad_norm": 6.496891498565674, "learning_rate": 6e-05, "loss": 5.0374, "step": 6 }, { "epoch": 0.024927665257066547, "grad_norm": 8.91849422454834, "learning_rate": 7e-05, "loss": 4.6738, "step": 7 }, { "epoch": 0.02848876029379034, "grad_norm": 4.901350498199463, "learning_rate": 8e-05, "loss": 4.7986, "step": 8 }, { "epoch": 0.03204985533051413, "grad_norm": 3.618027925491333, "learning_rate": 9e-05, "loss": 3.4925, "step": 9 }, { "epoch": 0.03561095036723792, "grad_norm": 3.863649368286133, "learning_rate": 0.0001, "loss": 4.3912, "step": 10 }, { "epoch": 0.03917204540396172, "grad_norm": 4.39957857131958, "learning_rate": 9.98458666866564e-05, "loss": 3.5932, "step": 11 }, { "epoch": 0.04273314044068551, "grad_norm": 5.253383636474609, "learning_rate": 9.938441702975689e-05, "loss": 3.0919, "step": 12 }, { "epoch": 0.0462942354774093, "grad_norm": 2.535303831100464, "learning_rate": 9.861849601988383e-05, "loss": 2.5898, "step": 13 }, { "epoch": 0.0462942354774093, "eval_loss": 2.55889630317688, "eval_runtime": 110.9541, "eval_samples_per_second": 4.263, "eval_steps_per_second": 2.136, "step": 13 }, { "epoch": 0.049855330514133094, "grad_norm": 2.924900770187378, "learning_rate": 9.755282581475769e-05, "loss": 2.3642, "step": 14 }, { "epoch": 0.05341642555085689, "grad_norm": 1.9190605878829956, "learning_rate": 9.619397662556435e-05, "loss": 2.245, "step": 15 }, { "epoch": 0.05697752058758068, "grad_norm": 4.164839744567871, "learning_rate": 9.45503262094184e-05, "loss": 2.4275, "step": 16 }, { "epoch": 0.06053861562430447, "grad_norm": 3.0086185932159424, "learning_rate": 9.263200821770461e-05, "loss": 2.5735, "step": 17 }, { "epoch": 0.06409971066102826, "grad_norm": 2.273061513900757, "learning_rate": 9.045084971874738e-05, "loss": 2.2379, "step": 18 }, { "epoch": 0.06766080569775205, "grad_norm": 1.9004080295562744, "learning_rate": 8.802029828000156e-05, "loss": 2.3375, "step": 19 }, { "epoch": 0.07122190073447585, "grad_norm": 2.002903461456299, "learning_rate": 8.535533905932738e-05, "loss": 2.2203, "step": 20 }, { "epoch": 0.07478299577119965, "grad_norm": 1.6309473514556885, "learning_rate": 8.247240241650918e-05, "loss": 2.1511, "step": 21 }, { "epoch": 0.07834409080792344, "grad_norm": 1.8154593706130981, "learning_rate": 7.938926261462366e-05, "loss": 2.0112, "step": 22 }, { "epoch": 0.08190518584464723, "grad_norm": 1.8795080184936523, "learning_rate": 7.612492823579745e-05, "loss": 1.8034, "step": 23 }, { "epoch": 0.08546628088137102, "grad_norm": 1.9428141117095947, "learning_rate": 7.269952498697734e-05, "loss": 1.9395, "step": 24 }, { "epoch": 0.08902737591809481, "grad_norm": 1.6623902320861816, "learning_rate": 6.91341716182545e-05, "loss": 2.1667, "step": 25 }, { "epoch": 0.0925884709548186, "grad_norm": 1.343985915184021, "learning_rate": 6.545084971874738e-05, "loss": 2.2539, "step": 26 }, { "epoch": 0.0925884709548186, "eval_loss": 2.0758728981018066, "eval_runtime": 110.911, "eval_samples_per_second": 4.265, "eval_steps_per_second": 2.137, "step": 26 }, { "epoch": 0.0961495659915424, "grad_norm": 1.4864414930343628, "learning_rate": 6.167226819279528e-05, "loss": 2.1355, "step": 27 }, { "epoch": 0.09971066102826619, "grad_norm": 1.316426396369934, "learning_rate": 5.782172325201155e-05, "loss": 1.9895, "step": 28 }, { "epoch": 0.10327175606498998, "grad_norm": 1.437731146812439, "learning_rate": 5.392295478639225e-05, "loss": 2.0277, "step": 29 }, { "epoch": 0.10683285110171378, "grad_norm": 1.660027027130127, "learning_rate": 5e-05, "loss": 2.228, "step": 30 }, { "epoch": 0.11039394613843757, "grad_norm": 1.3694123029708862, "learning_rate": 4.607704521360776e-05, "loss": 1.9222, "step": 31 }, { "epoch": 0.11395504117516136, "grad_norm": 1.639948844909668, "learning_rate": 4.2178276747988446e-05, "loss": 2.158, "step": 32 }, { "epoch": 0.11751613621188516, "grad_norm": 2.648879051208496, "learning_rate": 3.832773180720475e-05, "loss": 2.0388, "step": 33 }, { "epoch": 0.12107723124860895, "grad_norm": 1.5599783658981323, "learning_rate": 3.4549150281252636e-05, "loss": 2.4029, "step": 34 }, { "epoch": 0.12463832628533274, "grad_norm": 1.3389352560043335, "learning_rate": 3.086582838174551e-05, "loss": 1.7365, "step": 35 }, { "epoch": 0.12819942132205653, "grad_norm": 1.6567745208740234, "learning_rate": 2.7300475013022663e-05, "loss": 2.1814, "step": 36 }, { "epoch": 0.13176051635878033, "grad_norm": 1.775384783744812, "learning_rate": 2.3875071764202563e-05, "loss": 2.3459, "step": 37 }, { "epoch": 0.1353216113955041, "grad_norm": 1.3190244436264038, "learning_rate": 2.061073738537635e-05, "loss": 1.7901, "step": 38 }, { "epoch": 0.13888270643222791, "grad_norm": 1.489214539527893, "learning_rate": 1.7527597583490822e-05, "loss": 1.8801, "step": 39 }, { "epoch": 0.13888270643222791, "eval_loss": 2.036982297897339, "eval_runtime": 110.9189, "eval_samples_per_second": 4.264, "eval_steps_per_second": 2.137, "step": 39 }, { "epoch": 0.1424438014689517, "grad_norm": 1.3451905250549316, "learning_rate": 1.4644660940672627e-05, "loss": 1.8144, "step": 40 }, { "epoch": 0.1460048965056755, "grad_norm": 1.5614116191864014, "learning_rate": 1.1979701719998453e-05, "loss": 1.8939, "step": 41 }, { "epoch": 0.1495659915423993, "grad_norm": 1.434024453163147, "learning_rate": 9.549150281252633e-06, "loss": 2.1714, "step": 42 }, { "epoch": 0.15312708657912308, "grad_norm": 1.9399369955062866, "learning_rate": 7.367991782295391e-06, "loss": 2.3259, "step": 43 }, { "epoch": 0.15668818161584688, "grad_norm": 1.6020128726959229, "learning_rate": 5.449673790581611e-06, "loss": 2.3047, "step": 44 }, { "epoch": 0.16024927665257066, "grad_norm": 1.3612734079360962, "learning_rate": 3.8060233744356633e-06, "loss": 2.0302, "step": 45 }, { "epoch": 0.16381037168929447, "grad_norm": 1.40798819065094, "learning_rate": 2.4471741852423237e-06, "loss": 1.9763, "step": 46 }, { "epoch": 0.16737146672601824, "grad_norm": 1.7111047506332397, "learning_rate": 1.3815039801161721e-06, "loss": 2.0167, "step": 47 }, { "epoch": 0.17093256176274205, "grad_norm": 1.519776701927185, "learning_rate": 6.15582970243117e-07, "loss": 2.0705, "step": 48 }, { "epoch": 0.17449365679946582, "grad_norm": 1.4649944305419922, "learning_rate": 1.5413331334360182e-07, "loss": 1.8096, "step": 49 }, { "epoch": 0.17805475183618963, "grad_norm": 1.449601173400879, "learning_rate": 0.0, "loss": 1.6536, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.08957047128064e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }