{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.91111111111111, "eval_steps": 500, "global_step": 896, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2222222222222222, "grad_norm": 3.4277842044830322, "learning_rate": 4.4444444444444447e-05, "loss": 1.3797, "step": 10 }, { "epoch": 0.4444444444444444, "grad_norm": 2.029878854751587, "learning_rate": 8.888888888888889e-05, "loss": 0.424, "step": 20 }, { "epoch": 0.6666666666666666, "grad_norm": 1.6295349597930908, "learning_rate": 0.00013333333333333334, "loss": 0.2302, "step": 30 }, { "epoch": 0.8888888888888888, "grad_norm": 0.916685163974762, "learning_rate": 0.00017777777777777779, "loss": 0.1664, "step": 40 }, { "epoch": 1.1111111111111112, "grad_norm": 0.7659821510314941, "learning_rate": 0.000199982965150241, "loss": 0.1398, "step": 50 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0613561868667603, "learning_rate": 0.00019984672117252423, "loss": 0.1207, "step": 60 }, { "epoch": 1.5555555555555556, "grad_norm": 0.7991588711738586, "learning_rate": 0.00019957441887293156, "loss": 0.1119, "step": 70 }, { "epoch": 1.7777777777777777, "grad_norm": 0.6806069016456604, "learning_rate": 0.00019916642931015662, "loss": 0.0885, "step": 80 }, { "epoch": 2.0, "grad_norm": 0.6726401448249817, "learning_rate": 0.00019862330844011466, "loss": 0.0849, "step": 90 }, { "epoch": 2.2222222222222223, "grad_norm": 0.44822949171066284, "learning_rate": 0.00019794579635835704, "loss": 0.0721, "step": 100 }, { "epoch": 2.4444444444444446, "grad_norm": 0.40653809905052185, "learning_rate": 0.0001971348162915637, "loss": 0.0729, "step": 110 }, { "epoch": 2.6666666666666665, "grad_norm": 0.40992143750190735, "learning_rate": 0.00019619147333948823, "loss": 0.0666, "step": 120 }, { "epoch": 2.888888888888889, "grad_norm": 0.6051583290100098, "learning_rate": 0.00019511705296906945, "loss": 0.0606, "step": 130 }, { "epoch": 3.111111111111111, "grad_norm": 0.46464696526527405, "learning_rate": 0.00019391301926276156, "loss": 0.0631, "step": 140 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5700137615203857, "learning_rate": 0.00019258101292347042, "loss": 0.0594, "step": 150 }, { "epoch": 3.5555555555555554, "grad_norm": 0.5737802386283875, "learning_rate": 0.0001911228490388136, "loss": 0.0609, "step": 160 }, { "epoch": 3.7777777777777777, "grad_norm": 0.4904007315635681, "learning_rate": 0.0001895405146077514, "loss": 0.0539, "step": 170 }, { "epoch": 4.0, "grad_norm": 0.4444526731967926, "learning_rate": 0.00018783616583295943, "loss": 0.0514, "step": 180 }, { "epoch": 4.222222222222222, "grad_norm": 0.3706776201725006, "learning_rate": 0.00018601212518263156, "loss": 0.0495, "step": 190 }, { "epoch": 4.444444444444445, "grad_norm": 0.29509109258651733, "learning_rate": 0.00018407087822571794, "loss": 0.0477, "step": 200 }, { "epoch": 4.666666666666667, "grad_norm": 0.4063428044319153, "learning_rate": 0.00018201507024490988, "loss": 0.0485, "step": 210 }, { "epoch": 4.888888888888889, "grad_norm": 0.4438524544239044, "learning_rate": 0.0001798475026319875, "loss": 0.046, "step": 220 }, { "epoch": 5.111111111111111, "grad_norm": 0.26472207903862, "learning_rate": 0.000177571129070442, "loss": 0.0452, "step": 230 }, { "epoch": 5.333333333333333, "grad_norm": 0.3637102246284485, "learning_rate": 0.0001751890515105738, "loss": 0.0471, "step": 240 }, { "epoch": 5.555555555555555, "grad_norm": 0.2891571819782257, "learning_rate": 0.00017270451594255233, "loss": 
0.0436, "step": 250 }, { "epoch": 5.777777777777778, "grad_norm": 0.380941778421402, "learning_rate": 0.00017012090797319628, "loss": 0.0439, "step": 260 }, { "epoch": 6.0, "grad_norm": 0.573648989200592, "learning_rate": 0.00016744174821250237, "loss": 0.0434, "step": 270 }, { "epoch": 6.222222222222222, "grad_norm": 0.34533384442329407, "learning_rate": 0.0001646706874762089, "loss": 0.0369, "step": 280 }, { "epoch": 6.444444444444445, "grad_norm": 0.5272351503372192, "learning_rate": 0.0001618115018109318, "loss": 0.0403, "step": 290 }, { "epoch": 6.666666666666667, "grad_norm": 0.2849768400192261, "learning_rate": 0.00015886808734865202, "loss": 0.0388, "step": 300 }, { "epoch": 6.888888888888889, "grad_norm": 0.301411896944046, "learning_rate": 0.00015584445499756578, "loss": 0.0405, "step": 310 }, { "epoch": 7.111111111111111, "grad_norm": 0.32372748851776123, "learning_rate": 0.0001527447249765329, "loss": 0.0362, "step": 320 }, { "epoch": 7.333333333333333, "grad_norm": 0.26995155215263367, "learning_rate": 0.00014957312120057005, "loss": 0.0335, "step": 330 }, { "epoch": 7.555555555555555, "grad_norm": 0.35481250286102295, "learning_rate": 0.00014633396552504063, "loss": 0.0424, "step": 340 }, { "epoch": 7.777777777777778, "grad_norm": 0.3813558518886566, "learning_rate": 0.00014303167185638366, "loss": 0.0378, "step": 350 }, { "epoch": 8.0, "grad_norm": 0.3790789842605591, "learning_rate": 0.0001396707401374078, "loss": 0.0403, "step": 360 }, { "epoch": 8.222222222222221, "grad_norm": 0.28233516216278076, "learning_rate": 0.00013625575021534536, "loss": 0.0304, "step": 370 }, { "epoch": 8.444444444444445, "grad_norm": 0.4000020921230316, "learning_rate": 0.00013279135560102337, "loss": 0.033, "step": 380 }, { "epoch": 8.666666666666666, "grad_norm": 0.29253950715065, "learning_rate": 0.00012928227712765504, "loss": 0.033, "step": 390 }, { "epoch": 8.88888888888889, "grad_norm": 0.33955538272857666, "learning_rate": 0.00012573329651789297, "loss": 0.0338, "step": 400 }, { "epoch": 9.11111111111111, "grad_norm": 0.3720948398113251, "learning_rate": 0.00012214924986791003, "loss": 0.0353, "step": 410 }, { "epoch": 9.333333333333334, "grad_norm": 0.2159433513879776, "learning_rate": 0.00011853502105738692, "loss": 0.0328, "step": 420 }, { "epoch": 9.555555555555555, "grad_norm": 0.20489104092121124, "learning_rate": 0.00011489553509438657, "loss": 0.0351, "step": 430 }, { "epoch": 9.777777777777779, "grad_norm": 0.19427303969860077, "learning_rate": 0.00011123575140418414, "loss": 0.0302, "step": 440 }, { "epoch": 10.0, "grad_norm": 0.2984829545021057, "learning_rate": 0.00010756065707119729, "loss": 0.0285, "step": 450 }, { "epoch": 10.222222222222221, "grad_norm": 0.19880931079387665, "learning_rate": 0.0001038752600432265, "loss": 0.0283, "step": 460 }, { "epoch": 10.444444444444445, "grad_norm": 0.30339422821998596, "learning_rate": 0.00010018458230726523, "loss": 0.0268, "step": 470 }, { "epoch": 10.666666666666666, "grad_norm": 0.20127274096012115, "learning_rate": 9.649365304617952e-05, "loss": 0.031, "step": 480 }, { "epoch": 10.88888888888889, "grad_norm": 0.2175353467464447, "learning_rate": 9.280750178558138e-05, "loss": 0.0259, "step": 490 }, { "epoch": 11.11111111111111, "grad_norm": 0.26974818110466003, "learning_rate": 8.913115154023605e-05, "loss": 0.0306, "step": 500 }, { "epoch": 11.333333333333334, "grad_norm": 0.4030226767063141, "learning_rate": 8.546961196934043e-05, "loss": 0.0248, "step": 510 }, { "epoch": 11.555555555555555, "grad_norm": 0.23470337688922882, 
"learning_rate": 8.182787255000155e-05, "loss": 0.0262, "step": 520 }, { "epoch": 11.777777777777779, "grad_norm": 0.40099358558654785, "learning_rate": 7.82108957782161e-05, "loss": 0.0259, "step": 530 }, { "epoch": 12.0, "grad_norm": 0.17941424250602722, "learning_rate": 7.462361040661667e-05, "loss": 0.0299, "step": 540 }, { "epoch": 12.222222222222221, "grad_norm": 0.24401065707206726, "learning_rate": 7.107090472819896e-05, "loss": 0.0231, "step": 550 }, { "epoch": 12.444444444444445, "grad_norm": 0.26819756627082825, "learning_rate": 6.755761991518219e-05, "loss": 0.0262, "step": 560 }, { "epoch": 12.666666666666666, "grad_norm": 0.300659716129303, "learning_rate": 6.408854342207982e-05, "loss": 0.0289, "step": 570 }, { "epoch": 12.88888888888889, "grad_norm": 0.2212854027748108, "learning_rate": 6.0668402461969807e-05, "loss": 0.0243, "step": 580 }, { "epoch": 13.11111111111111, "grad_norm": 0.28721633553504944, "learning_rate": 5.730185756485395e-05, "loss": 0.0256, "step": 590 }, { "epoch": 13.333333333333334, "grad_norm": 0.2350863218307495, "learning_rate": 5.399349622688479e-05, "loss": 0.0261, "step": 600 }, { "epoch": 13.555555555555555, "grad_norm": 0.31170952320098877, "learning_rate": 5.074782665911341e-05, "loss": 0.0247, "step": 610 }, { "epoch": 13.777777777777779, "grad_norm": 0.18656305968761444, "learning_rate": 4.756927164427685e-05, "loss": 0.0233, "step": 620 }, { "epoch": 14.0, "grad_norm": 0.2578571140766144, "learning_rate": 4.446216250999641e-05, "loss": 0.0257, "step": 630 }, { "epoch": 14.222222222222221, "grad_norm": 0.16936838626861572, "learning_rate": 4.1430733226599114e-05, "loss": 0.0212, "step": 640 }, { "epoch": 14.444444444444445, "grad_norm": 0.15445557236671448, "learning_rate": 3.8479114637605285e-05, "loss": 0.0246, "step": 650 }, { "epoch": 14.666666666666666, "grad_norm": 0.1869278848171234, "learning_rate": 3.561132883074427e-05, "loss": 0.0232, "step": 660 }, { "epoch": 14.88888888888889, "grad_norm": 0.20103834569454193, "learning_rate": 3.2831283657168275e-05, "loss": 0.0226, "step": 670 }, { "epoch": 15.11111111111111, "grad_norm": 0.2504471242427826, "learning_rate": 3.0142767406333518e-05, "loss": 0.0184, "step": 680 }, { "epoch": 15.333333333333334, "grad_norm": 0.18746283650398254, "learning_rate": 2.7549443643804585e-05, "loss": 0.019, "step": 690 }, { "epoch": 15.555555555555555, "grad_norm": 0.25808605551719666, "learning_rate": 2.505484621901655e-05, "loss": 0.0181, "step": 700 }, { "epoch": 15.777777777777779, "grad_norm": 0.2831310033798218, "learning_rate": 2.2662374449797664e-05, "loss": 0.022, "step": 710 }, { "epoch": 16.0, "grad_norm": 0.18062320351600647, "learning_rate": 2.0375288490214404e-05, "loss": 0.0205, "step": 720 }, { "epoch": 16.22222222222222, "grad_norm": 0.15133315324783325, "learning_rate": 1.819670488805111e-05, "loss": 0.0198, "step": 730 }, { "epoch": 16.444444444444443, "grad_norm": 0.20342513918876648, "learning_rate": 1.6129592337977995e-05, "loss": 0.02, "step": 740 }, { "epoch": 16.666666666666668, "grad_norm": 0.21008381247520447, "learning_rate": 1.4176767636194122e-05, "loss": 0.0232, "step": 750 }, { "epoch": 16.88888888888889, "grad_norm": 0.25737613439559937, "learning_rate": 1.234089184205851e-05, "loss": 0.0227, "step": 760 }, { "epoch": 17.11111111111111, "grad_norm": 0.15780089795589447, "learning_rate": 1.0624466651939247e-05, "loss": 0.0203, "step": 770 }, { "epoch": 17.333333333333332, "grad_norm": 0.16062302887439728, "learning_rate": 9.029830990222132e-06, "loss": 0.0244, "step": 780 
}, { "epoch": 17.555555555555557, "grad_norm": 0.18100588023662567, "learning_rate": 7.55915782212413e-06, "loss": 0.0173, "step": 790 }, { "epoch": 17.77777777777778, "grad_norm": 0.20718730986118317, "learning_rate": 6.214451192654747e-06, "loss": 0.0186, "step": 800 }, { "epoch": 18.0, "grad_norm": 0.18695631623268127, "learning_rate": 4.9975434957601264e-06, "loss": 0.0177, "step": 810 }, { "epoch": 18.22222222222222, "grad_norm": 0.1640794426202774, "learning_rate": 3.910092977371394e-06, "loss": 0.0169, "step": 820 }, { "epoch": 18.444444444444443, "grad_norm": 0.18873189389705658, "learning_rate": 2.953581475759404e-06, "loss": 0.0214, "step": 830 }, { "epoch": 18.666666666666668, "grad_norm": 0.1896572709083557, "learning_rate": 2.1293124022754407e-06, "loss": 0.0173, "step": 840 }, { "epoch": 18.88888888888889, "grad_norm": 0.09888505190610886, "learning_rate": 1.4384089652291543e-06, "loss": 0.0185, "step": 850 }, { "epoch": 19.11111111111111, "grad_norm": 0.17348803579807281, "learning_rate": 8.818126393241643e-07, "loss": 0.0182, "step": 860 }, { "epoch": 19.333333333333332, "grad_norm": 0.20844842493534088, "learning_rate": 4.602818827369126e-07, "loss": 0.0217, "step": 870 }, { "epoch": 19.555555555555557, "grad_norm": 0.16763387620449066, "learning_rate": 1.7439110358704602e-07, "loss": 0.0175, "step": 880 }, { "epoch": 19.77777777777778, "grad_norm": 0.13927114009857178, "learning_rate": 2.4529877207557505e-08, "loss": 0.0221, "step": 890 }, { "epoch": 19.91111111111111, "step": 896, "total_flos": 6.431855929189344e+16, "train_loss": 0.060006462794262916, "train_runtime": 634.4726, "train_samples_per_second": 90.381, "train_steps_per_second": 1.412 } ], "logging_steps": 10, "max_steps": 896, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.431855929189344e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }