{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9826771653543307, "eval_steps": 100, "global_step": 158, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 652.5913221359253, "epoch": 0.12598425196850394, "grad_norm": 0.5412344932556152, "kl": 0.00025856494903564453, "learning_rate": 1.875e-06, "loss": 0.0, "reward": 0.5777902046218515, "reward_std": 0.32899713758379223, "rewards/accuracy_reward": 0.5776785971596837, "rewards/format_reward": 0.00011160714784637093, "step": 10 }, { "completion_length": 694.438868522644, "epoch": 0.25196850393700787, "grad_norm": 0.24628566205501556, "kl": 0.0019156813621520996, "learning_rate": 2.994130233112417e-06, "loss": 0.0001, "reward": 0.6052455639466643, "reward_std": 0.26475782548077403, "rewards/accuracy_reward": 0.6052455639466643, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 651.5066148757935, "epoch": 0.3779527559055118, "grad_norm": 0.14223581552505493, "kl": 0.0024756908416748045, "learning_rate": 2.9286218000371976e-06, "loss": 0.0001, "reward": 0.6724330654367805, "reward_std": 0.23531078966334462, "rewards/accuracy_reward": 0.6724330654367805, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 642.1838449478149, "epoch": 0.5039370078740157, "grad_norm": 0.1239105761051178, "kl": 0.0031515121459960937, "learning_rate": 2.7934718587800417e-06, "loss": 0.0001, "reward": 0.7046875322237611, "reward_std": 0.19434297760017216, "rewards/accuracy_reward": 0.7046875322237611, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 627.14924659729, "epoch": 0.6299212598425197, "grad_norm": 0.13240313529968262, "kl": 0.003639984130859375, "learning_rate": 2.595268609058752e-06, "loss": 0.0001, "reward": 0.7179687809199095, "reward_std": 0.19313886840827763, "rewards/accuracy_reward": 0.7179687809199095, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 626.9296024322509, "epoch": 0.7559055118110236, "grad_norm": 0.15062075853347778, "kl": 0.004168796539306641, "learning_rate": 2.343673931461171e-06, "loss": 0.0002, "reward": 0.6809152102097868, "reward_std": 0.1983337783254683, "rewards/accuracy_reward": 0.6809152102097868, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 610.840876197815, "epoch": 0.8818897637795275, "grad_norm": 0.11126791685819626, "kl": 0.004203128814697266, "learning_rate": 2.0509523964971355e-06, "loss": 0.0002, "reward": 0.7165178887546062, "reward_std": 0.1934912689961493, "rewards/accuracy_reward": 0.7165178887546062, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 592.2336000569661, "epoch": 1.0, "grad_norm": 0.17808477580547333, "kl": 0.0042411295572916665, "learning_rate": 1.7313733994479534e-06, "loss": 0.0002, "reward": 0.7291666994492213, "reward_std": 0.1846819964547952, "rewards/accuracy_reward": 0.7291666994492213, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 609.6807176589966, "epoch": 1.125984251968504, "grad_norm": 0.08229045569896698, "kl": 0.004135942459106446, "learning_rate": 1.4005155653473445e-06, "loss": 0.0002, "reward": 0.7156250355765224, "reward_std": 0.20761510250158607, "rewards/accuracy_reward": 0.7156250355765224, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 611.2092897415162, "epoch": 1.2519685039370079, "grad_norm": 0.16556662321090698, "kl": 0.0037270545959472655, "learning_rate": 1.0745073324985549e-06, "loss": 0.0001, "reward": 0.7110491398721933, "reward_std": 0.18295098417438566, "rewards/accuracy_reward": 0.7110491398721933, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 606.3881959915161, "epoch": 1.3779527559055118, "grad_norm": 0.09432197362184525, "kl": 0.0037145614624023438, "learning_rate": 7.692407340588056e-07, "loss": 0.0001, "reward": 0.7323661027476192, "reward_std": 0.1929833421483636, "rewards/accuracy_reward": 0.7323661027476192, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 604.5668788909912, "epoch": 1.5039370078740157, "grad_norm": 0.38694441318511963, "kl": 0.004090404510498047, "learning_rate": 4.995967037450238e-07, "loss": 0.0002, "reward": 0.7164062798023224, "reward_std": 0.18084403886459768, "rewards/accuracy_reward": 0.7164062798023224, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 606.2777070999146, "epoch": 1.6299212598425197, "grad_norm": 0.15648125112056732, "kl": 0.0037802696228027345, "learning_rate": 2.787196699446389e-07, "loss": 0.0002, "reward": 0.7242187837138772, "reward_std": 0.19052648572251202, "rewards/accuracy_reward": 0.7242187837138772, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 605.7184408187866, "epoch": 1.7559055118110236, "grad_norm": 0.4628942608833313, "kl": 0.003756284713745117, "learning_rate": 1.1737679983668259e-07, "loss": 0.0002, "reward": 0.7152902094647289, "reward_std": 0.20197481904178857, "rewards/accuracy_reward": 0.7152902094647289, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 605.061745262146, "epoch": 1.8818897637795275, "grad_norm": 0.1207461878657341, "kl": 0.007715559005737305, "learning_rate": 2.343312866591163e-08, "loss": 0.0003, "reward": 0.7013393187895417, "reward_std": 0.1918664438650012, "rewards/accuracy_reward": 0.7013393187895417, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 607.0647583007812, "epoch": 1.9826771653543307, "kl": 0.0038232803344726562, "reward": 0.7250279379077256, "reward_std": 0.17406430409755558, "rewards/accuracy_reward": 0.7250279379077256, "rewards/format_reward": 0.0, "step": 158, "total_flos": 0.0, "train_loss": 0.00014615306474896194, "train_runtime": 28501.5732, "train_samples_per_second": 0.624, "train_steps_per_second": 0.006 } ], "logging_steps": 10, "max_steps": 158, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }