{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 593.9107284545898, "epoch": 0.017057569296375266, "grad_norm": 0.4269763231277466, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0167, "reward": 0.6294643059372902, "reward_std": 0.35917505994439125, "rewards/accuracy_reward": 0.6283482387661934, "rewards/format_reward": 0.0011160714784637094, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 605.0845737457275, "epoch": 0.08528784648187633, "grad_norm": 1.2334303855895996, "kl": 0.0003300309181213379, "learning_rate": 2.5e-06, "loss": 0.0188, "reward": 0.6121652042493224, "reward_std": 0.3490289170295, "rewards/accuracy_reward": 0.6116071688011289, "rewards/format_reward": 0.0005580357392318547, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 612.8330627441406, "epoch": 0.17057569296375266, "grad_norm": 4.521842956542969, "kl": 0.0027939796447753905, "learning_rate": 2.956412726139078e-06, "loss": 0.0577, "reward": 0.6854911021888256, "reward_std": 0.2840955166146159, "rewards/accuracy_reward": 0.6854911021888256, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 604.9821670532226, "epoch": 0.255863539445629, "grad_norm": 1.5347368717193604, "kl": 0.015767669677734374, "learning_rate": 2.7836719084521715e-06, "loss": 0.07, "reward": 0.7609375312924385, "reward_std": 0.21701927129179238, "rewards/accuracy_reward": 0.7609375312924385, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 591.6801628112793, "epoch": 0.3411513859275053, "grad_norm": 0.6672903895378113, "kl": 0.005676651000976562, "learning_rate": 2.4946839873611927e-06, "loss": 0.0579, "reward": 0.7703125327825546, "reward_std": 0.20545081831514836, "rewards/accuracy_reward": 0.7703125327825546, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 599.0326194763184, "epoch": 0.42643923240938164, "grad_norm": 2323443.0, "kl": 85.62775764465331, "learning_rate": 2.1156192081791355e-06, "loss": 3.4627, "reward": 0.761160746216774, "reward_std": 0.1914736282080412, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 599.8656478881836, "epoch": 0.511727078891258, "grad_norm": 31896284.0, "kl": 4184.925457000732, "learning_rate": 1.6808050203829845e-06, "loss": 166.556, "reward": 0.7537946760654449, "reward_std": 0.1866186775267124, "rewards/accuracy_reward": 0.7537946760654449, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 591.7852882385254, "epoch": 0.5970149253731343, "grad_norm": 0.18381251394748688, "kl": 175321907.9516266, "learning_rate": 1.2296174432791415e-06, "loss": 6963359.2, "reward": 0.7421875357627868, "reward_std": 0.18768558986485004, "rewards/accuracy_reward": 0.7421875357627868, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 581.8495811462402, "epoch": 0.6823027718550106, "grad_norm": 187.1607208251953, "kl": 179490.89021034242, "learning_rate": 8.029152419343472e-07, "loss": 7215.3883, "reward": 0.7627232506871223, "reward_std": 0.18175358334556221, "rewards/accuracy_reward": 0.7627232506871223, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 598.5533767700196, "epoch": 0.767590618336887, "grad_norm": 170799.046875, "kl": 1.5941051483154296, "learning_rate": 4.3933982822017883e-07, "loss": 0.0995, "reward": 0.7484375357627868, "reward_std": 0.18508986476808786, "rewards/accuracy_reward": 0.7484375357627868, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 602.7493576049804, "epoch": 0.8528784648187633, "grad_norm": 23.201330184936523, "kl": 0.7593372344970704, "learning_rate": 1.718159615201853e-07, "loss": 0.071, "reward": 0.7455357477068901, "reward_std": 0.1892501600086689, "rewards/accuracy_reward": 0.7455357477068901, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 590.3078407287597, "epoch": 0.9381663113006397, "grad_norm": 1513.896484375, "kl": 0.4785778045654297, "learning_rate": 2.4570139579284723e-08, "loss": 0.059, "reward": 0.7736607506871224, "reward_std": 0.185056865029037, "rewards/accuracy_reward": 0.7736607506871224, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 586.8786061604818, "epoch": 0.9893390191897654, "kl": 2.2987874348958335, "reward": 0.77194944024086, "reward_std": 0.1801375082383553, "rewards/accuracy_reward": 0.77194944024086, "rewards/format_reward": 0.0, "step": 58, "total_flos": 0.0, "train_loss": 600926.2988391328, "train_runtime": 7660.7166, "train_samples_per_second": 0.979, "train_steps_per_second": 0.008 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }