Maker-0409 committed on
Commit
b1930f0
·
verified ·
1 Parent(s): 75605cd

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Math-7B
3
+ library_name: transformers
4
+ model_name: Qwen-2.5-7B-Simple-RL
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen-2.5-7B-Simple-RL
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Maker-0409/Qwen-2.5-7B-Simple-RL", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/taochenai/huggingface/runs/hihrk4wo)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.0.dev0
38
+ - Transformers: 4.49.0.dev0
39
+ - Pytorch: 2.5.1+cu121
40
+ - Datasets: 3.2.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 4841.422249500714,
4
+ "train_runtime": 180396.3107,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.042,
7
+ "train_steps_per_second": 0.003
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0.dev0"
6
+ }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:988336cece8f0bddbadb4ddc1af32578d73546bbcbee61322797acc652909568
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29d9eecbf30eef037859879dbc6af4acc10a3ff8fb79cb4732cbca41c35fbe3
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3bf9fc7e0cdb7f18431de4c1c4bd7242a4aa1489da083210e3b5604aecc5bfb
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab9b09035bebd0c729f8bbd320c3152f62583df612cf7faaac3eac466ae6557c
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:697b20a7db1670a093afc92cf8d3a3f8000f17dd201d3d115115282b86963d4f
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2c78dec2dd3327d10869c4da125411e4e739412f00b94cdc15580e698d9ac77
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a489988de3cb1566b1845d3ab7bfcdd065744b45f0a4e459444bfa2ca361fc49
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d39c18161aa93e39851678ad03057059b91c440ffec216018c112758dcec9a8
3
  size 1089994880
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 4841.422249500714,
4
+ "train_runtime": 180396.3107,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.042,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9984,
5
+ "eval_steps": 100,
6
+ "global_step": 468,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 633.2446681976319,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 2.2443933486938477,
15
+ "kl": 0.00011417865753173828,
16
+ "learning_rate": 3.1914893617021275e-07,
17
+ "loss": 0.0,
18
+ "reward": 1.138736367225647,
19
+ "reward_std": 0.8278621450066567,
20
+ "rewards/accuracy_reward": 0.5946428831666708,
21
+ "rewards/cosine_scaled_reward": 0.2899268216686323,
22
+ "rewards/format_reward": 0.0,
23
+ "rewards/reasoning_steps_reward": 0.25416668243706225,
24
+ "step": 5
25
+ },
26
+ {
27
+ "completion_length": 600.8857383728027,
28
+ "epoch": 0.021333333333333333,
29
+ "grad_norm": 5.001251220703125,
30
+ "kl": 0.00020779371261596679,
31
+ "learning_rate": 6.382978723404255e-07,
32
+ "loss": 0.0,
33
+ "reward": 1.2528822764754295,
34
+ "reward_std": 0.8592379853129387,
35
+ "rewards/accuracy_reward": 0.6553571775555611,
36
+ "rewards/cosine_scaled_reward": 0.34097747248015364,
37
+ "rewards/format_reward": 0.0,
38
+ "rewards/reasoning_steps_reward": 0.25654763616621495,
39
+ "step": 10
40
+ },
41
+ {
42
+ "completion_length": 601.8518112182617,
43
+ "epoch": 0.032,
44
+ "grad_norm": 3.453845500946045,
45
+ "kl": 0.00034580230712890627,
46
+ "learning_rate": 9.574468085106384e-07,
47
+ "loss": 0.0,
48
+ "reward": 1.2825960636138916,
49
+ "reward_std": 0.7762525148689747,
50
+ "rewards/accuracy_reward": 0.6642857484519482,
51
+ "rewards/cosine_scaled_reward": 0.3486674582702108,
52
+ "rewards/format_reward": 0.0,
53
+ "rewards/reasoning_steps_reward": 0.26964287189766767,
54
+ "step": 15
55
+ },
56
+ {
57
+ "completion_length": 620.7839553833007,
58
+ "epoch": 0.042666666666666665,
59
+ "grad_norm": 63.01131057739258,
60
+ "kl": 0.001246500015258789,
61
+ "learning_rate": 1.276595744680851e-06,
62
+ "loss": 0.0001,
63
+ "reward": 1.2914750523865224,
64
+ "reward_std": 0.7945833645761013,
65
+ "rewards/accuracy_reward": 0.6571428865194321,
66
+ "rewards/cosine_scaled_reward": 0.3593321413063677,
67
+ "rewards/format_reward": 0.0,
68
+ "rewards/reasoning_steps_reward": 0.2750000203028321,
69
+ "step": 20
70
+ },
71
+ {
72
+ "completion_length": 639.3946762084961,
73
+ "epoch": 0.05333333333333334,
74
+ "grad_norm": 1.1951252222061157,
75
+ "kl": 0.001938199996948242,
76
+ "learning_rate": 1.5957446808510639e-06,
77
+ "loss": 0.0001,
78
+ "reward": 1.2197763450443744,
79
+ "reward_std": 0.7964548453688621,
80
+ "rewards/accuracy_reward": 0.6285714630037547,
81
+ "rewards/cosine_scaled_reward": 0.323942980915308,
82
+ "rewards/format_reward": 0.0,
83
+ "rewards/reasoning_steps_reward": 0.2672619212418795,
84
+ "step": 25
85
+ },
86
+ {
87
+ "completion_length": 645.9482414245606,
88
+ "epoch": 0.064,
89
+ "grad_norm": 0.5322187542915344,
90
+ "kl": 0.0028698921203613283,
91
+ "learning_rate": 1.9148936170212767e-06,
92
+ "loss": 0.0001,
93
+ "reward": 1.34233574308455,
94
+ "reward_std": 0.7051636058837175,
95
+ "rewards/accuracy_reward": 0.6821428902447224,
96
+ "rewards/cosine_scaled_reward": 0.38400235488079487,
97
+ "rewards/format_reward": 0.0,
98
+ "rewards/reasoning_steps_reward": 0.2761904950253665,
99
+ "step": 30
100
+ },
101
+ {
102
+ "completion_length": 630.1071678161621,
103
+ "epoch": 0.07466666666666667,
104
+ "grad_norm": 0.686019241809845,
105
+ "kl": 0.00424489974975586,
106
+ "learning_rate": 2.2340425531914894e-06,
107
+ "loss": 0.0002,
108
+ "reward": 1.2706220560474322,
109
+ "reward_std": 0.7081292014569044,
110
+ "rewards/accuracy_reward": 0.6839286010712385,
111
+ "rewards/cosine_scaled_reward": 0.34145535016432405,
112
+ "rewards/format_reward": 0.0,
113
+ "rewards/reasoning_steps_reward": 0.2452381114475429,
114
+ "step": 35
115
+ },
116
+ {
117
+ "completion_length": 663.8464553833007,
118
+ "epoch": 0.08533333333333333,
119
+ "grad_norm": 10619385856.0,
120
+ "kl": 11324620.806011772,
121
+ "learning_rate": 2.553191489361702e-06,
122
+ "loss": 453134.65,
123
+ "reward": 1.4818414891138674,
124
+ "reward_std": 0.724718413501978,
125
+ "rewards/accuracy_reward": 0.7196428954601288,
126
+ "rewards/cosine_scaled_reward": 0.43124618427827954,
127
+ "rewards/format_reward": 0.0,
128
+ "rewards/reasoning_steps_reward": 0.3309524044394493,
129
+ "step": 40
130
+ },
131
+ {
132
+ "completion_length": 636.5178840637207,
133
+ "epoch": 0.096,
134
+ "grad_norm": 0.4083445370197296,
135
+ "kl": 0.1388763427734375,
136
+ "learning_rate": 2.872340425531915e-06,
137
+ "loss": 0.0055,
138
+ "reward": 1.5206772923469543,
139
+ "reward_std": 0.6890950493514538,
140
+ "rewards/accuracy_reward": 0.7428571715950966,
141
+ "rewards/cosine_scaled_reward": 0.4444867596961558,
142
+ "rewards/format_reward": 0.0,
143
+ "rewards/reasoning_steps_reward": 0.3333333550952375,
144
+ "step": 45
145
+ },
146
+ {
147
+ "completion_length": 624.0178833007812,
148
+ "epoch": 0.10666666666666667,
149
+ "grad_norm": 0.6491600275039673,
150
+ "kl": 0.014713478088378907,
151
+ "learning_rate": 2.9996241442585123e-06,
152
+ "loss": 0.0006,
153
+ "reward": 1.5073627218604089,
154
+ "reward_std": 0.7132997542619706,
155
+ "rewards/accuracy_reward": 0.712500025331974,
156
+ "rewards/cosine_scaled_reward": 0.41093407664448023,
157
+ "rewards/format_reward": 0.0,
158
+ "rewards/reasoning_steps_reward": 0.38392860516905786,
159
+ "step": 50
160
+ },
161
+ {
162
+ "completion_length": 631.5339569091797,
163
+ "epoch": 0.11733333333333333,
164
+ "grad_norm": 0.7147920727729797,
165
+ "kl": 0.007195663452148437,
166
+ "learning_rate": 2.9973279301399446e-06,
167
+ "loss": 0.0003,
168
+ "reward": 1.5377919152379036,
169
+ "reward_std": 0.76092077344656,
170
+ "rewards/accuracy_reward": 0.7232143200933934,
171
+ "rewards/cosine_scaled_reward": 0.4282680474221706,
172
+ "rewards/format_reward": 0.0,
173
+ "rewards/reasoning_steps_reward": 0.386309552192688,
174
+ "step": 55
175
+ },
176
+ {
177
+ "completion_length": 627.9214561462402,
178
+ "epoch": 0.128,
179
+ "grad_norm": 0.8942143321037292,
180
+ "kl": 0.008642578125,
181
+ "learning_rate": 2.992947502998804e-06,
182
+ "loss": 0.0003,
183
+ "reward": 1.6543699458241463,
184
+ "reward_std": 0.7264986954629421,
185
+ "rewards/accuracy_reward": 0.7214285999536514,
186
+ "rewards/cosine_scaled_reward": 0.40972703909501434,
187
+ "rewards/format_reward": 0.0,
188
+ "rewards/reasoning_steps_reward": 0.5232143249362707,
189
+ "step": 60
190
+ },
191
+ {
192
+ "completion_length": 633.0232421875,
193
+ "epoch": 0.13866666666666666,
194
+ "grad_norm": 6.921348571777344,
195
+ "kl": 0.01439208984375,
196
+ "learning_rate": 2.9864889601923268e-06,
197
+ "loss": 0.0006,
198
+ "reward": 1.7206872910261155,
199
+ "reward_std": 0.7344334974884987,
200
+ "rewards/accuracy_reward": 0.725000036507845,
201
+ "rewards/cosine_scaled_reward": 0.43497296012938025,
202
+ "rewards/format_reward": 0.0,
203
+ "rewards/reasoning_steps_reward": 0.5607143200933933,
204
+ "step": 65
205
+ },
206
+ {
207
+ "completion_length": 656.7178894042969,
208
+ "epoch": 0.14933333333333335,
209
+ "grad_norm": 0.6442045569419861,
210
+ "kl": 0.01673736572265625,
211
+ "learning_rate": 2.977961291721137e-06,
212
+ "loss": 0.0007,
213
+ "reward": 1.8801582887768746,
214
+ "reward_std": 0.7263622097671032,
215
+ "rewards/accuracy_reward": 0.7571428894996644,
216
+ "rewards/cosine_scaled_reward": 0.47301534870639445,
217
+ "rewards/format_reward": 0.0,
218
+ "rewards/reasoning_steps_reward": 0.6500000521540642,
219
+ "step": 70
220
+ },
221
+ {
222
+ "completion_length": 619.4536033630371,
223
+ "epoch": 0.16,
224
+ "grad_norm": 1.7239394187927246,
225
+ "kl": 0.026496124267578126,
226
+ "learning_rate": 2.9673763677155655e-06,
227
+ "loss": 0.0011,
228
+ "reward": 1.8051109313964844,
229
+ "reward_std": 0.7346500240266323,
230
+ "rewards/accuracy_reward": 0.7160714596509934,
231
+ "rewards/cosine_scaled_reward": 0.39439656864851713,
232
+ "rewards/format_reward": 0.0,
233
+ "rewards/reasoning_steps_reward": 0.6946429140865803,
234
+ "step": 75
235
+ },
236
+ {
237
+ "completion_length": 623.1785926818848,
238
+ "epoch": 0.17066666666666666,
239
+ "grad_norm": 0.6716666221618652,
240
+ "kl": 0.018997955322265624,
241
+ "learning_rate": 2.9547489219129666e-06,
242
+ "loss": 0.0008,
243
+ "reward": 1.9212585434317588,
244
+ "reward_std": 0.634969700500369,
245
+ "rewards/accuracy_reward": 0.7785714574158191,
246
+ "rewards/cosine_scaled_reward": 0.4653060721466318,
247
+ "rewards/format_reward": 0.0,
248
+ "rewards/reasoning_steps_reward": 0.6773809991776943,
249
+ "step": 80
250
+ },
251
+ {
252
+ "completion_length": 690.1518196105957,
253
+ "epoch": 0.18133333333333335,
254
+ "grad_norm": 1.1456305980682373,
255
+ "kl": 0.02204437255859375,
256
+ "learning_rate": 2.9400965311490175e-06,
257
+ "loss": 0.0009,
258
+ "reward": 1.9084690719842912,
259
+ "reward_std": 0.7263222638517618,
260
+ "rewards/accuracy_reward": 0.7303571783006191,
261
+ "rewards/cosine_scaled_reward": 0.4507309086387977,
262
+ "rewards/format_reward": 0.0,
263
+ "rewards/reasoning_steps_reward": 0.7273810178041458,
264
+ "step": 85
265
+ },
266
+ {
267
+ "completion_length": 650.4768188476562,
268
+ "epoch": 0.192,
269
+ "grad_norm": 29.814361572265625,
270
+ "kl": 0.078216552734375,
271
+ "learning_rate": 2.9234395908915565e-06,
272
+ "loss": 0.0031,
273
+ "reward": 1.8972563683986663,
274
+ "reward_std": 0.7165740359574556,
275
+ "rewards/accuracy_reward": 0.6875000324100256,
276
+ "rewards/cosine_scaled_reward": 0.4055896209087223,
277
+ "rewards/format_reward": 0.0,
278
+ "rewards/reasoning_steps_reward": 0.8041667267680168,
279
+ "step": 90
280
+ },
281
+ {
282
+ "completion_length": 668.3339584350585,
283
+ "epoch": 0.20266666666666666,
284
+ "grad_norm": 0.48750847578048706,
285
+ "kl": 0.02767181396484375,
286
+ "learning_rate": 2.904801286851009e-06,
287
+ "loss": 0.0011,
288
+ "reward": 1.9524270623922348,
289
+ "reward_std": 0.6363851364701987,
290
+ "rewards/accuracy_reward": 0.7035714564844966,
291
+ "rewards/cosine_scaled_reward": 0.42206980669870975,
292
+ "rewards/format_reward": 0.0,
293
+ "rewards/reasoning_steps_reward": 0.826785783469677,
294
+ "step": 95
295
+ },
296
+ {
297
+ "completion_length": 645.9428840637207,
298
+ "epoch": 0.21333333333333335,
299
+ "grad_norm": 0.8315287232398987,
300
+ "kl": 0.02986602783203125,
301
+ "learning_rate": 2.884207562706925e-06,
302
+ "loss": 0.0012,
303
+ "reward": 2.0384097367525102,
304
+ "reward_std": 0.6786769151687622,
305
+ "rewards/accuracy_reward": 0.7517857387661934,
306
+ "rewards/cosine_scaled_reward": 0.4657905898289755,
307
+ "rewards/format_reward": 0.0,
308
+ "rewards/reasoning_steps_reward": 0.820833396166563,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 0.21333333333333335,
313
+ "eval_completion_length": 688.0076597412109,
314
+ "eval_kl": 0.0332870361328125,
315
+ "eval_loss": 0.0013802805915474892,
316
+ "eval_reward": 1.86520801551342,
317
+ "eval_reward_std": 0.7114028903335333,
318
+ "eval_rewards/accuracy_reward": 0.650542886838317,
319
+ "eval_rewards/cosine_scaled_reward": 0.3737031816519331,
320
+ "eval_rewards/format_reward": 0.0,
321
+ "eval_rewards/reasoning_steps_reward": 0.8409619681358338,
322
+ "eval_runtime": 32350.4437,
323
+ "eval_samples_per_second": 0.155,
324
+ "eval_steps_per_second": 0.011,
325
+ "step": 100
326
+ },
327
+ {
328
+ "completion_length": 717.150033569336,
329
+ "epoch": 0.224,
330
+ "grad_norm": 1.5486549139022827,
331
+ "kl": 0.03196563720703125,
332
+ "learning_rate": 2.8616870839955444e-06,
333
+ "loss": 0.0013,
334
+ "reward": 2.0346583992242815,
335
+ "reward_std": 0.7014419212937355,
336
+ "rewards/accuracy_reward": 0.7232143215835094,
337
+ "rewards/cosine_scaled_reward": 0.457277343980968,
338
+ "rewards/format_reward": 0.0,
339
+ "rewards/reasoning_steps_reward": 0.8541667237877846,
340
+ "step": 105
341
+ },
342
+ {
343
+ "completion_length": 708.8571708679199,
344
+ "epoch": 0.23466666666666666,
345
+ "grad_norm": 0.5981384515762329,
346
+ "kl": 0.02979583740234375,
347
+ "learning_rate": 2.837271198208662e-06,
348
+ "loss": 0.0012,
349
+ "reward": 2.0179374665021896,
350
+ "reward_std": 0.6652137346565723,
351
+ "rewards/accuracy_reward": 0.7250000320374965,
352
+ "rewards/cosine_scaled_reward": 0.47091358043253423,
353
+ "rewards/format_reward": 0.0,
354
+ "rewards/reasoning_steps_reward": 0.8220238700509072,
355
+ "step": 110
356
+ },
357
+ {
358
+ "completion_length": 632.7732406616211,
359
+ "epoch": 0.24533333333333332,
360
+ "grad_norm": 0.7111315131187439,
361
+ "kl": 0.02539825439453125,
362
+ "learning_rate": 2.8109938911593322e-06,
363
+ "loss": 0.001,
364
+ "reward": 2.0148118153214454,
365
+ "reward_std": 0.6429756574332715,
366
+ "rewards/accuracy_reward": 0.728571455553174,
367
+ "rewards/cosine_scaled_reward": 0.44754982106387614,
368
+ "rewards/format_reward": 0.0,
369
+ "rewards/reasoning_steps_reward": 0.8386905357241631,
370
+ "step": 115
371
+ },
372
+ {
373
+ "completion_length": 655.8321723937988,
374
+ "epoch": 0.256,
375
+ "grad_norm": 0.5316483974456787,
376
+ "kl": 0.02179107666015625,
377
+ "learning_rate": 2.7828917396751474e-06,
378
+ "loss": 0.0009,
379
+ "reward": 1.9900789648294448,
380
+ "reward_std": 0.6477071691304446,
381
+ "rewards/accuracy_reward": 0.7160714656114578,
382
+ "rewards/cosine_scaled_reward": 0.43412648113444446,
383
+ "rewards/format_reward": 0.0,
384
+ "rewards/reasoning_steps_reward": 0.8398810118436814,
385
+ "step": 120
386
+ },
387
+ {
388
+ "completion_length": 644.7321693420411,
389
+ "epoch": 0.26666666666666666,
390
+ "grad_norm": 0.4458823800086975,
391
+ "kl": 0.025299072265625,
392
+ "learning_rate": 2.753003860684943e-06,
393
+ "loss": 0.001,
394
+ "reward": 2.1427780210971834,
395
+ "reward_std": 0.6711063630878925,
396
+ "rewards/accuracy_reward": 0.7750000268220901,
397
+ "rewards/cosine_scaled_reward": 0.5183731818571686,
398
+ "rewards/format_reward": 0.0,
399
+ "rewards/reasoning_steps_reward": 0.8494048312306404,
400
+ "step": 125
401
+ },
402
+ {
403
+ "completion_length": 684.2911033630371,
404
+ "epoch": 0.2773333333333333,
405
+ "grad_norm": 0.7146270871162415,
406
+ "kl": 0.034222412109375,
407
+ "learning_rate": 2.721371856769793e-06,
408
+ "loss": 0.0014,
409
+ "reward": 1.9814838409423827,
410
+ "reward_std": 0.7353869907557964,
411
+ "rewards/accuracy_reward": 0.6625000331550837,
412
+ "rewards/cosine_scaled_reward": 0.3981504186260281,
413
+ "rewards/format_reward": 0.0,
414
+ "rewards/reasoning_steps_reward": 0.9208333924412727,
415
+ "step": 130
416
+ },
417
+ {
418
+ "completion_length": 650.483960723877,
419
+ "epoch": 0.288,
420
+ "grad_norm": 0.8331003189086914,
421
+ "kl": 0.046978759765625,
422
+ "learning_rate": 2.688039758254093e-06,
423
+ "loss": 0.0019,
424
+ "reward": 2.223627084493637,
425
+ "reward_std": 0.6465678755193949,
426
+ "rewards/accuracy_reward": 0.7732143219560385,
427
+ "rewards/cosine_scaled_reward": 0.506960358901415,
428
+ "rewards/format_reward": 0.0,
429
+ "rewards/reasoning_steps_reward": 0.94345243871212,
430
+ "step": 135
431
+ },
432
+ {
433
+ "completion_length": 702.9536026000976,
434
+ "epoch": 0.2986666666666667,
435
+ "grad_norm": 1.9107334613800049,
436
+ "kl": 0.0536590576171875,
437
+ "learning_rate": 2.65305396191733e-06,
438
+ "loss": 0.0021,
439
+ "reward": 2.1239778250455856,
440
+ "reward_std": 0.6765143848955631,
441
+ "rewards/accuracy_reward": 0.7071428891271353,
442
+ "rewards/cosine_scaled_reward": 0.4555253505706787,
443
+ "rewards/format_reward": 0.0,
444
+ "rewards/reasoning_steps_reward": 0.9613095715641975,
445
+ "step": 140
446
+ },
447
+ {
448
+ "completion_length": 733.6089630126953,
449
+ "epoch": 0.30933333333333335,
450
+ "grad_norm": 0.5300867557525635,
451
+ "kl": 0.05316162109375,
452
+ "learning_rate": 2.61646316641186e-06,
453
+ "loss": 0.0021,
454
+ "reward": 2.1554796636104583,
455
+ "reward_std": 0.6578622825443745,
456
+ "rewards/accuracy_reward": 0.7303571704775095,
457
+ "rewards/cosine_scaled_reward": 0.47036054339259864,
458
+ "rewards/format_reward": 0.0,
459
+ "rewards/reasoning_steps_reward": 0.9547619551420212,
460
+ "step": 145
461
+ },
462
+ {
463
+ "completion_length": 713.221459197998,
464
+ "epoch": 0.32,
465
+ "grad_norm": 0.6026062369346619,
466
+ "kl": 0.0533843994140625,
467
+ "learning_rate": 2.5783183044765715e-06,
468
+ "loss": 0.0021,
469
+ "reward": 2.1126459658145906,
470
+ "reward_std": 0.5920085646212101,
471
+ "rewards/accuracy_reward": 0.7089285995811224,
472
+ "rewards/cosine_scaled_reward": 0.4566935421898961,
473
+ "rewards/format_reward": 0.0,
474
+ "rewards/reasoning_steps_reward": 0.9470238655805587,
475
+ "step": 150
476
+ },
477
+ {
478
+ "completion_length": 678.6428886413574,
479
+ "epoch": 0.33066666666666666,
480
+ "grad_norm": 0.6598377227783203,
481
+ "kl": 0.049908447265625,
482
+ "learning_rate": 2.5386724720408135e-06,
483
+ "loss": 0.002,
484
+ "reward": 2.243595580756664,
485
+ "reward_std": 0.6088640403002501,
486
+ "rewards/accuracy_reward": 0.7767857441678643,
487
+ "rewards/cosine_scaled_reward": 0.5435954930260778,
488
+ "rewards/format_reward": 0.0,
489
+ "rewards/reasoning_steps_reward": 0.9232143476605416,
490
+ "step": 155
491
+ },
492
+ {
493
+ "completion_length": 683.9268142700196,
494
+ "epoch": 0.3413333333333333,
495
+ "grad_norm": 0.6654959321022034,
496
+ "kl": 0.0447540283203125,
497
+ "learning_rate": 2.49758085431725e-06,
498
+ "loss": 0.0018,
499
+ "reward": 2.0952899247407912,
500
+ "reward_std": 0.6968366518616677,
501
+ "rewards/accuracy_reward": 0.7232143208384514,
502
+ "rewards/cosine_scaled_reward": 0.4637422326952219,
503
+ "rewards/format_reward": 0.0,
504
+ "rewards/reasoning_steps_reward": 0.9083333939313889,
505
+ "step": 160
506
+ },
507
+ {
508
+ "completion_length": 691.3464614868165,
509
+ "epoch": 0.352,
510
+ "grad_norm": 0.689552903175354,
511
+ "kl": 0.0448211669921875,
512
+ "learning_rate": 2.455100648986533e-06,
513
+ "loss": 0.0018,
514
+ "reward": 2.0519487097859384,
515
+ "reward_std": 0.7221721112728119,
516
+ "rewards/accuracy_reward": 0.6964286031201482,
517
+ "rewards/cosine_scaled_reward": 0.4602819522842765,
518
+ "rewards/format_reward": 0.0,
519
+ "rewards/reasoning_steps_reward": 0.8952381581068038,
520
+ "step": 165
521
+ },
522
+ {
523
+ "completion_length": 696.5268180847168,
524
+ "epoch": 0.3626666666666667,
525
+ "grad_norm": 1.0024878978729248,
526
+ "kl": 0.065167236328125,
527
+ "learning_rate": 2.4112909865807053e-06,
528
+ "loss": 0.0026,
529
+ "reward": 1.7887505039572715,
530
+ "reward_std": 0.7482936225831509,
531
+ "rewards/accuracy_reward": 0.571428600884974,
532
+ "rewards/cosine_scaled_reward": 0.3333932981360704,
533
+ "rewards/format_reward": 0.0,
534
+ "rewards/reasoning_steps_reward": 0.8839286401867866,
535
+ "step": 170
536
+ },
537
+ {
538
+ "completion_length": 703.2714614868164,
539
+ "epoch": 0.37333333333333335,
540
+ "grad_norm": 0.5711168050765991,
541
+ "kl": 0.093731689453125,
542
+ "learning_rate": 2.366212848176164e-06,
543
+ "loss": 0.0037,
544
+ "reward": 1.9069189459085465,
545
+ "reward_std": 0.8069212771952152,
546
+ "rewards/accuracy_reward": 0.6500000327825546,
547
+ "rewards/cosine_scaled_reward": 0.42358550764620306,
548
+ "rewards/format_reward": 0.0,
549
+ "rewards/reasoning_steps_reward": 0.8333333879709244,
550
+ "step": 175
551
+ },
552
+ {
553
+ "completion_length": 714.2536003112793,
554
+ "epoch": 0.384,
555
+ "grad_norm": 3.1069464683532715,
556
+ "kl": 0.1747802734375,
557
+ "learning_rate": 2.319928980510752e-06,
558
+ "loss": 0.007,
559
+ "reward": 1.6917703241109847,
560
+ "reward_std": 0.8836216881871224,
561
+ "rewards/accuracy_reward": 0.6089285977184773,
562
+ "rewards/cosine_scaled_reward": 0.35307975246978457,
563
+ "rewards/format_reward": 0.0,
564
+ "rewards/reasoning_steps_reward": 0.7297619581222534,
565
+ "step": 180
566
+ },
567
+ {
568
+ "completion_length": 727.7018188476562,
569
+ "epoch": 0.39466666666666667,
570
+ "grad_norm": 1.1932159662246704,
571
+ "kl": 0.193988037109375,
572
+ "learning_rate": 2.272503808643123e-06,
573
+ "loss": 0.0078,
574
+ "reward": 1.7027929693460464,
575
+ "reward_std": 0.7921728197485208,
576
+ "rewards/accuracy_reward": 0.6267857421189547,
577
+ "rewards/cosine_scaled_reward": 0.3605310095474124,
578
+ "rewards/format_reward": 0.0,
579
+ "rewards/reasoning_steps_reward": 0.7154762461781502,
580
+ "step": 185
581
+ },
582
+ {
583
+ "completion_length": 677.6518127441407,
584
+ "epoch": 0.4053333333333333,
585
+ "grad_norm": 0.6525413393974304,
586
+ "kl": 0.1227813720703125,
587
+ "learning_rate": 2.2240033462759628e-06,
588
+ "loss": 0.0049,
589
+ "reward": 2.055608908832073,
590
+ "reward_std": 0.6409808352589608,
591
+ "rewards/accuracy_reward": 0.7428571667522192,
592
+ "rewards/cosine_scaled_reward": 0.4907278836122714,
593
+ "rewards/format_reward": 0.0,
594
+ "rewards/reasoning_steps_reward": 0.8220238700509072,
595
+ "step": 190
596
+ },
597
+ {
598
+ "completion_length": 729.3125358581543,
599
+ "epoch": 0.416,
600
+ "grad_norm": 0.470821738243103,
601
+ "kl": 0.1053009033203125,
602
+ "learning_rate": 2.1744951038678905e-06,
603
+ "loss": 0.0042,
604
+ "reward": 2.1352262631058694,
605
+ "reward_std": 0.6541992913931608,
606
+ "rewards/accuracy_reward": 0.7446428880095481,
607
+ "rewards/cosine_scaled_reward": 0.5340357202105224,
608
+ "rewards/format_reward": 0.0,
609
+ "rewards/reasoning_steps_reward": 0.8565476804971695,
610
+ "step": 195
611
+ },
612
+ {
613
+ "completion_length": 736.6607482910156,
614
+ "epoch": 0.4266666666666667,
615
+ "grad_norm": 0.3663829267024994,
616
+ "kl": 0.145220947265625,
617
+ "learning_rate": 2.124047994661941e-06,
618
+ "loss": 0.0058,
619
+ "reward": 2.0683016672730448,
620
+ "reward_std": 0.6785697277635336,
621
+ "rewards/accuracy_reward": 0.7107143150642514,
622
+ "rewards/cosine_scaled_reward": 0.4861587251536548,
623
+ "rewards/format_reward": 0.0,
624
+ "rewards/reasoning_steps_reward": 0.8714286342263222,
625
+ "step": 200
626
+ },
627
+ {
628
+ "epoch": 0.4266666666666667,
629
+ "eval_completion_length": 743.3604330322265,
630
+ "eval_kl": 0.1699279296875,
631
+ "eval_loss": 0.006734147202223539,
632
+ "eval_reward": 1.8947704853653908,
633
+ "eval_reward_std": 0.7092250557422638,
634
+ "eval_rewards/accuracy_reward": 0.6307143133163452,
635
+ "eval_rewards/cosine_scaled_reward": 0.39257041423644407,
636
+ "eval_rewards/format_reward": 0.0,
637
+ "eval_rewards/reasoning_steps_reward": 0.871485775399208,
638
+ "eval_runtime": 32670.592,
639
+ "eval_samples_per_second": 0.153,
640
+ "eval_steps_per_second": 0.011,
641
+ "step": 200
642
+ },
643
+ {
644
+ "completion_length": 752.7053955078125,
645
+ "epoch": 0.43733333333333335,
646
+ "grad_norm": 0.5299625396728516,
647
+ "kl": 0.1930633544921875,
648
+ "learning_rate": 2.072732238761434e-06,
649
+ "loss": 0.0077,
650
+ "reward": 1.8860187515616418,
651
+ "reward_std": 0.7606242794543505,
652
+ "rewards/accuracy_reward": 0.6446428863331676,
653
+ "rewards/cosine_scaled_reward": 0.40447108587541153,
654
+ "rewards/format_reward": 0.0,
655
+ "rewards/reasoning_steps_reward": 0.8369048193097115,
656
+ "step": 205
657
+ },
658
+ {
659
+ "completion_length": 733.603606414795,
660
+ "epoch": 0.448,
661
+ "grad_norm": 1.6152819395065308,
662
+ "kl": 0.219268798828125,
663
+ "learning_rate": 2.0206192653867536e-06,
664
+ "loss": 0.0088,
665
+ "reward": 1.997245892137289,
666
+ "reward_std": 0.7402419943362475,
667
+ "rewards/accuracy_reward": 0.7017857382073999,
668
+ "rewards/cosine_scaled_reward": 0.47284105569124224,
669
+ "rewards/format_reward": 0.0,
670
+ "rewards/reasoning_steps_reward": 0.8226191058754921,
671
+ "step": 210
672
+ },
673
+ {
674
+ "completion_length": 844.0661102294922,
675
+ "epoch": 0.45866666666666667,
676
+ "grad_norm": 7.516280651092529,
677
+ "kl": 0.27982177734375,
678
+ "learning_rate": 1.967781613449095e-06,
679
+ "loss": 0.0112,
680
+ "reward": 1.5464881896972655,
681
+ "reward_std": 0.8091491930186748,
682
+ "rewards/accuracy_reward": 0.49107144959270954,
683
+ "rewards/cosine_scaled_reward": 0.21672622584737838,
684
+ "rewards/format_reward": 0.0,
685
+ "rewards/reasoning_steps_reward": 0.8386905357241631,
686
+ "step": 215
687
+ },
688
+ {
689
+ "completion_length": 814.1696807861329,
690
+ "epoch": 0.4693333333333333,
691
+ "grad_norm": 0.4684678018093109,
692
+ "kl": 0.194140625,
693
+ "learning_rate": 1.9142928305795637e-06,
694
+ "loss": 0.0078,
695
+ "reward": 1.8477135568857193,
696
+ "reward_std": 0.7414120733737946,
697
+ "rewards/accuracy_reward": 0.6178571652621031,
698
+ "rewards/cosine_scaled_reward": 0.3584277655696496,
699
+ "rewards/format_reward": 0.0,
700
+ "rewards/reasoning_steps_reward": 0.8714286401867867,
701
+ "step": 220
702
+ },
703
+ {
704
+ "completion_length": 754.1857452392578,
705
+ "epoch": 0.48,
706
+ "grad_norm": 0.4328997731208801,
707
+ "kl": 0.12838134765625,
708
+ "learning_rate": 1.8602273707541886e-06,
709
+ "loss": 0.0051,
710
+ "reward": 2.1135876968503,
711
+ "reward_std": 0.6965163860470056,
712
+ "rewards/accuracy_reward": 0.742857176810503,
713
+ "rewards/cosine_scaled_reward": 0.5159685641527176,
714
+ "rewards/format_reward": 0.0,
715
+ "rewards/reasoning_steps_reward": 0.8547619715332985,
716
+ "step": 225
717
+ },
718
+ {
719
+ "completion_length": 742.7750381469726,
720
+ "epoch": 0.49066666666666664,
721
+ "grad_norm": 0.4649052619934082,
722
+ "kl": 0.1558837890625,
723
+ "learning_rate": 1.8056604906573418e-06,
724
+ "loss": 0.0062,
725
+ "reward": 2.0384344711899756,
726
+ "reward_std": 0.6620127268135547,
727
+ "rewards/accuracy_reward": 0.7035714626312256,
728
+ "rewards/cosine_scaled_reward": 0.483077246020548,
729
+ "rewards/format_reward": 0.0,
730
+ "rewards/reasoning_steps_reward": 0.8517857760190963,
731
+ "step": 230
732
+ },
733
+ {
734
+ "completion_length": 739.6268203735351,
735
+ "epoch": 0.5013333333333333,
736
+ "grad_norm": 1.5264660120010376,
737
+ "kl": 0.145806884765625,
738
+ "learning_rate": 1.7506681449278226e-06,
739
+ "loss": 0.0058,
740
+ "reward": 1.999456986784935,
741
+ "reward_std": 0.7032103724777699,
742
+ "rewards/accuracy_reward": 0.6785714574158191,
743
+ "rewards/cosine_scaled_reward": 0.45302835907787087,
744
+ "rewards/format_reward": 0.0,
745
+ "rewards/reasoning_steps_reward": 0.8678572103381157,
746
+ "step": 235
747
+ },
748
+ {
749
+ "completion_length": 725.905387878418,
750
+ "epoch": 0.512,
751
+ "grad_norm": 13.703657150268555,
752
+ "kl": 0.354132080078125,
753
+ "learning_rate": 1.6953268804334257e-06,
754
+ "loss": 0.0142,
755
+ "reward": 2.012031316757202,
756
+ "reward_std": 0.6349152896553278,
757
+ "rewards/accuracy_reward": 0.6660714553669095,
758
+ "rewards/cosine_scaled_reward": 0.46024551438167693,
759
+ "rewards/format_reward": 0.0,
760
+ "rewards/reasoning_steps_reward": 0.8857143551111222,
761
+ "step": 240
762
+ },
763
+ {
764
+ "completion_length": 711.9410980224609,
765
+ "epoch": 0.5226666666666666,
766
+ "grad_norm": 42.922752380371094,
767
+ "kl": 0.81356201171875,
768
+ "learning_rate": 1.6397137297211436e-06,
769
+ "loss": 0.0325,
770
+ "reward": 2.129089578986168,
771
+ "reward_std": 0.699107101932168,
772
+ "rewards/accuracy_reward": 0.7160714577883482,
773
+ "rewards/cosine_scaled_reward": 0.5064704709046055,
774
+ "rewards/format_reward": 0.0,
775
+ "rewards/reasoning_steps_reward": 0.9065476730465889,
776
+ "step": 245
777
+ },
778
+ {
779
+ "completion_length": 738.9821746826171,
780
+ "epoch": 0.5333333333333333,
781
+ "grad_norm": 212.6622314453125,
782
+ "kl": 1.157550048828125,
783
+ "learning_rate": 1.5839061037913395e-06,
784
+ "loss": 0.0463,
785
+ "reward": 2.1009622782468798,
786
+ "reward_std": 0.7158728931099176,
787
+ "rewards/accuracy_reward": 0.7000000283122063,
788
+ "rewards/cosine_scaled_reward": 0.5027479250915349,
789
+ "rewards/format_reward": 0.0,
790
+ "rewards/reasoning_steps_reward": 0.8982143506407738,
791
+ "step": 250
792
+ },
793
+ {
794
+ "completion_length": 760.2428916931152,
795
+ "epoch": 0.544,
796
+ "grad_norm": 10.118670463562012,
797
+ "kl": 0.637158203125,
798
+ "learning_rate": 1.527981684345115e-06,
799
+ "loss": 0.0255,
800
+ "reward": 1.9621681660413741,
801
+ "reward_std": 0.67494813259691,
802
+ "rewards/accuracy_reward": 0.639285740070045,
803
+ "rewards/cosine_scaled_reward": 0.4276442806003615,
804
+ "rewards/format_reward": 0.0,
805
+ "rewards/reasoning_steps_reward": 0.8952381491661072,
806
+ "step": 255
807
+ },
808
+ {
809
+ "completion_length": 754.6803894042969,
810
+ "epoch": 0.5546666666666666,
811
+ "grad_norm": 7.878048419952393,
812
+ "kl": 0.972845458984375,
813
+ "learning_rate": 1.4720183156548855e-06,
814
+ "loss": 0.0389,
815
+ "reward": 1.9780788227915764,
816
+ "reward_std": 0.6262619759887457,
817
+ "rewards/accuracy_reward": 0.6339285982772708,
818
+ "rewards/cosine_scaled_reward": 0.4304597085807472,
819
+ "rewards/format_reward": 0.0,
820
+ "rewards/reasoning_steps_reward": 0.9136905416846275,
821
+ "step": 260
822
+ },
823
+ {
824
+ "completion_length": 751.5857498168946,
825
+ "epoch": 0.5653333333333334,
826
+ "grad_norm": 12.42583179473877,
827
+ "kl": 3.09744873046875,
828
+ "learning_rate": 1.4160938962086612e-06,
829
+ "loss": 0.1241,
830
+ "reward": 2.0433208346366882,
831
+ "reward_std": 0.661328698694706,
832
+ "rewards/accuracy_reward": 0.676785740442574,
833
+ "rewards/cosine_scaled_reward": 0.44689220561413096,
834
+ "rewards/format_reward": 0.0,
835
+ "rewards/reasoning_steps_reward": 0.9196429163217544,
836
+ "step": 265
837
+ },
838
+ {
839
+ "completion_length": 729.028604888916,
840
+ "epoch": 0.576,
841
+ "grad_norm": 7.453009605407715,
842
+ "kl": 2.2955322265625,
843
+ "learning_rate": 1.3602862702788567e-06,
844
+ "loss": 0.0917,
845
+ "reward": 2.094664843380451,
846
+ "reward_std": 0.6356621380895376,
847
+ "rewards/accuracy_reward": 0.7000000346451998,
848
+ "rewards/cosine_scaled_reward": 0.46371242445893585,
849
+ "rewards/format_reward": 0.0,
850
+ "rewards/reasoning_steps_reward": 0.9309524431824684,
851
+ "step": 270
852
+ },
853
+ {
854
+ "completion_length": 730.825032043457,
855
+ "epoch": 0.5866666666666667,
856
+ "grad_norm": 7.0367817878723145,
857
+ "kl": 0.6509521484375,
858
+ "learning_rate": 1.3046731195665748e-06,
859
+ "loss": 0.0261,
860
+ "reward": 2.083331751823425,
861
+ "reward_std": 0.6676435235887765,
862
+ "rewards/accuracy_reward": 0.6821428818628192,
863
+ "rewards/cosine_scaled_reward": 0.45714118536561726,
864
+ "rewards/format_reward": 0.0,
865
+ "rewards/reasoning_steps_reward": 0.944047674536705,
866
+ "step": 275
867
+ },
868
+ {
869
+ "completion_length": 742.180387878418,
870
+ "epoch": 0.5973333333333334,
871
+ "grad_norm": 1.3236949443817139,
872
+ "kl": 4.09298095703125,
873
+ "learning_rate": 1.2493318550721775e-06,
874
+ "loss": 0.1637,
875
+ "reward": 2.075996032357216,
876
+ "reward_std": 0.6379393456503749,
877
+ "rewards/accuracy_reward": 0.6857143174856901,
878
+ "rewards/cosine_scaled_reward": 0.4563530746847391,
879
+ "rewards/format_reward": 0.0,
880
+ "rewards/reasoning_steps_reward": 0.9339286297559738,
881
+ "step": 280
882
+ },
883
+ {
884
+ "completion_length": 708.1018157958985,
885
+ "epoch": 0.608,
886
+ "grad_norm": 5.264936447143555,
887
+ "kl": 0.21192626953125,
888
+ "learning_rate": 1.1943395093426585e-06,
889
+ "loss": 0.0085,
890
+ "reward": 2.1390477627515794,
891
+ "reward_std": 0.600306774303317,
892
+ "rewards/accuracy_reward": 0.7196428820490837,
893
+ "rewards/cosine_scaled_reward": 0.49619057439267633,
894
+ "rewards/format_reward": 0.0,
895
+ "rewards/reasoning_steps_reward": 0.9232143506407737,
896
+ "step": 285
897
+ },
898
+ {
899
+ "completion_length": 715.4125289916992,
900
+ "epoch": 0.6186666666666667,
901
+ "grad_norm": 2.6887574195861816,
902
+ "kl": 2.8669677734375,
903
+ "learning_rate": 1.1397726292458115e-06,
904
+ "loss": 0.1151,
905
+ "reward": 2.1179503470659258,
906
+ "reward_std": 0.5490788316354156,
907
+ "rewards/accuracy_reward": 0.7053571708500386,
908
+ "rewards/cosine_scaled_reward": 0.4905693273060024,
909
+ "rewards/format_reward": 0.0,
910
+ "rewards/reasoning_steps_reward": 0.9220238789916039,
911
+ "step": 290
912
+ },
913
+ {
914
+ "completion_length": 742.6803916931152,
915
+ "epoch": 0.6293333333333333,
916
+ "grad_norm": 6.9418721199035645,
917
+ "kl": 0.39151611328125,
918
+ "learning_rate": 1.085707169420437e-06,
919
+ "loss": 0.0157,
920
+ "reward": 1.8962592497467994,
921
+ "reward_std": 0.6060247957706452,
922
+ "rewards/accuracy_reward": 0.5964285938069225,
923
+ "rewards/cosine_scaled_reward": 0.3754258565604687,
924
+ "rewards/format_reward": 0.0,
925
+ "rewards/reasoning_steps_reward": 0.924404813349247,
926
+ "step": 295
927
+ },
928
+ {
929
+ "completion_length": 716.3464584350586,
930
+ "epoch": 0.64,
931
+ "grad_norm": 4.2906060218811035,
932
+ "kl": 0.57667236328125,
933
+ "learning_rate": 1.0322183865509054e-06,
934
+ "loss": 0.0231,
935
+ "reward": 2.1815308302640917,
936
+ "reward_std": 0.6235232371836901,
937
+ "rewards/accuracy_reward": 0.7428571732714773,
938
+ "rewards/cosine_scaled_reward": 0.5255783690838143,
939
+ "rewards/format_reward": 0.0,
940
+ "rewards/reasoning_steps_reward": 0.913095298409462,
941
+ "step": 300
942
+ },
943
+ {
944
+ "epoch": 0.64,
945
+ "eval_completion_length": 728.9849459716797,
946
+ "eval_kl": 22.31169453125,
947
+ "eval_loss": 0.8926114439964294,
948
+ "eval_reward": 1.9843467233777046,
949
+ "eval_reward_std": 0.6538388645738363,
950
+ "eval_rewards/accuracy_reward": 0.6382285982251167,
951
+ "eval_rewards/cosine_scaled_reward": 0.41530855364510644,
952
+ "eval_rewards/format_reward": 0.0,
953
+ "eval_rewards/reasoning_steps_reward": 0.9308095807313919,
954
+ "eval_runtime": 32207.7986,
955
+ "eval_samples_per_second": 0.155,
956
+ "eval_steps_per_second": 0.011,
957
+ "step": 300
958
+ },
959
+ {
960
+ "completion_length": 723.2625328063965,
961
+ "epoch": 0.6506666666666666,
962
+ "grad_norm": 79.97950744628906,
963
+ "kl": 487.1179443359375,
964
+ "learning_rate": 9.793807346132464e-07,
965
+ "loss": 19.4474,
966
+ "reward": 2.162437987327576,
967
+ "reward_std": 0.6324797321110964,
968
+ "rewards/accuracy_reward": 0.7267857410013676,
969
+ "rewards/cosine_scaled_reward": 0.5112474345514784,
970
+ "rewards/format_reward": 0.0,
971
+ "rewards/reasoning_steps_reward": 0.9244048178195954,
972
+ "step": 305
973
+ },
974
+ {
975
+ "completion_length": 739.6375335693359,
976
+ "epoch": 0.6613333333333333,
977
+ "grad_norm": 9.395992279052734,
978
+ "kl": 0.60579833984375,
979
+ "learning_rate": 9.272677612385667e-07,
980
+ "loss": 0.0242,
981
+ "reward": 2.004467612504959,
982
+ "reward_std": 0.6282935816794634,
983
+ "rewards/accuracy_reward": 0.6607143184170127,
984
+ "rewards/cosine_scaled_reward": 0.42589613443706187,
985
+ "rewards/format_reward": 0.0,
986
+ "rewards/reasoning_steps_reward": 0.9178571999073029,
987
+ "step": 310
988
+ },
989
+ {
990
+ "completion_length": 735.6286071777344,
991
+ "epoch": 0.672,
992
+ "grad_norm": 12.830111503601074,
993
+ "kl": 0.9565673828125,
994
+ "learning_rate": 8.759520053380591e-07,
995
+ "loss": 0.0383,
996
+ "reward": 1.9197196617722512,
997
+ "reward_std": 0.6299623921513557,
998
+ "rewards/accuracy_reward": 0.6035714576020836,
999
+ "rewards/cosine_scaled_reward": 0.39055290608666837,
1000
+ "rewards/format_reward": 0.0,
1001
+ "rewards/reasoning_steps_reward": 0.9255953043699264,
1002
+ "step": 315
1003
+ },
1004
+ {
1005
+ "completion_length": 718.0571731567383,
1006
+ "epoch": 0.6826666666666666,
1007
+ "grad_norm": 176.6972198486328,
1008
+ "kl": 1.54287109375,
1009
+ "learning_rate": 8.255048961321088e-07,
1010
+ "loss": 0.0618,
1011
+ "reward": 2.1281729131937026,
1012
+ "reward_std": 0.6808584026992321,
1013
+ "rewards/accuracy_reward": 0.714285746216774,
1014
+ "rewards/cosine_scaled_reward": 0.4888871216215193,
1015
+ "rewards/format_reward": 0.0,
1016
+ "rewards/reasoning_steps_reward": 0.9250000536441803,
1017
+ "step": 320
1018
+ },
1019
+ {
1020
+ "completion_length": 721.4732475280762,
1021
+ "epoch": 0.6933333333333334,
1022
+ "grad_norm": 6.025720119476318,
1023
+ "kl": 0.98104248046875,
1024
+ "learning_rate": 7.759966537240373e-07,
1025
+ "loss": 0.0392,
1026
+ "reward": 2.054315000772476,
1027
+ "reward_std": 0.6834255807101727,
1028
+ "rewards/accuracy_reward": 0.6714285992085933,
1029
+ "rewards/cosine_scaled_reward": 0.45312447142787277,
1030
+ "rewards/format_reward": 0.0,
1031
+ "rewards/reasoning_steps_reward": 0.9297619640827179,
1032
+ "step": 325
1033
+ },
1034
+ {
1035
+ "completion_length": 729.3982498168946,
1036
+ "epoch": 0.704,
1037
+ "grad_norm": 6.682721138000488,
1038
+ "kl": 2.40982666015625,
1039
+ "learning_rate": 7.274961913568773e-07,
1040
+ "loss": 0.0964,
1041
+ "reward": 2.0376005843281746,
1042
+ "reward_std": 0.7055317234247923,
1043
+ "rewards/accuracy_reward": 0.6660714562982321,
1044
+ "rewards/cosine_scaled_reward": 0.4655766852200031,
1045
+ "rewards/format_reward": 0.0,
1046
+ "rewards/reasoning_steps_reward": 0.9059524461627007,
1047
+ "step": 330
1048
+ },
1049
+ {
1050
+ "completion_length": 737.005387878418,
1051
+ "epoch": 0.7146666666666667,
1052
+ "grad_norm": 21.818754196166992,
1053
+ "kl": 0.653094482421875,
1054
+ "learning_rate": 6.800710194892484e-07,
1055
+ "loss": 0.0261,
1056
+ "reward": 2.056803268194199,
1057
+ "reward_std": 0.7108213260769844,
1058
+ "rewards/accuracy_reward": 0.6660714574158192,
1059
+ "rewards/cosine_scaled_reward": 0.45680318772792816,
1060
+ "rewards/format_reward": 0.0,
1061
+ "rewards/reasoning_steps_reward": 0.9339286327362061,
1062
+ "step": 335
1063
+ },
1064
+ {
1065
+ "completion_length": 729.6393203735352,
1066
+ "epoch": 0.7253333333333334,
1067
+ "grad_norm": 4.025352954864502,
1068
+ "kl": 0.63848876953125,
1069
+ "learning_rate": 6.33787151823836e-07,
1070
+ "loss": 0.0256,
1071
+ "reward": 1.9720933943986894,
1072
+ "reward_std": 0.6898978160694241,
1073
+ "rewards/accuracy_reward": 0.6250000264495611,
1074
+ "rewards/cosine_scaled_reward": 0.42685523356776683,
1075
+ "rewards/format_reward": 0.0,
1076
+ "rewards/reasoning_steps_reward": 0.9202381581068039,
1077
+ "step": 340
1078
+ },
1079
+ {
1080
+ "completion_length": 699.1571701049804,
1081
+ "epoch": 0.736,
1082
+ "grad_norm": 5.142830848693848,
1083
+ "kl": 0.65721435546875,
1084
+ "learning_rate": 5.887090134192947e-07,
1085
+ "loss": 0.0263,
1086
+ "reward": 2.100009024143219,
1087
+ "reward_std": 0.6496724892407656,
1088
+ "rewards/accuracy_reward": 0.6910714615136385,
1089
+ "rewards/cosine_scaled_reward": 0.4851280112750828,
1090
+ "rewards/format_reward": 0.0,
1091
+ "rewards/reasoning_steps_reward": 0.9238095805048943,
1092
+ "step": 345
1093
+ },
1094
+ {
1095
+ "completion_length": 723.3910995483399,
1096
+ "epoch": 0.7466666666666667,
1097
+ "grad_norm": 4.602946758270264,
1098
+ "kl": 0.394140625,
1099
+ "learning_rate": 5.448993510134669e-07,
1100
+ "loss": 0.0158,
1101
+ "reward": 2.0926264360547067,
1102
+ "reward_std": 0.6916316740214825,
1103
+ "rewards/accuracy_reward": 0.6857143180444837,
1104
+ "rewards/cosine_scaled_reward": 0.4831025514518842,
1105
+ "rewards/format_reward": 0.0,
1106
+ "rewards/reasoning_steps_reward": 0.9238095790147781,
1107
+ "step": 350
1108
+ },
1109
+ {
1110
+ "completion_length": 722.5375305175781,
1111
+ "epoch": 0.7573333333333333,
1112
+ "grad_norm": 6.0756731033325195,
1113
+ "kl": 1.08592529296875,
1114
+ "learning_rate": 5.024191456827498e-07,
1115
+ "loss": 0.0435,
1116
+ "reward": 2.0994770556688307,
1117
+ "reward_std": 0.666194306127727,
1118
+ "rewards/accuracy_reward": 0.6982143167406321,
1119
+ "rewards/cosine_scaled_reward": 0.4917388891801238,
1120
+ "rewards/format_reward": 0.0,
1121
+ "rewards/reasoning_steps_reward": 0.9095238700509072,
1122
+ "step": 355
1123
+ },
1124
+ {
1125
+ "completion_length": 713.5250350952149,
1126
+ "epoch": 0.768,
1127
+ "grad_norm": 7.16264533996582,
1128
+ "kl": 26.27894287109375,
1129
+ "learning_rate": 4.6132752795918667e-07,
1130
+ "loss": 1.0497,
1131
+ "reward": 2.055359125137329,
1132
+ "reward_std": 0.7066416556015611,
1133
+ "rewards/accuracy_reward": 0.6678571753203869,
1134
+ "rewards/cosine_scaled_reward": 0.4732161985710263,
1135
+ "rewards/format_reward": 0.0,
1136
+ "rewards/reasoning_steps_reward": 0.9142857760190963,
1137
+ "step": 360
1138
+ },
1139
+ {
1140
+ "completion_length": 751.5964584350586,
1141
+ "epoch": 0.7786666666666666,
1142
+ "grad_norm": 3.023808002471924,
1143
+ "kl": 1.154327392578125,
1144
+ "learning_rate": 4.2168169552342905e-07,
1145
+ "loss": 0.0462,
1146
+ "reward": 1.9766315311193465,
1147
+ "reward_std": 0.7433438140898943,
1148
+ "rewards/accuracy_reward": 0.6339286021888256,
1149
+ "rewards/cosine_scaled_reward": 0.42544099894585086,
1150
+ "rewards/format_reward": 0.0,
1151
+ "rewards/reasoning_steps_reward": 0.9172619596123696,
1152
+ "step": 365
1153
+ },
1154
+ {
1155
+ "completion_length": 704.278596496582,
1156
+ "epoch": 0.7893333333333333,
1157
+ "grad_norm": 1.0741926431655884,
1158
+ "kl": 0.53934326171875,
1159
+ "learning_rate": 3.8353683358814046e-07,
1160
+ "loss": 0.0216,
1161
+ "reward": 2.0491741001605988,
1162
+ "reward_std": 0.587555892020464,
1163
+ "rewards/accuracy_reward": 0.6678571693599225,
1164
+ "rewards/cosine_scaled_reward": 0.46226926781237127,
1165
+ "rewards/format_reward": 0.0,
1166
+ "rewards/reasoning_steps_reward": 0.9190476790070534,
1167
+ "step": 370
1168
+ },
1169
+ {
1170
+ "completion_length": 738.875033569336,
1171
+ "epoch": 0.8,
1172
+ "grad_norm": 41.52888870239258,
1173
+ "kl": 0.6643310546875,
1174
+ "learning_rate": 3.469460380826697e-07,
1175
+ "loss": 0.0265,
1176
+ "reward": 2.0449665546417237,
1177
+ "reward_std": 0.6989724855870009,
1178
+ "rewards/accuracy_reward": 0.6625000312924385,
1179
+ "rewards/cosine_scaled_reward": 0.4574664521496743,
1180
+ "rewards/format_reward": 0.0,
1181
+ "rewards/reasoning_steps_reward": 0.9250000640749931,
1182
+ "step": 375
1183
+ },
1184
+ {
1185
+ "completion_length": 724.0678855895997,
1186
+ "epoch": 0.8106666666666666,
1187
+ "grad_norm": 4.322193145751953,
1188
+ "kl": 0.7086669921875,
1189
+ "learning_rate": 3.119602417459075e-07,
1190
+ "loss": 0.0284,
1191
+ "reward": 2.055614770948887,
1192
+ "reward_std": 0.6039443843066692,
1193
+ "rewards/accuracy_reward": 0.667857171408832,
1194
+ "rewards/cosine_scaled_reward": 0.46275755076203495,
1195
+ "rewards/format_reward": 0.0,
1196
+ "rewards/reasoning_steps_reward": 0.9250000655651093,
1197
+ "step": 380
1198
+ },
1199
+ {
1200
+ "completion_length": 739.5125350952148,
1201
+ "epoch": 0.8213333333333334,
1202
+ "grad_norm": 4.056361198425293,
1203
+ "kl": 0.7447509765625,
1204
+ "learning_rate": 2.786281432302071e-07,
1205
+ "loss": 0.0298,
1206
+ "reward": 2.0523035705089567,
1207
+ "reward_std": 0.6267267379909753,
1208
+ "rewards/accuracy_reward": 0.6750000279396773,
1209
+ "rewards/cosine_scaled_reward": 0.4463511134439614,
1210
+ "rewards/format_reward": 0.0,
1211
+ "rewards/reasoning_steps_reward": 0.9309524476528168,
1212
+ "step": 385
1213
+ },
1214
+ {
1215
+ "completion_length": 722.548243713379,
1216
+ "epoch": 0.832,
1217
+ "grad_norm": 1.378568410873413,
1218
+ "kl": 0.501007080078125,
1219
+ "learning_rate": 2.46996139315057e-07,
1220
+ "loss": 0.02,
1221
+ "reward": 2.0793206453323365,
1222
+ "reward_std": 0.6533296214416623,
1223
+ "rewards/accuracy_reward": 0.6875000290572644,
1224
+ "rewards/cosine_scaled_reward": 0.4781300783797633,
1225
+ "rewards/format_reward": 0.0,
1226
+ "rewards/reasoning_steps_reward": 0.9136905401945115,
1227
+ "step": 390
1228
+ },
1229
+ {
1230
+ "completion_length": 732.4714630126953,
1231
+ "epoch": 0.8426666666666667,
1232
+ "grad_norm": 2.4824626445770264,
1233
+ "kl": 0.71015625,
1234
+ "learning_rate": 2.1710826032485286e-07,
1235
+ "loss": 0.0284,
1236
+ "reward": 2.1464335188269614,
1237
+ "reward_std": 0.6267410140484572,
1238
+ "rewards/accuracy_reward": 0.7071428874507546,
1239
+ "rewards/cosine_scaled_reward": 0.5136953465640545,
1240
+ "rewards/format_reward": 0.0,
1241
+ "rewards/reasoning_steps_reward": 0.9255953013896943,
1242
+ "step": 395
1243
+ },
1244
+ {
1245
+ "completion_length": 769.8607528686523,
1246
+ "epoch": 0.8533333333333334,
1247
+ "grad_norm": 5.1279401779174805,
1248
+ "kl": 0.787158203125,
1249
+ "learning_rate": 1.8900610884066817e-07,
1250
+ "loss": 0.0315,
1251
+ "reward": 1.9811220198869706,
1252
+ "reward_std": 0.6900037627667188,
1253
+ "rewards/accuracy_reward": 0.6357143126428128,
1254
+ "rewards/cosine_scaled_reward": 0.4329076783033088,
1255
+ "rewards/format_reward": 0.0,
1256
+ "rewards/reasoning_steps_reward": 0.912500062584877,
1257
+ "step": 400
1258
+ },
1259
+ {
1260
+ "epoch": 0.8533333333333334,
1261
+ "eval_completion_length": 738.6478045166016,
1262
+ "eval_kl": 0.67065634765625,
1263
+ "eval_loss": 0.026821324601769447,
1264
+ "eval_reward": 1.9358687758922577,
1265
+ "eval_reward_std": 0.681571420711279,
1266
+ "eval_rewards/accuracy_reward": 0.6160857413113118,
1267
+ "eval_rewards/cosine_scaled_reward": 0.4032782297934056,
1268
+ "eval_rewards/format_reward": 0.0,
1269
+ "eval_rewards/reasoning_steps_reward": 0.9165048221349716,
1270
+ "eval_runtime": 32285.4404,
1271
+ "eval_samples_per_second": 0.155,
1272
+ "eval_steps_per_second": 0.011,
1273
+ "step": 400
1274
+ },
1275
+ {
1276
+ "completion_length": 763.4339599609375,
1277
+ "epoch": 0.864,
1278
+ "grad_norm": 4.143102169036865,
1279
+ "kl": 0.609136962890625,
1280
+ "learning_rate": 1.627288017913383e-07,
1281
+ "loss": 0.0244,
1282
+ "reward": 1.9788720414042473,
1283
+ "reward_std": 0.6925495602190495,
1284
+ "rewards/accuracy_reward": 0.6375000275671482,
1285
+ "rewards/cosine_scaled_reward": 0.42411007191985844,
1286
+ "rewards/format_reward": 0.0,
1287
+ "rewards/reasoning_steps_reward": 0.9172619670629502,
1288
+ "step": 405
1289
+ },
1290
+ {
1291
+ "completion_length": 754.2500312805175,
1292
+ "epoch": 0.8746666666666667,
1293
+ "grad_norm": 4.33268928527832,
1294
+ "kl": 0.9586181640625,
1295
+ "learning_rate": 1.3831291600445573e-07,
1296
+ "loss": 0.0383,
1297
+ "reward": 1.9650759071111679,
1298
+ "reward_std": 0.6423604141920805,
1299
+ "rewards/accuracy_reward": 0.6303571704775095,
1300
+ "rewards/cosine_scaled_reward": 0.4222186904400587,
1301
+ "rewards/format_reward": 0.0,
1302
+ "rewards/reasoning_steps_reward": 0.912500062584877,
1303
+ "step": 410
1304
+ },
1305
+ {
1306
+ "completion_length": 751.8071762084961,
1307
+ "epoch": 0.8853333333333333,
1308
+ "grad_norm": 7.097233295440674,
1309
+ "kl": 0.8556884765625,
1310
+ "learning_rate": 1.1579243729307487e-07,
1311
+ "loss": 0.0342,
1312
+ "reward": 1.9338065341114998,
1313
+ "reward_std": 0.7414230849593878,
1314
+ "rewards/accuracy_reward": 0.6321428898721934,
1315
+ "rewards/cosine_scaled_reward": 0.41178265907801687,
1316
+ "rewards/format_reward": 0.0,
1317
+ "rewards/reasoning_steps_reward": 0.8898810192942619,
1318
+ "step": 415
1319
+ },
1320
+ {
1321
+ "completion_length": 752.8571723937988,
1322
+ "epoch": 0.896,
1323
+ "grad_norm": 3.0274124145507812,
1324
+ "kl": 0.67294921875,
1325
+ "learning_rate": 9.519871314899092e-08,
1326
+ "loss": 0.0269,
1327
+ "reward": 1.9913182631134987,
1328
+ "reward_std": 0.7086525153368711,
1329
+ "rewards/accuracy_reward": 0.6571428876370191,
1330
+ "rewards/cosine_scaled_reward": 0.4359610580140725,
1331
+ "rewards/format_reward": 0.0,
1332
+ "rewards/reasoning_steps_reward": 0.8982143491506577,
1333
+ "step": 420
1334
+ },
1335
+ {
1336
+ "completion_length": 751.7411056518555,
1337
+ "epoch": 0.9066666666666666,
1338
+ "grad_norm": 1.3194289207458496,
1339
+ "kl": 0.722802734375,
1340
+ "learning_rate": 7.656040910844358e-08,
1341
+ "loss": 0.0289,
1342
+ "reward": 2.0188252568244933,
1343
+ "reward_std": 0.7707155652344226,
1344
+ "rewards/accuracy_reward": 0.644642885029316,
1345
+ "rewards/cosine_scaled_reward": 0.44144419142976404,
1346
+ "rewards/format_reward": 0.0,
1347
+ "rewards/reasoning_steps_reward": 0.9327381521463394,
1348
+ "step": 425
1349
+ },
1350
+ {
1351
+ "completion_length": 755.0464630126953,
1352
+ "epoch": 0.9173333333333333,
1353
+ "grad_norm": 4.276956081390381,
1354
+ "kl": 0.9569580078125,
1355
+ "learning_rate": 5.990346885098235e-08,
1356
+ "loss": 0.0383,
1357
+ "reward": 2.000167742371559,
1358
+ "reward_std": 0.7376608021557332,
1359
+ "rewards/accuracy_reward": 0.6589285988360644,
1360
+ "rewards/cosine_scaled_reward": 0.45314384531229734,
1361
+ "rewards/format_reward": 0.0,
1362
+ "rewards/reasoning_steps_reward": 0.8880952954292297,
1363
+ "step": 430
1364
+ },
1365
+ {
1366
+ "completion_length": 727.2536087036133,
1367
+ "epoch": 0.928,
1368
+ "grad_norm": 19.139204025268555,
1369
+ "kl": 1.32947998046875,
1370
+ "learning_rate": 4.5251078087033493e-08,
1371
+ "loss": 0.0532,
1372
+ "reward": 2.039694218337536,
1373
+ "reward_std": 0.6533694989979267,
1374
+ "rewards/accuracy_reward": 0.6732143165543676,
1375
+ "rewards/cosine_scaled_reward": 0.4462417368311435,
1376
+ "rewards/format_reward": 0.0,
1377
+ "rewards/reasoning_steps_reward": 0.9202381521463394,
1378
+ "step": 435
1379
+ },
1380
+ {
1381
+ "completion_length": 734.5536064147949,
1382
+ "epoch": 0.9386666666666666,
1383
+ "grad_norm": 9.922527313232422,
1384
+ "kl": 1.4177001953125,
1385
+ "learning_rate": 3.262363228443427e-08,
1386
+ "loss": 0.0567,
1387
+ "reward": 1.9774114236235618,
1388
+ "reward_std": 0.7198221303522587,
1389
+ "rewards/accuracy_reward": 0.6571428865194321,
1390
+ "rewards/cosine_scaled_reward": 0.4309827778954059,
1391
+ "rewards/format_reward": 0.0,
1392
+ "rewards/reasoning_steps_reward": 0.8892857789993286,
1393
+ "step": 440
1394
+ },
1395
+ {
1396
+ "completion_length": 755.5053962707519,
1397
+ "epoch": 0.9493333333333334,
1398
+ "grad_norm": 3.058717727661133,
1399
+ "kl": 1.02747802734375,
1400
+ "learning_rate": 2.2038708278862952e-08,
1401
+ "loss": 0.0411,
1402
+ "reward": 1.9413904681801797,
1403
+ "reward_std": 0.6192027345299721,
1404
+ "rewards/accuracy_reward": 0.6214285951107741,
1405
+ "rewards/cosine_scaled_reward": 0.41579519272781906,
1406
+ "rewards/format_reward": 0.0,
1407
+ "rewards/reasoning_steps_reward": 0.9041667267680168,
1408
+ "step": 445
1409
+ },
1410
+ {
1411
+ "completion_length": 723.6143173217773,
1412
+ "epoch": 0.96,
1413
+ "grad_norm": 2.64345383644104,
1414
+ "kl": 0.74544677734375,
1415
+ "learning_rate": 1.3511039807673209e-08,
1416
+ "loss": 0.0298,
1417
+ "reward": 2.1570381984114646,
1418
+ "reward_std": 0.6153812855482101,
1419
+ "rewards/accuracy_reward": 0.7089286003261804,
1420
+ "rewards/cosine_scaled_reward": 0.5165619559586048,
1421
+ "rewards/format_reward": 0.0,
1422
+ "rewards/reasoning_steps_reward": 0.9315476790070534,
1423
+ "step": 450
1424
+ },
1425
+ {
1426
+ "completion_length": 728.894679260254,
1427
+ "epoch": 0.9706666666666667,
1428
+ "grad_norm": 2.217505693435669,
1429
+ "kl": 0.678607177734375,
1430
+ "learning_rate": 7.0524970011963675e-09,
1431
+ "loss": 0.0272,
1432
+ "reward": 2.2157696574926375,
1433
+ "reward_std": 0.6317826233804226,
1434
+ "rewards/accuracy_reward": 0.7500000305473804,
1435
+ "rewards/cosine_scaled_reward": 0.5425553207285703,
1436
+ "rewards/format_reward": 0.0,
1437
+ "rewards/reasoning_steps_reward": 0.9232143491506577,
1438
+ "step": 455
1439
+ },
1440
+ {
1441
+ "completion_length": 722.3839637756348,
1442
+ "epoch": 0.9813333333333333,
1443
+ "grad_norm": 3.196773052215576,
1444
+ "kl": 0.709228515625,
1445
+ "learning_rate": 2.6720698600553595e-09,
1446
+ "loss": 0.0284,
1447
+ "reward": 2.122882993519306,
1448
+ "reward_std": 0.599827627837658,
1449
+ "rewards/accuracy_reward": 0.7017857432365417,
1450
+ "rewards/cosine_scaled_reward": 0.5175257750786841,
1451
+ "rewards/format_reward": 0.0,
1452
+ "rewards/reasoning_steps_reward": 0.9035714983940124,
1453
+ "step": 460
1454
+ },
1455
+ {
1456
+ "completion_length": 754.775032043457,
1457
+ "epoch": 0.992,
1458
+ "grad_norm": 8.455827713012695,
1459
+ "kl": 0.835205078125,
1460
+ "learning_rate": 3.7585574148779613e-10,
1461
+ "loss": 0.0334,
1462
+ "reward": 1.9985675051808358,
1463
+ "reward_std": 0.7642196819186211,
1464
+ "rewards/accuracy_reward": 0.6500000316649676,
1465
+ "rewards/cosine_scaled_reward": 0.4402340850589098,
1466
+ "rewards/format_reward": 0.0,
1467
+ "rewards/reasoning_steps_reward": 0.9083333894610405,
1468
+ "step": 465
1469
+ },
1470
+ {
1471
+ "completion_length": 746.1607462565104,
1472
+ "epoch": 0.9984,
1473
+ "kl": 0.8069661458333334,
1474
+ "reward": 2.0161508160332837,
1475
+ "reward_std": 0.7148686709503332,
1476
+ "rewards/accuracy_reward": 0.6636905111372471,
1477
+ "rewards/cosine_scaled_reward": 0.4536507367156446,
1478
+ "rewards/format_reward": 0.0,
1479
+ "rewards/reasoning_steps_reward": 0.898809589445591,
1480
+ "step": 468,
1481
+ "total_flos": 0.0,
1482
+ "train_loss": 4841.422249500714,
1483
+ "train_runtime": 180396.3107,
1484
+ "train_samples_per_second": 0.042,
1485
+ "train_steps_per_second": 0.003
1486
+ }
1487
+ ],
1488
+ "logging_steps": 5,
1489
+ "max_steps": 468,
1490
+ "num_input_tokens_seen": 0,
1491
+ "num_train_epochs": 1,
1492
+ "save_steps": 200,
1493
+ "stateful_callbacks": {
1494
+ "TrainerControl": {
1495
+ "args": {
1496
+ "should_epoch_stop": false,
1497
+ "should_evaluate": false,
1498
+ "should_log": false,
1499
+ "should_save": false,
1500
+ "should_training_stop": false
1501
+ },
1502
+ "attributes": {}
1503
+ }
1504
+ },
1505
+ "total_flos": 0.0,
1506
+ "train_batch_size": 2,
1507
+ "trial_name": null,
1508
+ "trial_params": null
1509
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dce257db8de0248ed189e939e06690f3b2a1bc42bd4d26daa1cac73a1b5e4131
3
  size 7480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1373fdcb93653901ac277fd852565f578e438f5f5b258915dc888ae42be7ad33
3
  size 7480