Model save

Browse files

Files changed (5) hide show

README.md +67 -0
all_results.json +8 -0
generation_config.json +9 -0
train_results.json +8 -0
trainer_state.json +1493 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+library_name: transformers
+model_name: ER-GRPO-alpha30
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for ER-GRPO-alpha30
+This model is a fine-tuned version of an unspecified base model (the base model name was not recorded when this card was generated).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="lalalaDa/ER-GRPO-alpha30", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.18.1
+- Transformers: 4.52.4
+- PyTorch: 2.5.1
+- Datasets: 3.6.0
+- Tokenizers: 0.21.1
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": -0.45832799572497607,
+    "train_runtime": 4441.2019,
+    "train_samples": 7000,
+    "train_samples_per_second": 0.54,
+    "train_steps_per_second": 0.011
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.52.4"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": -0.45832799572497607,
+    "train_runtime": 4441.2019,
+    "train_samples": 7000,
+    "train_samples_per_second": 0.54,
+    "train_steps_per_second": 0.011
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1493 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05714285714285714,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3128.0,
+      "completions/mean_length": 2584.104248046875,
+      "completions/mean_terminated_length": 1497.2608642578125,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6251945495605469,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": -0.6946,
+      "num_tokens": 131153.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2579294741153717,
+      "reward_std": 0.505131185054779,
+      "rewards/cosine_scaled_reward/mean": -0.062009382992982864,
+      "rewards/cosine_scaled_reward/std": 0.43048128485679626,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.504852294921875,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3280.0,
+      "completions/mean_length": 2761.666748046875,
+      "completions/mean_terminated_length": 1610.4000244140625,
+      "completions/min_length": 465.0,
+      "completions/min_terminated_length": 465.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6243528127670288,
+      "kl": 0.0,
+      "learning_rate": 2e-07,
+      "loss": -1.3501,
+      "num_tokens": 271243.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.26637595891952515,
+      "reward_std": 0.5504351854324341,
+      "rewards/cosine_scaled_reward/mean": -0.003428752301260829,
+      "rewards/cosine_scaled_reward/std": 0.4935320317745209,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9166666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2914.0,
+      "completions/mean_length": 3405.541748046875,
+      "completions/mean_terminated_length": 1442.5,
+      "completions/min_length": 490.0,
+      "completions/min_terminated_length": 490.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6798118352890015,
+      "kl": 0.0006580352783203125,
+      "learning_rate": 4e-07,
+      "loss": 1.3636,
+      "num_tokens": 442563.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.14459262788295746,
+      "reward_std": 0.37038296461105347,
+      "rewards/cosine_scaled_reward/mean": -0.16330842673778534,
+      "rewards/cosine_scaled_reward/std": 0.2756437659263611,
+      "rewards/format_reward/mean": 0.1041666641831398,
+      "rewards/format_reward/std": 0.3087092638015747,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45833333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3581.0,
+      "completions/mean_length": 2397.45849609375,
+      "completions/mean_terminated_length": 1393.4615478515625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0801085233688354,
+      "kl": 0.0006548563639322916,
+      "learning_rate": 6e-07,
+      "loss": -0.4683,
+      "num_tokens": 564997.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.4537672996520996,
+      "reward_std": 0.6287642121315002,
+      "rewards/cosine_scaled_reward/mean": 0.0053017884492874146,
+      "rewards/cosine_scaled_reward/std": 0.44670990109443665,
+      "rewards/format_reward/mean": 0.6875,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7291666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3368.0,
+      "completions/mean_length": 3252.5,
+      "completions/mean_terminated_length": 2360.0,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6357815265655518,
+      "kl": 0.0006510416666666666,
+      "learning_rate": 8e-07,
+      "loss": -0.2046,
+      "num_tokens": 729229.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.025602590292692184,
+      "reward_std": 0.46193015575408936,
+      "rewards/cosine_scaled_reward/mean": -0.15738904476165771,
+      "rewards/cosine_scaled_reward/std": 0.38784292340278625,
+      "rewards/format_reward/mean": 0.3541666567325592,
+      "rewards/format_reward/std": 0.48332110047340393,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3435.0,
+      "completions/mean_length": 3184.45849609375,
+      "completions/mean_terminated_length": 1666.2000732421875,
+      "completions/min_length": 600.0,
+      "completions/min_terminated_length": 600.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.611210286617279,
+      "kl": 0.0006860097249348959,
+      "learning_rate": 1e-06,
+      "loss": 0.6384,
+      "num_tokens": 890819.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.006859039422124624,
+      "reward_std": 0.45094144344329834,
+      "rewards/cosine_scaled_reward/mean": -0.14055712521076202,
+      "rewards/cosine_scaled_reward/std": 0.32825249433517456,
+      "rewards/format_reward/mean": 0.2916666567325592,
+      "rewards/format_reward/std": 0.4593396484851837,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3543.0,
+      "completions/mean_length": 3090.0,
+      "completions/mean_terminated_length": 2336.0,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6517191529273987,
+      "kl": 0.0005308787027994791,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.3634,
+      "num_tokens": 1046945.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3611307144165039,
+      "reward_std": 0.6935849189758301,
+      "rewards/cosine_scaled_reward/mean": 0.017376163974404335,
+      "rewards/cosine_scaled_reward/std": 0.5143836736679077,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3421.0,
+      "completions/mean_length": 2711.27099609375,
+      "completions/mean_terminated_length": 1589.1905517578125,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.013113021850586,
+      "kl": 0.0005372365315755209,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": -1.336,
+      "num_tokens": 1184898.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5312491655349731,
+      "reward_std": 0.5121049880981445,
+      "rewards/cosine_scaled_reward/mean": 0.16906984150409698,
+      "rewards/cosine_scaled_reward/std": 0.48645660281181335,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3575.0,
+      "completions/mean_length": 2904.875,
+      "completions/mean_terminated_length": 1773.0,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7274989485740662,
+      "kl": 0.00063323974609375,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.2595,
+      "num_tokens": 1332924.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.15100964903831482,
+      "reward_std": 0.5337426066398621,
+      "rewards/cosine_scaled_reward/mean": -0.09217208623886108,
+      "rewards/cosine_scaled_reward/std": 0.36159586906433105,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3407.0,
+      "completions/mean_length": 2811.229248046875,
+      "completions/mean_terminated_length": 1111.1334228515625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9453772306442261,
+      "kl": 0.0005699793497721354,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": 0.1,
+      "num_tokens": 1475987.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.001259398995898664,
+      "reward_std": 0.425419420003891,
+      "rewards/cosine_scaled_reward/mean": -0.17611455917358398,
+      "rewards/cosine_scaled_reward/std": 0.3572066128253937,
+      "rewards/format_reward/mean": 0.3541666567325592,
+      "rewards/format_reward/std": 0.48332110047340393,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8958333333333334,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3246.0,
+      "completions/mean_length": 3474.0,
+      "completions/mean_terminated_length": 2528.0,
+      "completions/min_length": 1503.0,
+      "completions/min_terminated_length": 1503.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.8819871544837952,
+      "kl": 0.0006033579508463541,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": 1.6889,
+      "num_tokens": 1651493.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.1606103479862213,
+      "reward_std": 0.441455602645874,
+      "rewards/cosine_scaled_reward/mean": -0.19646306335926056,
+      "rewards/cosine_scaled_reward/std": 0.34247782826423645,
+      "rewards/format_reward/mean": 0.1458333283662796,
+      "rewards/format_reward/std": 0.3566739559173584,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45833333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3479.0,
+      "completions/mean_length": 2459.3125,
+      "completions/mean_terminated_length": 1507.6539306640625,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9987263083457947,
+      "kl": 0.0006459554036458334,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": -0.7409,
+      "num_tokens": 1777748.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1868935525417328,
+      "reward_std": 0.5621633529663086,
+      "rewards/cosine_scaled_reward/mean": -0.13748572766780853,
+      "rewards/cosine_scaled_reward/std": 0.37971118092536926,
+      "rewards/format_reward/mean": 0.5625,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3377.0,
+      "completions/mean_length": 2769.729248046875,
+      "completions/mean_terminated_length": 1722.8095703125,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6482232213020325,
+      "kl": 0.0006001790364583334,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": -0.0518,
+      "num_tokens": 1918975.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.15883508324623108,
+      "reward_std": 0.4439440965652466,
+      "rewards/cosine_scaled_reward/mean": -0.09656915813684464,
+      "rewards/cosine_scaled_reward/std": 0.38122376799583435,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3364.0,
+      "completions/mean_length": 2708.541748046875,
+      "completions/mean_terminated_length": 1756.95654296875,
+      "completions/min_length": 661.0,
+      "completions/min_terminated_length": 661.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1001851558685303,
+      "kl": 0.0006243387858072916,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.6409,
+      "num_tokens": 2056941.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2933271527290344,
+      "reward_std": 0.5121263861656189,
+      "rewards/cosine_scaled_reward/mean": -0.013947081752121449,
+      "rewards/cosine_scaled_reward/std": 0.49587875604629517,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3430.0,
+      "completions/mean_length": 2640.125,
+      "completions/mean_terminated_length": 1318.7000732421875,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0151442289352417,
+      "kl": 0.0005944569905598959,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": -0.6683,
+      "num_tokens": 2191857.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2889822721481323,
+      "reward_std": 0.31458932161331177,
+      "rewards/cosine_scaled_reward/mean": -0.038122642785310745,
+      "rewards/cosine_scaled_reward/std": 0.44473376870155334,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9583333333333334,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3304.0,
+      "completions/mean_length": 3522.1875,
+      "completions/mean_terminated_length": 2100.5,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9443197250366211,
+      "kl": 0.0007228851318359375,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": 2.2731,
+      "num_tokens": 2368752.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.21434035897254944,
+      "reward_std": 0.31254154443740845,
+      "rewards/cosine_scaled_reward/mean": -0.20654386281967163,
+      "rewards/cosine_scaled_reward/std": 0.2568126320838928,
+      "rewards/format_reward/mean": 0.0833333358168602,
+      "rewards/format_reward/std": 0.2793101966381073,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45833333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3076.0,
+      "completions/mean_length": 2299.875,
+      "completions/mean_terminated_length": 1213.3077392578125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.764594554901123,
+      "kl": 0.000736236572265625,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0316,
+      "num_tokens": 2487036.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3496870696544647,
+      "reward_std": 0.5126497745513916,
+      "rewards/cosine_scaled_reward/mean": -0.0018433034420013428,
+      "rewards/cosine_scaled_reward/std": 0.4425233006477356,
+      "rewards/format_reward/mean": 0.5416666865348816,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5416666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3228.0,
+      "completions/mean_length": 2700.3125,
+      "completions/mean_terminated_length": 1655.95458984375,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.8999841213226318,
+      "kl": 0.0006821950276692709,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": -0.6147,
+      "num_tokens": 2624433.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.26751697063446045,
+      "reward_std": 0.4905094802379608,
+      "rewards/cosine_scaled_reward/mean": -0.05463438108563423,
+      "rewards/cosine_scaled_reward/std": 0.447592556476593,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6666666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3405.0,
+      "completions/mean_length": 2786.77099609375,
+      "completions/mean_terminated_length": 1192.3125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0238417387008667,
+      "kl": 0.0006643931070963541,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0253,
+      "num_tokens": 2766640.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3180326819419861,
+      "reward_std": 0.5151762366294861,
+      "rewards/cosine_scaled_reward/mean": 0.03630712628364563,
+      "rewards/cosine_scaled_reward/std": 0.47822093963623047,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3561.0,
+      "completions/mean_length": 2221.1875,
+      "completions/mean_terminated_length": 1403.5001220703125,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0431965589523315,
+      "kl": 0.0006084442138671875,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.6551,
+      "num_tokens": 2881771.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3996829688549042,
+      "reward_std": 0.34862709045410156,
+      "rewards/cosine_scaled_reward/mean": -0.06755157560110092,
+      "rewards/cosine_scaled_reward/std": 0.39543306827545166,
+      "rewards/format_reward/mean": 0.75,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3408.0,
+      "completions/mean_length": 2486.52099609375,
+      "completions/mean_terminated_length": 1293.6087646484375,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1584967374801636,
+      "kl": 0.0008227030436197916,
+      "learning_rate": 7.75e-07,
+      "loss": -0.3451,
+      "num_tokens": 3009230.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.08958987891674042,
+      "reward_std": 0.3577960431575775,
+      "rewards/cosine_scaled_reward/mean": -0.17066805064678192,
+      "rewards/cosine_scaled_reward/std": 0.31892454624176025,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.22916666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3488.0,
+      "completions/mean_length": 1677.9583740234375,
+      "completions/mean_terminated_length": 1111.29736328125,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.2852933406829834,
+      "kl": 0.000949859619140625,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": -2.4718,
+      "num_tokens": 3096534.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5749191045761108,
+      "reward_std": 0.3676217198371887,
+      "rewards/cosine_scaled_reward/mean": 0.015162050724029541,
+      "rewards/cosine_scaled_reward/std": 0.4574022591114044,
+      "rewards/format_reward/mean": 0.8541666865348816,
+      "rewards/format_reward/std": 0.3566739857196808,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3523.0,
+      "completions/mean_length": 2406.979248046875,
+      "completions/mean_terminated_length": 1491.5185546875,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7508360147476196,
+      "kl": 0.0007603963216145834,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": 0.5997,
+      "num_tokens": 3220163.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.29059678316116333,
+      "reward_std": 0.5978084802627563,
+      "rewards/cosine_scaled_reward/mean": -0.06813069432973862,
+      "rewards/cosine_scaled_reward/std": 0.44070079922676086,
+      "rewards/format_reward/mean": 0.5833333134651184,
+      "rewards/format_reward/std": 0.49822378158569336,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3552.0,
+      "completions/mean_length": 2770.39599609375,
+      "completions/mean_terminated_length": 1956.791748046875,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.5464559197425842,
+      "kl": 0.0007966359456380209,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": -0.6853,
+      "num_tokens": 3361338.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2939620018005371,
+      "reward_std": 0.60921311378479,
+      "rewards/cosine_scaled_reward/mean": -0.034292057156562805,
+      "rewards/cosine_scaled_reward/std": 0.44030100107192993,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2654.0,
+      "completions/mean_length": 2720.8125,
+      "completions/mean_terminated_length": 1403.3157958984375,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.8707253336906433,
+      "kl": 0.000972747802734375,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": 0.227,
+      "num_tokens": 3499659.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.19027817249298096,
+      "reward_std": 0.5284578204154968,
+      "rewards/cosine_scaled_reward/mean": -0.051548827439546585,
+      "rewards/cosine_scaled_reward/std": 0.4388459026813507,
+      "rewards/format_reward/mean": 0.3958333432674408,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3077.0,
+      "completions/mean_length": 2909.729248046875,
+      "completions/mean_terminated_length": 1880.5789794921875,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9067810773849487,
+      "kl": 0.0007839202880859375,
+      "learning_rate": 6.281416799501187e-07,
+      "loss": 0.0152,
+      "num_tokens": 3647300.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1333228200674057,
+      "reward_std": 0.3536017835140228,
+      "rewards/cosine_scaled_reward/mean": -0.1057773232460022,
+      "rewards/cosine_scaled_reward/std": 0.3763800859451294,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6458333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3495.0,
+      "completions/mean_length": 2900.666748046875,
+      "completions/mean_terminated_length": 1654.5882568359375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.270411729812622,
+      "kl": 0.00095367431640625,
+      "learning_rate": 5.97037808470444e-07,
+      "loss": -0.0823,
+      "num_tokens": 3794248.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1146991103887558,
+      "reward_std": 0.4322494864463806,
+      "rewards/cosine_scaled_reward/mean": -0.10968658328056335,
+      "rewards/cosine_scaled_reward/std": 0.319176584482193,
+      "rewards/format_reward/mean": 0.3958333432674408,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3292.0,
+      "completions/mean_length": 2623.14599609375,
+      "completions/mean_terminated_length": 1578.7391357421875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2852171659469604,
+      "kl": 0.0007775624593098959,
+      "learning_rate": 5.657047735161255e-07,
+      "loss": -0.6041,
+      "num_tokens": 3927911.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.45130008459091187,
+      "reward_std": 0.44697779417037964,
+      "rewards/cosine_scaled_reward/mean": 0.08673719316720963,
+      "rewards/cosine_scaled_reward/std": 0.49450618028640747,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3166.0,
+      "completions/mean_length": 3315.20849609375,
+      "completions/mean_terminated_length": 2150.444580078125,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.5690914392471313,
+      "kl": 0.0009403228759765625,
+      "learning_rate": 5.342952264838747e-07,
+      "loss": 0.4887,
+      "num_tokens": 4094895.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": 0.017309244722127914,
+      "reward_std": 0.44446587562561035,
+      "rewards/cosine_scaled_reward/mean": -0.10126852989196777,
+      "rewards/cosine_scaled_reward/std": 0.3542354702949524,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3493.0,
+      "completions/mean_length": 2867.52099609375,
+      "completions/mean_terminated_length": 1864.4500732421875,
+      "completions/min_length": 716.0,
+      "completions/min_terminated_length": 716.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0161747932434082,
+      "kl": 0.0011774698893229167,
+      "learning_rate": 5.02962191529556e-07,
+      "loss": 0.2395,
+      "num_tokens": 4240438.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1694916933774948,
+      "reward_std": 0.6536720395088196,
+      "rewards/cosine_scaled_reward/mean": -0.09878844022750854,
+      "rewards/cosine_scaled_reward/std": 0.39795705676078796,
+      "rewards/format_reward/mean": 0.4583333432674408,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7291666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3384.0,
+      "completions/mean_length": 2928.875,
+      "completions/mean_terminated_length": 1165.0770263671875,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1587620973587036,
+      "kl": 0.000823974609375,
+      "learning_rate": 4.7185832004988133e-07,
+      "loss": 0.1645,
+      "num_tokens": 4388896.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.0713512971997261,
+      "reward_std": 0.43281668424606323,
+      "rewards/cosine_scaled_reward/mean": -0.0909477099776268,
+      "rewards/cosine_scaled_reward/std": 0.3611638844013214,
+      "rewards/format_reward/mean": 0.2916666567325592,
+      "rewards/format_reward/std": 0.4593396782875061,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2794.0,
+      "completions/mean_length": 2802.9375,
+      "completions/mean_terminated_length": 1610.7894287109375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7285576462745667,
+      "kl": 0.0011965433756510417,
+      "learning_rate": 4.4113514698014953e-07,
+      "loss": 0.5699,
+      "num_tokens": 4531201.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.13415873050689697,
+      "reward_std": 0.4497126042842865,
+      "rewards/cosine_scaled_reward/mean": -0.09471765905618668,
+      "rewards/cosine_scaled_reward/std": 0.3660888373851776,
+      "rewards/format_reward/mean": 0.3958333432674408,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6666666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3527.0,
+      "completions/mean_length": 3081.33349609375,
+      "completions/mean_terminated_length": 2076.0,
+      "completions/min_length": 765.0,
+      "completions/min_terminated_length": 765.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.5806891322135925,
+      "kl": 0.0007527669270833334,
+      "learning_rate": 4.1094235253127374e-07,
+      "loss": -0.155,
+      "num_tokens": 4687517.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.052633434534072876,
+      "reward_std": 0.5880012512207031,
+      "rewards/cosine_scaled_reward/mean": -0.14701275527477264,
+      "rewards/cosine_scaled_reward/std": 0.3462415635585785,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.41666666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3110.0,
+      "completions/mean_length": 2213.104248046875,
+      "completions/mean_terminated_length": 1233.8929443359375,
+      "completions/min_length": 582.0,
+      "completions/min_terminated_length": 582.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0047067403793335,
+      "kl": 0.0015347798665364583,
+      "learning_rate": 3.8142703296283953e-07,
+      "loss": 0.6987,
+      "num_tokens": 4800910.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.43960699439048767,
+      "reward_std": 0.5042138695716858,
+      "rewards/cosine_scaled_reward/mean": 0.025659168139100075,
+      "rewards/cosine_scaled_reward/std": 0.5110981464385986,
+      "rewards/format_reward/mean": 0.625,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3482.0,
+      "completions/mean_length": 2852.125,
+      "completions/mean_terminated_length": 1827.5,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6956940293312073,
+      "kl": 0.0008710225423177084,
+      "learning_rate": 3.5273298394491515e-07,
+      "loss": 0.023,
+      "num_tokens": 4946434.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.27234378457069397,
+      "reward_std": 0.6467978954315186,
+      "rewards/cosine_scaled_reward/mean": -0.030088132247328758,
+      "rewards/cosine_scaled_reward/std": 0.4617981016635895,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3373.0,
+      "completions/mean_length": 3279.95849609375,
+      "completions/mean_terminated_length": 2124.60009765625,
+      "completions/min_length": 920.0,
+      "completions/min_terminated_length": 920.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6212595105171204,
+      "kl": 0.0012715657552083333,
+      "learning_rate": 3.250000000000001e-07,
+      "loss": 0.6297,
+      "num_tokens": 5112206.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.14957016706466675,
+      "reward_std": 0.34586209058761597,
+      "rewards/cosine_scaled_reward/mean": -0.2400539517402649,
+      "rewards/cosine_scaled_reward/std": 0.1841082125902176,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3444.0,
+      "completions/mean_length": 3138.104248046875,
+      "completions/mean_terminated_length": 1800.416748046875,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7584943771362305,
+      "kl": 0.0009466807047526041,
+      "learning_rate": 2.9836319343816397e-07,
+      "loss": 1.1086,
+      "num_tokens": 5271103.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.1891360729932785,
+      "reward_std": 0.35265272855758667,
+      "rewards/cosine_scaled_reward/mean": -0.27048927545547485,
+      "rewards/cosine_scaled_reward/std": 0.19963285326957703,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8333333333333334,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2923.0,
+      "completions/mean_length": 3194.5,
+      "completions/mean_terminated_length": 1247.0,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9534838795661926,
+      "kl": 0.0009625752766927084,
+      "learning_rate": 2.729523361034538e-07,
+      "loss": 0.6428,
+      "num_tokens": 5433223.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.03065665066242218,
+      "reward_std": 0.27790385484695435,
+      "rewards/cosine_scaled_reward/mean": -0.10691537708044052,
+      "rewards/cosine_scaled_reward/std": 0.2967289388179779,
+      "rewards/format_reward/mean": 0.1666666716337204,
+      "rewards/format_reward/std": 0.3766217827796936,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5416666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3191.0,
+      "completions/mean_length": 2574.58349609375,
+      "completions/mean_terminated_length": 1381.6363525390625,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 38.30278396606445,
+      "kl": 0.0010172526041666667,
+      "learning_rate": 2.488912271385139e-07,
+      "loss": -21.1377,
+      "num_tokens": 5564891.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.24889466166496277,
+      "reward_std": 0.4223267734050751,
+      "rewards/cosine_scaled_reward/mean": -0.03770924732089043,
+      "rewards/cosine_scaled_reward/std": 0.4333648681640625,
+      "rewards/format_reward/mean": 0.4583333432674408,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3383.0,
+      "completions/mean_length": 2522.6875,
+      "completions/mean_terminated_length": 1461.375,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7990902662277222,
+      "kl": 0.0011386871337890625,
+      "learning_rate": 2.2629708984760706e-07,
+      "loss": -0.3002,
+      "num_tokens": 5693534.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.23835521936416626,
+      "reward_std": 0.3610273599624634,
+      "rewards/cosine_scaled_reward/mean": -0.07706651836633682,
+      "rewards/cosine_scaled_reward/std": 0.39551421999931335,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3214.0,
+      "completions/mean_length": 2528.33349609375,
+      "completions/mean_terminated_length": 1472.666748046875,
+      "completions/min_length": 547.0,
+      "completions/min_terminated_length": 547.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0874838829040527,
+      "kl": 0.001148223876953125,
+      "learning_rate": 2.0528000059645995e-07,
+      "loss": -0.3434,
+      "num_tokens": 5823258.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3574024438858032,
+      "reward_std": 0.6161948442459106,
+      "rewards/cosine_scaled_reward/mean": 0.01450828742235899,
+      "rewards/cosine_scaled_reward/std": 0.46221035718917847,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6458333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2705.0,
+      "completions/mean_length": 2656.416748046875,
+      "completions/mean_terminated_length": 964.941162109375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.283677339553833,
+      "kl": 0.0011049906412760417,
+      "learning_rate": 1.8594235253127372e-07,
+      "loss": 0.27,
+      "num_tokens": 5958536.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.12885941565036774,
+      "reward_std": 0.28405821323394775,
+      "rewards/cosine_scaled_reward/mean": -0.2866226136684418,
+      "rewards/cosine_scaled_reward/std": 0.1905842125415802,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2993.0,
+      "completions/mean_length": 2602.354248046875,
+      "completions/mean_terminated_length": 1228.050048828125,
+      "completions/min_length": 586.0,
+      "completions/min_terminated_length": 586.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7168472409248352,
+      "kl": 0.0010967254638671875,
+      "learning_rate": 1.6837835672960831e-07,
+      "loss": -0.0951,
+      "num_tokens": 6092161.0,
+      "policy_entropy_avg": 8.114583333333334,
+      "reward": 0.21493911743164062,
+      "reward_std": 0.4637143015861511,
+      "rewards/cosine_scaled_reward/mean": -0.07424557209014893,
+      "rewards/cosine_scaled_reward/std": 0.43746232986450195,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3087.0,
+      "completions/mean_length": 2491.875,
+      "completions/mean_terminated_length": 962.9000244140625,
+      "completions/min_length": 340.0,
+      "completions/min_terminated_length": 340.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7064040303230286,
+      "kl": 0.0012423197428385417,
+      "learning_rate": 1.5267358321348285e-07,
+      "loss": 0.4683,
+      "num_tokens": 6219793.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2282370775938034,
+      "reward_std": 0.4510888457298279,
+      "rewards/cosine_scaled_reward/mean": -0.04318302869796753,
+      "rewards/cosine_scaled_reward/std": 0.4827073812484741,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3524.0,
+      "completions/mean_length": 3407.64599609375,
+      "completions/mean_terminated_length": 2643.444580078125,
+      "completions/min_length": 1086.0,
+      "completions/min_terminated_length": 1086.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6878222227096558,
+      "kl": 0.0010045369466145833,
+      "learning_rate": 1.3890454406082956e-07,
+      "loss": 0.5395,
+      "num_tokens": 6392102.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": 0.18636834621429443,
+      "reward_std": 0.656842827796936,
+      "rewards/cosine_scaled_reward/mean": -0.012889747507870197,
+      "rewards/cosine_scaled_reward/std": 0.47439250349998474,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6666666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 1561.0,
+      "completions/mean_length": 2694.75,
+      "completions/mean_terminated_length": 916.25,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1607235670089722,
+      "kl": 0.0015538533528645833,
+      "learning_rate": 1.2713832064634125e-07,
+      "loss": -0.0955,
+      "num_tokens": 6529826.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.13178138434886932,
+      "reward_std": 0.2914193272590637,
+      "rewards/cosine_scaled_reward/mean": -0.26803696155548096,
+      "rewards/cosine_scaled_reward/std": 0.174865260720253,
+      "rewards/format_reward/mean": 0.3333333432674408,
+      "rewards/format_reward/std": 0.47639307379722595,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45833333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3485.0,
+      "completions/mean_length": 2533.6875,
+      "completions/mean_terminated_length": 1644.9615478515625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.012225389480591,
+      "kl": 0.000888824462890625,
+      "learning_rate": 1.1743223682775649e-07,
+      "loss": -1.3532,
+      "num_tokens": 6659243.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5446165800094604,
+      "reward_std": 0.5056490898132324,
+      "rewards/cosine_scaled_reward/mean": 0.11685246229171753,
+      "rewards/cosine_scaled_reward/std": 0.5205204486846924,
+      "rewards/format_reward/mean": 0.6041666865348816,
+      "rewards/format_reward/std": 0.49420401453971863,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3350.0,
+      "completions/mean_length": 2581.33349609375,
+      "completions/mean_terminated_length": 1491.478271484375,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6057020425796509,
+      "kl": 0.00128936767578125,
+      "learning_rate": 1.0983357966978745e-07,
+      "loss": 0.4498,
+      "num_tokens": 6791055.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.21042108535766602,
+      "reward_std": 0.5790350437164307,
+      "rewards/cosine_scaled_reward/mean": -0.10897094756364822,
+      "rewards/cosine_scaled_reward/std": 0.40693506598472595,
+      "rewards/format_reward/mean": 0.5416666865348816,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.39583333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3352.0,
+      "completions/mean_length": 2207.791748046875,
+      "completions/mean_terminated_length": 1306.137939453125,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.9485021233558655,
+      "kl": 0.0012709299723307292,
+      "learning_rate": 1.0437936906629334e-07,
+      "loss": -0.5865,
+      "num_tokens": 6904577.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2760649025440216,
+      "reward_std": 0.5270255208015442,
+      "rewards/cosine_scaled_reward/mean": -0.10014239698648453,
+      "rewards/cosine_scaled_reward/std": 0.42991697788238525,
+      "rewards/format_reward/mean": 0.625,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2808.0,
+      "completions/mean_length": 2786.791748046875,
+      "completions/mean_terminated_length": 1032.933349609375,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7447382807731628,
+      "kl": 0.001110076904296875,
+      "learning_rate": 1.0109617738307911e-07,
+      "loss": -0.2738,
+      "num_tokens": 7046455.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": 0.2738919258117676,
+      "reward_std": 0.44107958674430847,
+      "rewards/cosine_scaled_reward/mean": 0.023186095058918,
+      "rewards/cosine_scaled_reward/std": 0.3980216383934021,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 50
+    },
+    {
+      "epoch": 0.05714285714285714,
+      "step": 50,
+      "total_flos": 0.0,
+      "train_loss": -0.45832799572497607,
+      "train_runtime": 4441.2019,
+      "train_samples_per_second": 0.54,
+      "train_steps_per_second": 0.011
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 7046455,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}