Model save

Browse files

Files changed (5) hide show

README.md +67 -0
all_results.json +8 -0
generation_config.json +9 -0
train_results.json +8 -0
trainer_state.json +1493 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+library_name: transformers
+model_name: ER-GRPO-alpha99
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for ER-GRPO-alpha99
+This model is a fine-tuned version of a base model whose name was not recorded when this card was generated.
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="lalalaDa/ER-GRPO-alpha99", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.18.1
+- Transformers: 4.52.4
+- PyTorch: 2.5.1
+- Datasets: 3.6.0
+- Tokenizers: 0.21.1
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.00044901110231876373,
+    "train_runtime": 4526.0548,
+    "train_samples": 7000,
+    "train_samples_per_second": 0.53,
+    "train_steps_per_second": 0.011
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.52.4"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.00044901110231876373,
+    "train_runtime": 4526.0548,
+    "train_samples": 7000,
+    "train_samples_per_second": 0.53,
+    "train_steps_per_second": 0.011
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1493 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05714285714285714,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5208333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3128.0,
+      "completions/mean_length": 2584.104248046875,
+      "completions/mean_terminated_length": 1497.2608642578125,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26198074221611023,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": -0.0022,
+      "num_tokens": 131153.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3948305547237396,
+      "reward_std": 0.7732391357421875,
+      "rewards/cosine_scaled_reward/mean": -0.062009382992982864,
+      "rewards/cosine_scaled_reward/std": 0.43048128485679626,
+      "rewards/format_reward/mean": 0.5208333134651184,
+      "rewards/format_reward/std": 0.504852294921875,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3280.0,
+      "completions/mean_length": 2761.666748046875,
+      "completions/mean_terminated_length": 1610.4000244140625,
+      "completions/min_length": 465.0,
+      "completions/min_terminated_length": 465.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2314005047082901,
+      "kl": 0.0,
+      "learning_rate": 2e-07,
+      "loss": -0.0045,
+      "num_tokens": 271243.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.4077601432800293,
+      "reward_std": 0.8425893187522888,
+      "rewards/cosine_scaled_reward/mean": -0.003428752301260829,
+      "rewards/cosine_scaled_reward/std": 0.4935320317745209,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2945.0,
+      "completions/mean_length": 3343.33349609375,
+      "completions/mean_terminated_length": 1658.666748046875,
+      "completions/min_length": 490.0,
+      "completions/min_terminated_length": 490.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19728681445121765,
+      "kl": 0.0006656646728515625,
+      "learning_rate": 4e-07,
+      "loss": 0.0095,
+      "num_tokens": 439577.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.15455231070518494,
+      "reward_std": 0.5764515995979309,
+      "rewards/cosine_scaled_reward/mean": -0.17141447961330414,
+      "rewards/cosine_scaled_reward/std": 0.32203689217567444,
+      "rewards/format_reward/mean": 0.1875,
+      "rewards/format_reward/std": 0.3944427967071533,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.39583333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3458.0,
+      "completions/mean_length": 2226.89599609375,
+      "completions/mean_terminated_length": 1337.7586669921875,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2950705587863922,
+      "kl": 0.0006043116251627604,
+      "learning_rate": 6e-07,
+      "loss": -0.001,
+      "num_tokens": 553824.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.4680083394050598,
+      "reward_std": 0.8357078433036804,
+      "rewards/cosine_scaled_reward/mean": -0.09815327078104019,
+      "rewards/cosine_scaled_reward/std": 0.399366170167923,
+      "rewards/format_reward/mean": 0.6666666865348816,
+      "rewards/format_reward/std": 0.47639307379722595,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7083333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2603.0,
+      "completions/mean_length": 3089.104248046875,
+      "completions/mean_terminated_length": 1887.21435546875,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2482743263244629,
+      "kl": 0.000629425048828125,
+      "learning_rate": 8e-07,
+      "loss": 0.0028,
+      "num_tokens": 710213.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.06966459006071091,
+      "reward_std": 0.7608852386474609,
+      "rewards/cosine_scaled_reward/mean": -0.20167399942874908,
+      "rewards/cosine_scaled_reward/std": 0.3204644024372101,
+      "rewards/format_reward/mean": 0.3333333432674408,
+      "rewards/format_reward/std": 0.47639307379722595,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3421.0,
+      "completions/mean_length": 3119.52099609375,
+      "completions/mean_terminated_length": 1354.5,
+      "completions/min_length": 554.0,
+      "completions/min_terminated_length": 554.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24190759658813477,
+      "kl": 0.0006701151529947916,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 868686.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.026811789721250534,
+      "reward_std": 0.7506579756736755,
+      "rewards/cosine_scaled_reward/mean": -0.1427767425775528,
+      "rewards/cosine_scaled_reward/std": 0.3361252248287201,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5416666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3457.0,
+      "completions/mean_length": 3024.291748046875,
+      "completions/mean_terminated_length": 2362.818359375,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20822857320308685,
+      "kl": 0.0005512237548828125,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.009,
+      "num_tokens": 1021658.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.47669005393981934,
+      "reward_std": 0.9081848859786987,
+      "rewards/cosine_scaled_reward/mean": -0.031290601938962936,
+      "rewards/cosine_scaled_reward/std": 0.47983497381210327,
+      "rewards/format_reward/mean": 0.5416666865348816,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3568.0,
+      "completions/mean_length": 2791.875,
+      "completions/mean_terminated_length": 1582.8421630859375,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23235374689102173,
+      "kl": 0.0005970001220703125,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": -0.0081,
+      "num_tokens": 1163480.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5300650596618652,
+      "reward_std": 0.7924127578735352,
+      "rewards/cosine_scaled_reward/mean": 0.03719766065478325,
+      "rewards/cosine_scaled_reward/std": 0.4377634525299072,
+      "rewards/format_reward/mean": 0.4583333432674408,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7083333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3494.0,
+      "completions/mean_length": 3142.95849609375,
+      "completions/mean_terminated_length": 2071.857177734375,
+      "completions/min_length": 955.0,
+      "completions/min_terminated_length": 955.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21945622563362122,
+      "kl": 0.0006663004557291666,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.002,
+      "num_tokens": 1322934.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.09029825031757355,
+      "reward_std": 0.8250617980957031,
+      "rewards/cosine_scaled_reward/mean": -0.1421239972114563,
+      "rewards/cosine_scaled_reward/std": 0.3718816637992859,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3440.0,
+      "completions/mean_length": 2639.791748046875,
+      "completions/mean_terminated_length": 1066.111083984375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2775964140892029,
+      "kl": 0.0005779266357421875,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": -0.0111,
+      "num_tokens": 1457768.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.31791985034942627,
+      "reward_std": 0.7219366431236267,
+      "rewards/cosine_scaled_reward/mean": -0.03815798461437225,
+      "rewards/cosine_scaled_reward/std": 0.4010634124279022,
+      "rewards/format_reward/mean": 0.3958333432674408,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3528.0,
+      "completions/mean_length": 3260.8125,
+      "completions/mean_terminated_length": 1860.3333740234375,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21065565943717957,
+      "kl": 0.0005486806233723959,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": 0.0145,
+      "num_tokens": 1623041.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.1468753218650818,
+      "reward_std": 0.909512996673584,
+      "rewards/cosine_scaled_reward/mean": -0.18839001655578613,
+      "rewards/cosine_scaled_reward/std": 0.377286821603775,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.41666666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3564.0,
+      "completions/mean_length": 2480.791748046875,
+      "completions/mean_terminated_length": 1692.7857666015625,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33562034368515015,
+      "kl": 0.0005970001220703125,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0138,
+      "num_tokens": 1750327.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.425642192363739,
+      "reward_std": 0.823100745677948,
+      "rewards/cosine_scaled_reward/mean": -0.08819279819726944,
+      "rewards/cosine_scaled_reward/std": 0.44234269857406616,
+      "rewards/format_reward/mean": 0.6041666865348816,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3538.0,
+      "completions/mean_length": 2816.14599609375,
+      "completions/mean_terminated_length": 1741.1500244140625,
+      "completions/min_length": 452.0,
+      "completions/min_terminated_length": 452.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27906712889671326,
+      "kl": 0.0005658467610677084,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0016,
+      "num_tokens": 1893782.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.24736499786376953,
+      "reward_std": 0.7155070304870605,
+      "rewards/cosine_scaled_reward/mean": -0.09444598108530045,
+      "rewards/cosine_scaled_reward/std": 0.4492030441761017,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3369.0,
+      "completions/mean_length": 2769.0,
+      "completions/mean_terminated_length": 1628.0,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27333828806877136,
+      "kl": 0.000553131103515625,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": 0.0017,
+      "num_tokens": 2034650.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1486106812953949,
+      "reward_std": 0.8035473227500916,
+      "rewards/cosine_scaled_reward/mean": -0.1336546093225479,
+      "rewards/cosine_scaled_reward/std": 0.3953794538974762,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3186.0,
+      "completions/mean_length": 2703.08349609375,
+      "completions/mean_terminated_length": 1234.888916015625,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2734379768371582,
+      "kl": 0.0005137125651041666,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": -0.0122,
+      "num_tokens": 2172588.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.406665563583374,
+      "reward_std": 0.3276861608028412,
+      "rewards/cosine_scaled_reward/mean": 0.0168545451015234,
+      "rewards/cosine_scaled_reward/std": 0.4574853479862213,
+      "rewards/format_reward/mean": 0.375,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9791666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2984.0,
+      "completions/mean_length": 3571.5,
+      "completions/mean_terminated_length": 2984.0,
+      "completions/min_length": 2984.0,
+      "completions/min_terminated_length": 2984.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.1932428628206253,
+      "kl": 0.0006643931070963541,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": 0.0075,
+      "num_tokens": 2351850.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.3992506265640259,
+      "reward_std": 0.5042399168014526,
+      "rewards/cosine_scaled_reward/mean": -0.22146178781986237,
+      "rewards/cosine_scaled_reward/std": 0.292772501707077,
+      "rewards/format_reward/mean": 0.0416666679084301,
+      "rewards/format_reward/std": 0.20194092392921448,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.39583333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3475.0,
+      "completions/mean_length": 2287.416748046875,
+      "completions/mean_terminated_length": 1437.9310302734375,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37555888295173645,
+      "kl": 0.0006338755289713541,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0011,
+      "num_tokens": 2469536.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5107091665267944,
+      "reward_std": 0.8238445520401001,
+      "rewards/cosine_scaled_reward/mean": -0.04544559493660927,
+      "rewards/cosine_scaled_reward/std": 0.45671001076698303,
+      "rewards/format_reward/mean": 0.6041666865348816,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3116.0,
+      "completions/mean_length": 2911.89599609375,
+      "completions/mean_terminated_length": 1433.2667236328125,
+      "completions/min_length": 608.0,
+      "completions/min_terminated_length": 608.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21075770258903503,
+      "kl": 0.0006434122721354166,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": 0.0029,
+      "num_tokens": 2617089.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.13376453518867493,
+      "reward_std": 0.6403241157531738,
+      "rewards/cosine_scaled_reward/mean": -0.2234683483839035,
+      "rewards/cosine_scaled_reward/std": 0.2743138074874878,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3400.0,
+      "completions/mean_length": 2844.5,
+      "completions/mean_terminated_length": 1893.71435546875,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24317172169685364,
+      "kl": 0.0006122589111328125,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0105,
+      "num_tokens": 2762067.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.7856847643852234,
+      "reward_std": 0.5978894829750061,
+      "rewards/cosine_scaled_reward/mean": 0.15523308515548706,
+      "rewards/cosine_scaled_reward/std": 0.5373290181159973,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.47916666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3494.0,
+      "completions/mean_length": 2482.83349609375,
+      "completions/mean_terminated_length": 1469.760009765625,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26188376545906067,
+      "kl": 0.0005286534627278646,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0042,
+      "num_tokens": 2889757.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5357545614242554,
+      "reward_std": 0.7750095129013062,
+      "rewards/cosine_scaled_reward/mean": -0.03285994753241539,
+      "rewards/cosine_scaled_reward/std": 0.4009867310523987,
+      "rewards/format_reward/mean": 0.6041666865348816,
+      "rewards/format_reward/std": 0.49420398473739624,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2710.0,
+      "completions/mean_length": 2631.70849609375,
+      "completions/mean_terminated_length": 1178.2105712890625,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31193122267723083,
+      "kl": 0.0006847381591796875,
+      "learning_rate": 7.75e-07,
+      "loss": 0.0002,
+      "num_tokens": 3024185.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.18238481879234314,
+      "reward_std": 0.4078831374645233,
+      "rewards/cosine_scaled_reward/mean": -0.11668267101049423,
+      "rewards/cosine_scaled_reward/std": 0.3962862193584442,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.27083333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3239.0,
+      "completions/mean_length": 1697.2083740234375,
+      "completions/mean_terminated_length": 996.4000244140625,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.40179967880249023,
+      "kl": 0.0006052652994791666,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": -0.005,
+      "num_tokens": 3112413.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.6634411811828613,
+      "reward_std": 0.5782728791236877,
+      "rewards/cosine_scaled_reward/mean": -0.06244581937789917,
+      "rewards/cosine_scaled_reward/std": 0.4282727539539337,
+      "rewards/format_reward/mean": 0.7916666865348816,
+      "rewards/format_reward/std": 0.41041406989097595,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.39583333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3421.0,
+      "completions/mean_length": 2181.104248046875,
+      "completions/mean_terminated_length": 1261.9654541015625,
+      "completions/min_length": 595.0,
+      "completions/min_terminated_length": 595.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3063512444496155,
+      "kl": 0.0006097157796223959,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": 0.0002,
+      "num_tokens": 3225200.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3287263512611389,
+      "reward_std": 0.8908068537712097,
+      "rewards/cosine_scaled_reward/mean": -0.14731089770793915,
+      "rewards/cosine_scaled_reward/std": 0.42148637771606445,
+      "rewards/format_reward/mean": 0.625,
+      "rewards/format_reward/std": 0.48924607038497925,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5416666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3322.0,
+      "completions/mean_length": 2681.229248046875,
+      "completions/mean_terminated_length": 1614.3182373046875,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26260870695114136,
+      "kl": 0.0006230672200520834,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.001,
+      "num_tokens": 3362095.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3754323720932007,
+      "reward_std": 0.7894452810287476,
+      "rewards/cosine_scaled_reward/mean": -0.061340540647506714,
+      "rewards/cosine_scaled_reward/std": 0.4359513223171234,
+      "rewards/format_reward/mean": 0.5,
+      "rewards/format_reward/std": 0.5052911639213562,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3461.0,
+      "completions/mean_length": 2590.166748046875,
+      "completions/mean_terminated_length": 1312.3809814453125,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23011414706707,
+      "kl": 0.0007470448811848959,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": 0.0021,
+      "num_tokens": 3494145.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.3740345239639282,
+      "reward_std": 0.7695837020874023,
+      "rewards/cosine_scaled_reward/mean": -0.03079296462237835,
+      "rewards/cosine_scaled_reward/std": 0.44012707471847534,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3395.0,
+      "completions/mean_length": 2929.666748046875,
+      "completions/mean_terminated_length": 2088.381103515625,
+      "completions/min_length": 879.0,
+      "completions/min_terminated_length": 879.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23103339970111847,
+      "kl": 0.0006039937337239584,
+      "learning_rate": 6.281416799501187e-07,
+      "loss": 0.0037,
+      "num_tokens": 3642743.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2619829773902893,
+      "reward_std": 0.6574144959449768,
+      "rewards/cosine_scaled_reward/mean": -0.10793358087539673,
+      "rewards/cosine_scaled_reward/std": 0.4338338077068329,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6666666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3288.0,
+      "completions/mean_length": 2908.20849609375,
+      "completions/mean_terminated_length": 1556.625,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2881726324558258,
+      "kl": 0.000667572021484375,
+      "learning_rate": 5.97037808470444e-07,
+      "loss": -0.0,
+      "num_tokens": 3790053.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.041869934648275375,
+      "reward_std": 0.7798717021942139,
+      "rewards/cosine_scaled_reward/mean": -0.1560431718826294,
+      "rewards/cosine_scaled_reward/std": 0.29862359166145325,
+      "rewards/format_reward/mean": 0.3541666567325592,
+      "rewards/format_reward/std": 0.4833211302757263,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2977.0,
+      "completions/mean_length": 2831.52099609375,
+      "completions/mean_terminated_length": 1683.0,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24444623291492462,
+      "kl": 0.0005861918131510416,
+      "learning_rate": 5.657047735161255e-07,
+      "loss": 0.0062,
+      "num_tokens": 3933718.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.4484003484249115,
+      "reward_std": 0.8719537258148193,
+      "rewards/cosine_scaled_reward/mean": 0.006576786283403635,
+      "rewards/cosine_scaled_reward/std": 0.4855944514274597,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3091.0,
+      "completions/mean_length": 3182.02099609375,
+      "completions/mean_terminated_length": 1654.5,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22512345016002655,
+      "kl": 0.0006993611653645834,
+      "learning_rate": 5.342952264838747e-07,
+      "loss": 0.0099,
+      "num_tokens": 4094309.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.1955069899559021,
+      "reward_std": 0.668115496635437,
+      "rewards/cosine_scaled_reward/mean": -0.21282805502414703,
+      "rewards/cosine_scaled_reward/std": 0.37752190232276917,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3320.0,
+      "completions/mean_length": 2794.666748046875,
+      "completions/mean_terminated_length": 1779.8095703125,
+      "completions/min_length": 667.0,
+      "completions/min_terminated_length": 667.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22448024153709412,
+      "kl": 0.0006434122721354166,
+      "learning_rate": 5.02962191529556e-07,
+      "loss": 0.0021,
+      "num_tokens": 4236355.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.38854461908340454,
+      "reward_std": 0.8984581828117371,
+      "rewards/cosine_scaled_reward/mean": -0.0443347692489624,
+      "rewards/cosine_scaled_reward/std": 0.44917213916778564,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7708333333333334,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3570.0,
+      "completions/mean_length": 3039.39599609375,
+      "completions/mean_terminated_length": 1207.5455322265625,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22645282745361328,
+      "kl": 0.0006421407063802084,
+      "learning_rate": 4.7185832004988133e-07,
+      "loss": 0.009,
+      "num_tokens": 4390118.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.09765049070119858,
+      "reward_std": 0.6962664127349854,
+      "rewards/cosine_scaled_reward/mean": -0.1740705966949463,
+      "rewards/cosine_scaled_reward/std": 0.4055609405040741,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3517.0,
+      "completions/mean_length": 3097.125,
+      "completions/mean_terminated_length": 2415.5,
+      "completions/min_length": 1046.0,
+      "completions/min_terminated_length": 1046.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19842347502708435,
+      "kl": 0.000629425048828125,
+      "learning_rate": 4.4113514698014953e-07,
+      "loss": -0.0137,
+      "num_tokens": 4546544.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.6700344085693359,
+      "reward_std": 0.7424625158309937,
+      "rewards/cosine_scaled_reward/mean": 0.10753399133682251,
+      "rewards/cosine_scaled_reward/std": 0.5346410274505615,
+      "rewards/format_reward/mean": 0.4583333432674408,
+      "rewards/format_reward/std": 0.5035336017608643,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3484.0,
+      "completions/mean_length": 3236.604248046875,
+      "completions/mean_terminated_length": 2194.416748046875,
+      "completions/min_length": 1039.0,
+      "completions/min_terminated_length": 1039.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19302453100681305,
+      "kl": 0.0005734761555989584,
+      "learning_rate": 4.1094235253127374e-07,
+      "loss": 0.0048,
+      "num_tokens": 4710313.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.12012770771980286,
+      "reward_std": 0.6003495454788208,
+      "rewards/cosine_scaled_reward/mean": -0.1957823485136032,
+      "rewards/cosine_scaled_reward/std": 0.28730008006095886,
+      "rewards/format_reward/mean": 0.2708333432674408,
+      "rewards/format_reward/std": 0.4490928649902344,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.45833333333333337,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3326.0,
+      "completions/mean_length": 2303.45849609375,
+      "completions/mean_terminated_length": 1219.923095703125,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.273879736661911,
+      "kl": 0.0007279713948567709,
+      "learning_rate": 3.8142703296283953e-07,
+      "loss": 0.0012,
+      "num_tokens": 4828043.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.5595396757125854,
+      "reward_std": 0.9288837313652039,
+      "rewards/cosine_scaled_reward/mean": -7.428725803038105e-05,
+      "rewards/cosine_scaled_reward/std": 0.5000401139259338,
+      "rewards/format_reward/mean": 0.5625,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7291666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3498.0,
+      "completions/mean_length": 3022.52099609375,
+      "completions/mean_terminated_length": 1510.84619140625,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25011301040649414,
+      "kl": 0.0006783803304036459,
+      "learning_rate": 3.5273298394491515e-07,
+      "loss": 0.0054,
+      "num_tokens": 4981746.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.1530809849500656,
+      "reward_std": 0.9632071256637573,
+      "rewards/cosine_scaled_reward/mean": -0.07932490855455399,
+      "rewards/cosine_scaled_reward/std": 0.4641749858856201,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3483.0,
+      "completions/mean_length": 3256.479248046875,
+      "completions/mean_terminated_length": 1837.2222900390625,
+      "completions/min_length": 1007.0,
+      "completions/min_terminated_length": 1007.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21851785480976105,
+      "kl": 0.0007712046305338541,
+      "learning_rate": 3.250000000000001e-07,
+      "loss": 0.0035,
+      "num_tokens": 5146391.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": -0.2397136390209198,
+      "reward_std": 0.3913343846797943,
+      "rewards/cosine_scaled_reward/mean": -0.23504245281219482,
+      "rewards/cosine_scaled_reward/std": 0.17867261171340942,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7708333333333334,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3151.0,
+      "completions/mean_length": 3157.70849609375,
+      "completions/mean_terminated_length": 1723.8182373046875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2363603562116623,
+      "kl": 0.0006186167399088541,
+      "learning_rate": 2.9836319343816397e-07,
+      "loss": 0.0063,
+      "num_tokens": 5306229.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.23191750049591064,
+      "reward_std": 0.505261242389679,
+      "rewards/cosine_scaled_reward/mean": -0.24154144525527954,
+      "rewards/cosine_scaled_reward/std": 0.23630201816558838,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2515.0,
+      "completions/mean_length": 3111.854248046875,
+      "completions/mean_terminated_length": 1317.7000732421875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19270718097686768,
+      "kl": 0.0006771087646484375,
+      "learning_rate": 2.729523361034538e-07,
+      "loss": -0.0004,
+      "num_tokens": 5464382.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.06942842155694962,
+      "reward_std": 0.4463602900505066,
+      "rewards/cosine_scaled_reward/mean": -0.07969469577074051,
+      "rewards/cosine_scaled_reward/std": 0.3691597282886505,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5833333333333333,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3387.0,
+      "completions/mean_length": 2799.166748046875,
+      "completions/mean_terminated_length": 1700.4000244140625,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2799771726131439,
+      "kl": 0.0005861918131510416,
+      "learning_rate": 2.488912271385139e-07,
+      "loss": -0.0356,
+      "num_tokens": 5606830.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.41357025504112244,
+      "reward_std": 0.3624642491340637,
+      "rewards/cosine_scaled_reward/mean": -0.04217575863003731,
+      "rewards/cosine_scaled_reward/std": 0.4245593845844269,
+      "rewards/format_reward/mean": 0.5,
+      "rewards/format_reward/std": 0.5052911639213562,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5416666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2945.0,
+      "completions/mean_length": 2401.95849609375,
+      "completions/mean_terminated_length": 1005.0,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2853446900844574,
+      "kl": 0.000675201416015625,
+      "learning_rate": 2.2629708984760706e-07,
+      "loss": -0.0033,
+      "num_tokens": 5729678.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.2566830515861511,
+      "reward_std": 0.573390781879425,
+      "rewards/cosine_scaled_reward/mean": -0.11059689521789551,
+      "rewards/cosine_scaled_reward/std": 0.43331247568130493,
+      "rewards/format_reward/mean": 0.4791666567325592,
+      "rewards/format_reward/std": 0.5048523545265198,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3370.0,
+      "completions/mean_length": 2863.45849609375,
+      "completions/mean_terminated_length": 1662.5555419921875,
+      "completions/min_length": 762.0,
+      "completions/min_terminated_length": 762.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22030523419380188,
+      "kl": 0.0006434122721354166,
+      "learning_rate": 2.0528000059645995e-07,
+      "loss": 0.0144,
+      "num_tokens": 5875488.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.050709377974271774,
+      "reward_std": 0.7291332483291626,
+      "rewards/cosine_scaled_reward/mean": -0.18285124003887177,
+      "rewards/cosine_scaled_reward/std": 0.3616393804550171,
+      "rewards/format_reward/mean": 0.4166666567325592,
+      "rewards/format_reward/std": 0.49822381138801575,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2370.0,
+      "completions/mean_length": 2728.5,
+      "completions/mean_terminated_length": 846.4000244140625,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3805939257144928,
+      "kl": 0.0007483164469401041,
+      "learning_rate": 1.8594235253127372e-07,
+      "loss": 0.001,
+      "num_tokens": 6014226.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.11857573688030243,
+      "reward_std": 0.35950538516044617,
+      "rewards/cosine_scaled_reward/mean": -0.2158358097076416,
+      "rewards/cosine_scaled_reward/std": 0.18257829546928406,
+      "rewards/format_reward/mean": 0.3125,
+      "rewards/format_reward/std": 0.4684174358844757,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2668.0,
+      "completions/mean_length": 3001.479248046875,
+      "completions/mean_terminated_length": 1253.916748046875,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2410728931427002,
+      "kl": 0.0007006327311197916,
+      "learning_rate": 1.6837835672960831e-07,
+      "loss": 0.0025,
+      "num_tokens": 6167009.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.13230201601982117,
+      "reward_std": 0.6667492389678955,
+      "rewards/cosine_scaled_reward/mean": -0.05851660296320915,
+      "rewards/cosine_scaled_reward/std": 0.43021252751350403,
+      "rewards/format_reward/mean": 0.25,
+      "rewards/format_reward/std": 0.4375949800014496,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6041666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3552.0,
+      "completions/mean_length": 2645.104248046875,
+      "completions/mean_terminated_length": 1212.0526123046875,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2952722907066345,
+      "kl": 0.0007654825846354166,
+      "learning_rate": 1.5267358321348285e-07,
+      "loss": 0.0014,
+      "num_tokens": 6301996.0,
+      "policy_entropy_avg": 8.135416666666666,
+      "reward": 0.43852299451828003,
+      "reward_std": 0.8475234508514404,
+      "rewards/cosine_scaled_reward/mean": 0.0016132990131154656,
+      "rewards/cosine_scaled_reward/std": 0.5085917711257935,
+      "rewards/format_reward/mean": 0.4375,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8541666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3505.0,
+      "completions/mean_length": 3466.39599609375,
+      "completions/mean_terminated_length": 2777.571533203125,
+      "completions/min_length": 1678.0,
+      "completions/min_terminated_length": 1678.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19519105553627014,
+      "kl": 0.0006815592447916666,
+      "learning_rate": 1.3890454406082956e-07,
+      "loss": 0.0045,
+      "num_tokens": 6477125.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.16751746833324432,
+      "reward_std": 0.5252600312232971,
+      "rewards/cosine_scaled_reward/mean": -0.030403709039092064,
+      "rewards/cosine_scaled_reward/std": 0.44781333208084106,
+      "rewards/format_reward/mean": 0.2291666716337204,
+      "rewards/format_reward/std": 0.4247443675994873,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7916666666666666,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3388.0,
+      "completions/mean_length": 3097.77099609375,
+      "completions/mean_terminated_length": 1250.0999755859375,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2662387490272522,
+      "kl": 0.0007890065511067709,
+      "learning_rate": 1.2713832064634125e-07,
+      "loss": 0.006,
+      "num_tokens": 6634194.0,
+      "policy_entropy_avg": 8.125,
+      "reward": -0.2356199026107788,
+      "reward_std": 0.4806956648826599,
+      "rewards/cosine_scaled_reward/mean": -0.22256861627101898,
+      "rewards/cosine_scaled_reward/std": 0.2471582442522049,
+      "rewards/format_reward/mean": 0.2083333283662796,
+      "rewards/format_reward/std": 0.41041409969329834,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.47916666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3500.0,
+      "completions/mean_length": 2685.25,
+      "completions/mean_terminated_length": 1858.39990234375,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30878785252571106,
+      "kl": 0.0005480448404947916,
+      "learning_rate": 1.1743223682775649e-07,
+      "loss": 0.0002,
+      "num_tokens": 6770886.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.628765344619751,
+      "reward_std": 0.8911368250846863,
+      "rewards/cosine_scaled_reward/mean": 0.04512912034988403,
+      "rewards/cosine_scaled_reward/std": 0.5223999619483948,
+      "rewards/format_reward/mean": 0.5416666865348816,
+      "rewards/format_reward/std": 0.503533661365509,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2532.0,
+      "completions/mean_length": 2819.6875,
+      "completions/mean_terminated_length": 1138.2000732421875,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25418996810913086,
+      "kl": 0.0007025400797526041,
+      "learning_rate": 1.0983357966978745e-07,
+      "loss": 0.0033,
+      "num_tokens": 6914139.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.09342099726200104,
+      "reward_std": 0.7818130850791931,
+      "rewards/cosine_scaled_reward/mean": -0.11972144991159439,
+      "rewards/cosine_scaled_reward/std": 0.401507169008255,
+      "rewards/format_reward/mean": 0.3333333432674408,
+      "rewards/format_reward/std": 0.47639307379722595,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.47916666666666663,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 3209.0,
+      "completions/mean_length": 2395.6875,
+      "completions/mean_terminated_length": 1302.43994140625,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2804883122444153,
+      "kl": 0.0006554921468098959,
+      "learning_rate": 1.0437936906629334e-07,
+      "loss": -0.0017,
+      "num_tokens": 7036680.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.4345873296260834,
+      "reward_std": 0.7855587005615234,
+      "rewards/cosine_scaled_reward/mean": -0.06286442279815674,
+      "rewards/cosine_scaled_reward/std": 0.4665209949016571,
+      "rewards/format_reward/mean": 0.5625,
+      "rewards/format_reward/std": 0.5013279914855957,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6666666666666667,
+      "completions/max_length": 3584.0,
+      "completions/max_terminated_length": 2765.0,
+      "completions/mean_length": 2816.8125,
+      "completions/mean_terminated_length": 1282.4375,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23478873074054718,
+      "kl": 0.0006268819173177084,
+      "learning_rate": 1.0109617738307911e-07,
+      "loss": -0.0009,
+      "num_tokens": 7179999.0,
+      "policy_entropy_avg": 8.125,
+      "reward": 0.23419660329818726,
+      "reward_std": 0.5556939840316772,
+      "rewards/cosine_scaled_reward/mean": -0.04897995665669441,
+      "rewards/cosine_scaled_reward/std": 0.39337849617004395,
+      "rewards/format_reward/mean": 0.3333333432674408,
+      "rewards/format_reward/std": 0.47639307379722595,
+      "step": 50
+    },
+    {
+      "epoch": 0.05714285714285714,
+      "step": 50,
+      "total_flos": 0.0,
+      "train_loss": 0.00044901110231876373,
+      "train_runtime": 4526.0548,
+      "train_samples_per_second": 0.53,
+      "train_steps_per_second": 0.011
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 7179999,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}