xiwenc1 committed on
Commit
95784b9
·
verified ·
1 Parent(s): 1bf8304

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +21 -8
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/myopen-rs/huggingface/runs/9mjk517m)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/myopen-rs/huggingface/runs/sh56i1gy)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.005753870498389006,
4
- "train_runtime": 9057.1764,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 2.65,
7
- "train_steps_per_second": 0.055
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.328130582016623e-05,
4
+ "train_runtime": 113.9487,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 210.621,
7
+ "train_steps_per_second": 4.388
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.005753870498389006,
4
- "train_runtime": 9057.1764,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 2.65,
7
- "train_steps_per_second": 0.055
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.328130582016623e-05,
4
+ "train_runtime": 113.9487,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 210.621,
7
+ "train_steps_per_second": 4.388
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5714285714285714,
5
  "eval_steps": 500,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6509,13 +6509,26 @@
6509
  "step": 500
6510
  },
6511
  {
6512
- "epoch": 0.5714285714285714,
6513
- "step": 500,
 
 
 
 
 
 
 
 
 
 
 
 
 
6514
  "total_flos": 0.0,
6515
- "train_loss": 0.005753870498389006,
6516
- "train_runtime": 9057.1764,
6517
- "train_samples_per_second": 2.65,
6518
- "train_steps_per_second": 0.055
6519
  }
6520
  ],
6521
  "logging_steps": 1,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5725714285714286,
5
  "eval_steps": 500,
6
+ "global_step": 501,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6509
  "step": 500
6510
  },
6511
  {
6512
+ "completion_length": 1138.708366394043,
6513
+ "epoch": 0.5725714285714286,
6514
+ "grad_norm": 1.5487465858459473,
6515
+ "kl": 0.29150390625,
6516
+ "learning_rate": 1.0001096618257236e-07,
6517
+ "loss": 0.0117,
6518
+ "reward": 0.21772483922541142,
6519
+ "reward_std": 0.14172286912798882,
6520
+ "rewards/cosine_scaled_reward": -0.009133230894804,
6521
+ "rewards/format_reward": 0.8333333432674408,
6522
+ "step": 501
6523
+ },
6524
+ {
6525
+ "epoch": 0.5725714285714286,
6526
+ "step": 501,
6527
  "total_flos": 0.0,
6528
+ "train_loss": 2.328130582016623e-05,
6529
+ "train_runtime": 113.9487,
6530
+ "train_samples_per_second": 210.621,
6531
+ "train_steps_per_second": 4.388
6532
  }
6533
  ],
6534
  "logging_steps": 1,