jzhang-x committed on
Commit
1a083a2
·
verified ·
1 Parent(s): dbfc5c8

Model save

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: Qwen/Qwen2.5-7B
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
@@ -11,7 +11,7 @@ licence: license
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jzhang-x-01/huggingface/runs/38wymkvl)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
+ base_model: Qwen/Qwen2.5-0.5B
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
 
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jzhang-x-01/huggingface/runs/gj9p99u7)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010652160639474098,
4
- "train_runtime": 7821.2028,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 0.959,
7
- "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.8635031776502727e-05,
4
+ "train_runtime": 365.6249,
5
+ "train_samples": 750,
6
+ "train_samples_per_second": 2.051,
7
+ "train_steps_per_second": 0.014
8
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "Qwen/Qwen2.5-7B",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
@@ -7,24 +7,24 @@
7
  "bos_token_id": 151643,
8
  "eos_token_id": 151643,
9
  "hidden_act": "silu",
10
- "hidden_size": 3584,
11
  "initializer_range": 0.02,
12
- "intermediate_size": 18944,
13
- "max_position_embeddings": 131072,
14
- "max_window_layers": 28,
15
  "model_type": "qwen2",
16
- "num_attention_heads": 28,
17
- "num_hidden_layers": 28,
18
- "num_key_value_heads": 4,
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": null,
21
  "rope_theta": 1000000.0,
22
- "sliding_window": 131072,
23
- "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
29
- "vocab_size": 152064
30
  }
 
1
  {
2
+ "_name_or_path": "Qwen/Qwen2.5-0.5B",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
7
  "bos_token_id": 151643,
8
  "eos_token_id": 151643,
9
  "hidden_act": "silu",
10
+ "hidden_size": 896,
11
  "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 24,
15
  "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": null,
21
  "rope_theta": 1000000.0,
22
+ "sliding_window": 32768,
23
+ "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
29
+ "vocab_size": 151936
30
  }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5645c1de0b30d0217ecac6927108da17c19791fe8675832336c0dd33c4102e06
3
+ size 988097824
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010652160639474098,
4
- "train_runtime": 7821.2028,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 0.959,
7
- "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.8635031776502727e-05,
4
+ "train_runtime": 365.6249,
5
+ "train_samples": 750,
6
+ "train_samples_per_second": 2.051,
7
+ "train_steps_per_second": 0.014
8
  }
trainer_state.json CHANGED
@@ -1,187 +1,51 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9893390191897654,
5
  "eval_steps": 100,
6
- "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 473.94757080078125,
13
- "epoch": 0.017057569296375266,
14
- "grad_norm": 4.064376038455483,
15
  "kl": 0.0,
16
- "learning_rate": 5e-07,
17
  "loss": 0.0,
18
- "reward": 0.1729910783469677,
19
- "reward_std": 0.27430381812155247,
20
- "rewards/accuracy_reward": 0.1729910783469677,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 485.79745292663574,
26
- "epoch": 0.08528784648187633,
27
- "grad_norm": 1.1351739582457168,
28
- "kl": 0.0002506077289581299,
29
- "learning_rate": 2.5e-06,
30
  "loss": 0.0,
31
- "reward": 0.13504464970901608,
32
- "reward_std": 0.21868262230418622,
33
- "rewards/accuracy_reward": 0.1347656319849193,
34
- "rewards/format_reward": 0.00027901786961592734,
35
- "step": 5
36
- },
37
- {
38
- "completion_length": 568.8616348266602,
39
- "epoch": 0.17057569296375266,
40
- "grad_norm": 0.5575129470407404,
41
- "kl": 0.015078353881835937,
42
- "learning_rate": 2.956412726139078e-06,
43
- "loss": 0.0006,
44
- "reward": 0.3587053745985031,
45
- "reward_std": 0.3362843289971352,
46
- "rewards/accuracy_reward": 0.35848215967416763,
47
- "rewards/format_reward": 0.00022321429569274187,
48
- "step": 10
49
- },
50
- {
51
- "completion_length": 584.8319480895996,
52
- "epoch": 0.255863539445629,
53
- "grad_norm": 0.2277859906277476,
54
- "kl": 0.02491455078125,
55
- "learning_rate": 2.7836719084521715e-06,
56
- "loss": 0.001,
57
- "reward": 0.638169676065445,
58
- "reward_std": 0.24866362921893598,
59
- "rewards/accuracy_reward": 0.638169676065445,
60
  "rewards/format_reward": 0.0,
61
- "step": 15
62
- },
63
- {
64
- "completion_length": 535.3618560791016,
65
- "epoch": 0.3411513859275053,
66
- "grad_norm": 0.23887369640814476,
67
- "kl": 0.0245208740234375,
68
- "learning_rate": 2.4946839873611927e-06,
69
- "loss": 0.001,
70
- "reward": 0.6774553880095482,
71
- "reward_std": 0.2348104739561677,
72
- "rewards/accuracy_reward": 0.6767857447266579,
73
- "rewards/format_reward": 0.0006696428870782256,
74
- "step": 20
75
- },
76
- {
77
- "completion_length": 542.0183288574219,
78
- "epoch": 0.42643923240938164,
79
- "grad_norm": 0.3014825446189458,
80
- "kl": 0.02587432861328125,
81
- "learning_rate": 2.1156192081791355e-06,
82
- "loss": 0.001,
83
- "reward": 0.6584821730852127,
84
- "reward_std": 0.22855298295617105,
85
- "rewards/accuracy_reward": 0.657589316368103,
86
- "rewards/format_reward": 0.0008928571827709675,
87
- "step": 25
88
- },
89
- {
90
- "completion_length": 536.8770370483398,
91
- "epoch": 0.511727078891258,
92
- "grad_norm": 0.21529686209177487,
93
- "kl": 0.0219482421875,
94
- "learning_rate": 1.6808050203829845e-06,
95
- "loss": 0.0009,
96
- "reward": 0.6495536014437675,
97
- "reward_std": 0.2183901211246848,
98
- "rewards/accuracy_reward": 0.6488839566707612,
99
- "rewards/format_reward": 0.0006696428870782256,
100
- "step": 30
101
- },
102
- {
103
- "completion_length": 528.6895355224609,
104
- "epoch": 0.5970149253731343,
105
- "grad_norm": 0.16077481890595569,
106
- "kl": 0.0267181396484375,
107
- "learning_rate": 1.2296174432791415e-06,
108
- "loss": 0.0011,
109
- "reward": 0.6448661029338837,
110
- "reward_std": 0.2145981529727578,
111
- "rewards/accuracy_reward": 0.6441964589059352,
112
- "rewards/format_reward": 0.0006696428870782256,
113
- "step": 35
114
- },
115
- {
116
- "completion_length": 514.0154296875,
117
- "epoch": 0.6823027718550106,
118
- "grad_norm": 0.8368130250058291,
119
- "kl": 0.02160186767578125,
120
- "learning_rate": 8.029152419343472e-07,
121
- "loss": 0.0009,
122
- "reward": 0.6649553939700127,
123
- "reward_std": 0.20347560551017524,
124
- "rewards/accuracy_reward": 0.6647321790456772,
125
- "rewards/format_reward": 0.00022321429569274187,
126
- "step": 40
127
- },
128
- {
129
- "completion_length": 528.3629722595215,
130
- "epoch": 0.767590618336887,
131
- "grad_norm": 0.3144943865256562,
132
- "kl": 0.0201324462890625,
133
- "learning_rate": 4.3933982822017883e-07,
134
- "loss": 0.0008,
135
- "reward": 0.6553571708500385,
136
- "reward_std": 0.21330095445737243,
137
- "rewards/accuracy_reward": 0.6542410992085934,
138
- "rewards/format_reward": 0.0011160714784637094,
139
- "step": 45
140
- },
141
- {
142
- "completion_length": 526.5335037231446,
143
- "epoch": 0.8528784648187633,
144
- "grad_norm": 2.72821016556171,
145
- "kl": 0.027325439453125,
146
- "learning_rate": 1.718159615201853e-07,
147
- "loss": 0.0011,
148
- "reward": 0.6439732402563095,
149
- "reward_std": 0.22680639270693065,
150
- "rewards/accuracy_reward": 0.6437500268220901,
151
- "rewards/format_reward": 0.00022321429569274187,
152
- "step": 50
153
- },
154
- {
155
- "completion_length": 523.5705612182617,
156
- "epoch": 0.9381663113006397,
157
- "grad_norm": 0.41062600235384844,
158
- "kl": 0.0195343017578125,
159
- "learning_rate": 2.4570139579284723e-08,
160
- "loss": 0.0008,
161
- "reward": 0.6685268178582191,
162
- "reward_std": 0.23179882913827896,
163
- "rewards/accuracy_reward": 0.6676339596509934,
164
- "rewards/format_reward": 0.0008928571827709675,
165
- "step": 55
166
  },
167
  {
168
- "completion_length": 517.845261891683,
169
- "epoch": 0.9893390191897654,
170
- "kl": 0.021814982096354168,
171
- "reward": 0.6592262188593546,
172
- "reward_std": 0.20016193420936665,
173
- "rewards/accuracy_reward": 0.6588541939854622,
174
- "rewards/format_reward": 0.0003720238261545698,
175
- "step": 58,
176
  "total_flos": 0.0,
177
- "train_loss": 0.0010652160639474098,
178
- "train_runtime": 7821.2028,
179
- "train_samples_per_second": 0.959,
180
- "train_steps_per_second": 0.007
181
  }
182
  ],
183
  "logging_steps": 5,
184
- "max_steps": 58,
185
  "num_input_tokens_seen": 0,
186
  "num_train_epochs": 1,
187
  "save_steps": 500,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.851063829787234,
5
  "eval_steps": 100,
6
+ "global_step": 5,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 558.3560485839844,
13
+ "epoch": 0.1702127659574468,
14
+ "grad_norm": 0.202052697018779,
15
  "kl": 0.0,
16
+ "learning_rate": 3e-06,
17
  "loss": 0.0,
18
+ "reward": 0.017857143888249993,
19
+ "reward_std": 0.04515197407454252,
20
+ "rewards/accuracy_reward": 0.017857143888249993,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 560.4059982299805,
26
+ "epoch": 0.851063829787234,
27
+ "grad_norm": 0.23003411642703778,
28
+ "kl": 0.0008947253227233887,
29
+ "learning_rate": 0.0,
30
  "loss": 0.0,
31
+ "reward": 0.025948662223527208,
32
+ "reward_std": 0.06277249014237896,
33
+ "rewards/accuracy_reward": 0.025948662223527208,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  "rewards/format_reward": 0.0,
35
+ "step": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
  {
38
+ "epoch": 0.851063829787234,
39
+ "step": 5,
 
 
 
 
 
 
40
  "total_flos": 0.0,
41
+ "train_loss": 2.8635031776502727e-05,
42
+ "train_runtime": 365.6249,
43
+ "train_samples_per_second": 2.051,
44
+ "train_steps_per_second": 0.014
45
  }
46
  ],
47
  "logging_steps": 5,
48
+ "max_steps": 5,
49
  "num_input_tokens_seen": 0,
50
  "num_train_epochs": 1,
51
  "save_steps": 500,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9c97f93968e70f80d79d3367db73181a5a13edc4aad2638a46a6504eb5664cf
3
- size 7928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0331e1487175abe417a3ec2bca598d4d79190d4f1202fbdf1338d0683e148e6b
3
+ size 8056