yuchenFan committed
Commit 0e8ab88 · 1 Parent(s): 7dda72b

Update README.md

Files changed (1)
  1. README.md +17 -6
README.md CHANGED
````diff
@@ -69,8 +69,16 @@ We applied \\(L_{CE}\\) to train implicit PRM. We used a learning rate of 5e-7 a
 We show an example leveraging **EurusPRM-Stage1** below:
 
 ```python
+import torch
+from transformers import AutoTokenizer,AutoModelForCausalLM
 coef=0.001
-d = {'query':'111','answer':['111','222']
+d = {'query':'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates. Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$',
+'answer':[
+"Step 1: To convert the point (0,3) from rectangular coordinates to polar coordinates, we need to find the radius (r) and the angle theta (θ).",
+"Step 1: Find the radius (r). The radius is the distance from the origin (0,0) to the point (0,3). Since the x-coordinate is 0, the distance is simply the absolute value of the y-coordinate. So, r = |3| = 3.",
+"Step 2: Find the angle theta (θ). The angle theta is measured counterclockwise from the positive x-axis. Since the point (0,3) lies on the positive y-axis, the angle theta is 90 degrees or π/2 radians.",
+"Step 3: Write the polar coordinates. The polar coordinates are (r, θ), where r > 0 and 0 ≤ θ < 2π. In this case, r = 3 and θ = π/2.\n\nTherefore, the polar coordinates of the point (0,3) are (3, π/2).\n\n\n\\boxed{(3,\\frac{\\pi}{2})}"
+]
 }
 model = AutoModelForCausalLM.from_pretrained('PRIME-RL/EurusPRM-Stage1')
 tokenizer = AutoTokenizer.from_pretrained('PRIME-RL/EurusPRM-Stage1')
````
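In the new example, each entry of `d['answer']` holds one reasoning step, and the steps are joined with blank lines before scoring, so step boundaries stay recoverable in the flat response string. A toy round trip (made-up strings) shows the convention:

```python
answer = ["Step 1: r = |3| = 3.", "Step 2: θ = π/2."]
response = "\n\n".join(answer)  # flat string passed to the chat template
print(response.split("\n\n"))   # step boundaries survive the round trip
```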
````diff
@@ -78,7 +86,7 @@ ref_model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-Math-7B-Instruct'
 input_ids = tokenizer.apply_chat_template([
     {"role": "user", "content": d["query"]},
     {"role": "assistant", "content": "\n\n".join(d["answer"])},
-], tokenize=True, add_generation_prompt=False)
+], tokenize=True, add_generation_prompt=False,return_tensors='pt')
 attention_mask = input_ids!=tokenizer.pad_token_id
 step_last_tokens = []
 for step_num in range(0, len(d["answer"])+1):
````
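The `return_tensors='pt'` addition is what makes the next line work: with `tokenize=True` alone, `apply_chat_template` returns a plain Python list, and `input_ids != tokenizer.pad_token_id` would evaluate to a single `True` instead of an element-wise mask. A minimal sketch of the difference, assuming a hypothetical pad id of 0:

```python
import torch

pad_token_id = 0  # hypothetical pad id, for illustration only
ids = [5, 6, 0, 0]

print(ids != pad_token_id)                  # True: compares the whole list to an int
print(torch.tensor([ids]) != pad_token_id)  # tensor([[ True,  True, False, False]])
```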
````diff
@@ -92,8 +100,10 @@ for step_num in range(0, len(d["answer"])+1):
     currect_ids = tokenizer.encode(conv,add_special_tokens=False)
     step_last_tokens.append(len(currect_ids) - 2)
 
+
 inputs = {'input_ids':input_ids,'attention_mask':attention_mask,'labels':input_ids}
-step_last_tokens = torch.tensor(step_last_tokens)
+label_mask = torch.tensor([[0]*step_last_tokens[0]+[1]*(input_ids.shape[-1]-step_last_tokens[0])])
+step_last_tokens = torch.tensor([step_last_tokens])
 
 def get_logps(model,inputs):
     logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).logits
````
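The diff truncates `get_logps` after its first line. A common way such a helper continues (a sketch under the usual causal-LM shift convention, not necessarily this repository's exact code) gathers each realized token's log-probability:

```python
import torch

def get_logps(model, inputs):
    # Logits at position t predict token t+1, so drop the last logit
    # and the first label before aligning them.
    logits = model(input_ids=inputs['input_ids'],
                   attention_mask=inputs['attention_mask']).logits
    labels = inputs['labels'][:, 1:].clone()
    logits = logits[:, :-1, :]
    # Log-probability the model assigned to each realized token.
    per_token_logps = torch.gather(logits.log_softmax(-1), dim=2,
                                   index=labels.unsqueeze(2)).squeeze(2)
    return per_token_logps
```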
````diff
@@ -105,12 +115,13 @@ def get_logps(model,inputs):
 
 with torch.no_grad():
     per_token_logps = get_logps(model, inputs)
-    per_token_logps = get_logps(ref_model,inputs)
+    ref_per_token_logps = get_logps(ref_model,inputs)
 
 raw_reward = per_token_logps - ref_per_token_logps
-beta_reward = coef * raw_reward
+beta_reward = coef * raw_reward * label_mask[:,1:]
 beta_reward = beta_reward.cumsum(-1)
-beta_reward = beta_reward.gather(dim=-1, index=step_last_tokens[1:])
+beta_reward = beta_reward.gather(dim=-1, index=step_last_tokens[:,1:])
+
 print(beta_reward)
 ```
 
````
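The corrected block computes the implicit process reward: the per-token log-ratio between the PRM and the reference model is scaled by `coef`, masked to the response tokens, accumulated with a cumulative sum, and read off at each step's last token, giving one scalar per step. The same tensor operations on made-up numbers:

```python
import torch

coef = 0.001
# Made-up per-token log-probs for 10 positions; the real script gets these
# from EurusPRM-Stage1 and the Qwen2.5-Math-7B-Instruct reference model.
per_token_logps = torch.tensor([[0.1, -0.2, 0.3, 0.0, 0.4, -0.1, 0.2, 0.1, -0.3, 0.5]])
ref_per_token_logps = torch.zeros_like(per_token_logps)

# Suppose the prompt ends at index 3 and two steps end at indices 5 and 9.
step_last_tokens = torch.tensor([[3, 5, 9]])
label_mask = torch.tensor([[0.] * 3 + [1.] * 7])

raw_reward = per_token_logps - ref_per_token_logps
beta_reward = coef * raw_reward * label_mask  # zero out prompt positions
beta_reward = beta_reward.cumsum(-1)          # running sum of scaled log-ratios
step_rewards = beta_reward.gather(dim=-1, index=step_last_tokens[:, 1:])
print(step_rewards)  # shape (1, 2): one cumulative reward per step
```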
 
 
127