SZhanZ committed on
Commit
1b1a083
·
1 Parent(s): 2410ef7
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -17,7 +17,10 @@ def extract_bbox_answer(content):
17
  return bbox
18
  return [0, 0, 0, 0]
19
 
20
- def process_image_and_text(image, text, device):
 
 
 
21
  """Process image and text input, return thinking process and bbox"""
22
  question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
23
  QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
@@ -45,7 +48,7 @@ def process_image_and_text(image, text, device):
45
  add_special_tokens=False,
46
  )
47
 
48
- inputs = inputs.to(device)
49
 
50
  with torch.no_grad():
51
  generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
@@ -76,8 +79,10 @@ if __name__ == "__main__":
76
 
77
  # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
78
  model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
79
- device = "cuda" if torch.cuda.is_available() else "cpu"
80
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, device_map=device)
 
 
81
  processor = AutoProcessor.from_pretrained(model_path)
82
 
83
  def gradio_interface(image, text):
 
17
  return bbox
18
  return [0, 0, 0, 0]
19
 
20
+ import spaces
21
+
22
+ @spaces.GPU
23
+ def process_image_and_text(image, text):
24
  """Process image and text input, return thinking process and bbox"""
25
  question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
26
  QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
 
48
  add_special_tokens=False,
49
  )
50
 
51
+ inputs = inputs.to("cuda")
52
 
53
  with torch.no_grad():
54
  generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
 
79
 
80
  # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
81
  model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
82
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
83
+ device = "cuda"
84
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
85
+ model.to(device)
86
  processor = AutoProcessor.from_pretrained(model_path)
87
 
88
  def gradio_interface(image, text):