SZhanZ committed on
Commit
1b1a083
·
1 Parent(s): 2410ef7
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -17,7 +17,10 @@ def extract_bbox_answer(content):
17
  return bbox
18
  return [0, 0, 0, 0]
19
 
20
- def process_image_and_text(image, text, device):
 
 
 
21
  """Process image and text input, return thinking process and bbox"""
22
  question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
23
  QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
@@ -45,7 +48,7 @@ def process_image_and_text(image, text, device):
45
  add_special_tokens=False,
46
  )
47
 
48
- inputs = inputs.to(device)
49
 
50
  with torch.no_grad():
51
  generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
@@ -76,8 +79,10 @@ if __name__ == "__main__":
76
 
77
  # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
78
  model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
79
- device = "cuda" if torch.cuda.is_available() else "cpu"
80
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, device_map=device)
 
 
81
  processor = AutoProcessor.from_pretrained(model_path)
82
 
83
  def gradio_interface(image, text):
 
17
  return bbox
18
  return [0, 0, 0, 0]
19
 
20
+ import spaces
21
+
22
+ @spaces.GPU
23
+ def process_image_and_text(image, text):
24
  """Process image and text input, return thinking process and bbox"""
25
  question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
26
  QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
 
48
  add_special_tokens=False,
49
  )
50
 
51
+ inputs = inputs.to("cuda")
52
 
53
  with torch.no_grad():
54
  generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
 
79
 
80
  # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
81
  model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
82
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
83
+ device = "cuda"
84
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
85
+ model.to(device)
86
  processor = AutoProcessor.from_pretrained(model_path)
87
 
88
  def gradio_interface(image, text):