nhatipoglu committed
Commit bcec177
Parent: 4959b00

Update app.py

Files changed (1)
  1. app.py +14 -11
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
 from qwen_vl_utils import process_vision_info
 import torch
 import base64
@@ -11,12 +11,17 @@ import re
 
 models = {
     "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
-    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
+    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"),
+    torch_dtype="auto", device_map="auto"),
+    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
+    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 }
 
 processors = {
     "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
-    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
+    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 }
 
 
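Note on the hunk above: in the new file, the line torch_dtype="auto", device_map="auto"), stands alone inside the models dict and is a Python syntax error; it looks like a leftover fragment of the 2B entry directly above it. A minimal sketch of the two registries as the commit appears to intend them, with that fragment dropped (this sketch is not part of the commit):

# Sketch only: model/processor registries without the stray fragment.
from transformers import (
    Qwen2VLForConditionalGeneration, AutoProcessor,
    CLIPModel, CLIPProcessor,
    BlipForConditionalGeneration, BlipProcessor,
)

models = {
    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"),
    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}

processors = {
    "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}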
@@ -52,7 +57,7 @@ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scal
 
 
 @spaces.GPU
-def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-2B-Instruct"):
     model = models[model_id].eval()
     processor = processors[model_id]
 
@@ -92,7 +97,7 @@ def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Ins
     matches = re.findall(pattern, str(output_text))
     parsed_boxes = [[int(num) for num in match] for match in matches]
     scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-    return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)
+    return output_text
 
 css = """
 #output {
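On the two hunks above: the default model for run_example switches from the 7B to the 2B checkpoint, and the function now returns only output_text, so parsed_boxes and scaled_boxes are still computed but no longer surfaced (the draw_bounding_boxes call is dropped). The regex pattern and the body of rescale_bounding_boxes are outside this diff; the sketch below only illustrates the 1000-by-1000 box convention described in the default_system_prompt that a later hunk removes, using a hypothetical helper name and made-up numbers.

# Sketch, assuming the convention from the (removed) default_system_prompt:
# boxes come back as [xmin, ymin, xmax, ymax] on a 1000x1000 grid and must be
# rescaled to the real image size before drawing.
def rescale_boxes_sketch(boxes, original_width, original_height,
                         scaled_width=1000, scaled_height=1000):
    x_ratio = original_width / scaled_width
    y_ratio = original_height / scaled_height
    return [
        [round(xmin * x_ratio), round(ymin * y_ratio),
         round(xmax * x_ratio), round(ymax * y_ratio)]
        for xmin, ymin, xmax, ymax in boxes
    ]

# A box of [0, 0, 500, 250] on the 1000x1000 grid maps to [0, 0, 320, 120]
# for a 640x480 image.
print(rescale_boxes_sketch([[0, 0, 500, 250]], 640, 480))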
@@ -101,8 +106,6 @@ css = """
     border: 1px solid #ccc;
 }
 """
-default_system_prompt = "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
-
 with gr.Blocks(css=css) as demo:
     gr.Markdown(
         """
@@ -125,10 +128,10 @@ with gr.Blocks(css=css) as demo:
 
         gr.Examples(
             examples=[
-                ["assets/2024_09_10_10_58_23.png", "Solve the question", default_system_prompt],
-                ["assets/2024_09_10_10_58_40.png", "Solve the question", default_system_prompt],
-                ["assets/2024_09_10_11_07_31.png", "Solve the question", default_system_prompt],
-                ["assets/comics.jpeg", "Describe the since", default_system_prompt],
+                ["assets/2024_09_10_10_58_23.png", "Solve the question"],
+                ["assets/2024_09_10_10_58_40.png", "Solve the question"],
+                ["assets/2024_09_10_11_07_31.png", "Solve the question"],
+                ["assets/comics.jpeg", "Describe the since"],
             ],
             inputs=[input_img, text_input],
             outputs=[model_output_text],
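On the last two hunks: the default_system_prompt constant is deleted and the example rows drop their third field, so each row now matches inputs=[input_img, text_input] exactly, while run_example still takes a system_prompt argument that must come from some other component; "Describe the since" is presumably meant to read "Describe the scene". Below is a sketch of how the pieces named in this diff would typically be wired inside gr.Blocks; every component other than input_img, text_input and model_output_text is a hypothetical name, not something shown in the commit.

# Sketch only: hypothetical wiring consistent with the inputs/outputs above.
with gr.Blocks(css=css) as demo:
    input_img = gr.Image(type="pil", label="Input image")
    text_input = gr.Textbox(label="Question")
    system_prompt = gr.Textbox(label="System prompt", value="You are a helpful assistant.")
    model_selector = gr.Dropdown(choices=list(models.keys()),
                                 value="Qwen/Qwen2-VL-2B-Instruct", label="Model")
    model_output_text = gr.Textbox(label="Model output", elem_id="output")
    submit_btn = gr.Button("Run")

    gr.Examples(
        examples=[["assets/comics.jpeg", "Describe the scene"]],
        inputs=[input_img, text_input],
        outputs=[model_output_text],
    )

    submit_btn.click(run_example,
                     inputs=[input_img, text_input, system_prompt, model_selector],
                     outputs=[model_output_text])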
 
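The commit registers CLIP and BLIP checkpoints next to the Qwen2-VL models, but run_example still drives its processor through the Qwen chat-template and vision-info path, which CLIPProcessor and BlipProcessor do not provide. A minimal sketch of how those two new registry entries could be exercised on their own, using standard transformers calls (not code from this commit):

# Sketch only: exercising the CLIP and BLIP entries added to the registries.
import torch
from PIL import Image

def caption_with_blip(image: Image.Image) -> str:
    # BLIP: unconditional image captioning.
    processor = processors["Salesforce/blip-image-captioning-base"]
    model = models["Salesforce/blip-image-captioning-base"]
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(out[0], skip_special_tokens=True)

def score_with_clip(image: Image.Image, labels: list) -> dict:
    # CLIP: zero-shot scoring of candidate labels against the image.
    processor = processors["openai/clip-vit-base-patch32"]
    model = models["openai/clip-vit-base-patch32"]
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        probs = model(**inputs).logits_per_image.softmax(dim=-1)[0]
    return dict(zip(labels, probs.tolist()))

# Example usage with one of the demo assets:
# img = Image.open("assets/comics.jpeg")
# print(caption_with_blip(img))
# print(score_with_clip(img, ["a comic strip", "a photograph"]))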