nhatipoglu committed
Commit bcec177
Parent: 4959b00

Update app.py

Files changed (1)
  1. app.py +14 -11
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
 from qwen_vl_utils import process_vision_info
 import torch
 import base64
@@ -11,12 +11,17 @@ import re
 
 models = {
     "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
-    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
+    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"),
+    torch_dtype="auto", device_map="auto"),
+    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
+    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 }
 
 processors = {
     "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
-    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
+    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
+    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 }
 
 
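Note on the hunk above: in the new file, the line torch_dtype="auto", device_map="auto"), stands alone inside the models dict and is a Python syntax error; it looks like a leftover fragment of the 2B entry directly above it. A minimal sketch of the two registries as the commit appears to intend them, with that fragment dropped (this sketch is not part of the commit):

# Sketch only: model/processor registries without the stray fragment.
from transformers import (
    Qwen2VLForConditionalGeneration, AutoProcessor,
    CLIPModel, CLIPProcessor,
    BlipForConditionalGeneration, BlipProcessor,
)

models = {
    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"),
    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}

processors = {
    "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}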
@@ -52,7 +57,7 @@ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scal
 
 
 @spaces.GPU
-def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-2B-Instruct"):
     model = models[model_id].eval()
     processor = processors[model_id]
 
@@ -92,7 +97,7 @@ def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Ins
     matches = re.findall(pattern, str(output_text))
     parsed_boxes = [[int(num) for num in match] for match in matches]
     scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-    return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)
+    return output_text
 
 css = """
 #output {
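On the two hunks above: the default model for run_example switches from the 7B to the 2B checkpoint, and the function now returns only output_text, so parsed_boxes and scaled_boxes are still computed but no longer surfaced (the draw_bounding_boxes call is dropped). The regex pattern and the body of rescale_bounding_boxes are outside this diff; the sketch below only illustrates the 1000-by-1000 box convention described in the default_system_prompt that a later hunk removes, using a hypothetical helper name and made-up numbers.

# Sketch, assuming the convention from the (removed) default_system_prompt:
# boxes come back as [xmin, ymin, xmax, ymax] on a 1000x1000 grid and must be
# rescaled to the real image size before drawing.
def rescale_boxes_sketch(boxes, original_width, original_height,
                         scaled_width=1000, scaled_height=1000):
    x_ratio = original_width / scaled_width
    y_ratio = original_height / scaled_height
    return [
        [round(xmin * x_ratio), round(ymin * y_ratio),
         round(xmax * x_ratio), round(ymax * y_ratio)]
        for xmin, ymin, xmax, ymax in boxes
    ]

# A box of [0, 0, 500, 250] on the 1000x1000 grid maps to [0, 0, 320, 120]
# for a 640x480 image.
print(rescale_boxes_sketch([[0, 0, 500, 250]], 640, 480))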
@@ -101,8 +106,6 @@ css = """
     border: 1px solid #ccc;
 }
 """
-default_system_prompt = "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
-
 with gr.Blocks(css=css) as demo:
     gr.Markdown(
         """
@@ -125,10 +128,10 @@ with gr.Blocks(css=css) as demo:
 
         gr.Examples(
             examples=[
-                ["assets/2024_09_10_10_58_23.png", "Solve the question", default_system_prompt],
-                ["assets/2024_09_10_10_58_40.png", "Solve the question", default_system_prompt],
-                ["assets/2024_09_10_11_07_31.png", "Solve the question", default_system_prompt],
-                ["assets/comics.jpeg", "Describe the since", default_system_prompt],
+                ["assets/2024_09_10_10_58_23.png", "Solve the question"],
+                ["assets/2024_09_10_10_58_40.png", "Solve the question"],
+                ["assets/2024_09_10_11_07_31.png", "Solve the question"],
+                ["assets/comics.jpeg", "Describe the since"],
             ],
             inputs=[input_img, text_input],
             outputs=[model_output_text],
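On the last two hunks: the default_system_prompt constant is deleted and the example rows drop their third field, so each row now matches inputs=[input_img, text_input] exactly, while run_example still takes a system_prompt argument that must come from some other component; "Describe the since" is presumably meant to read "Describe the scene". Below is a sketch of how the pieces named in this diff would typically be wired inside gr.Blocks; every component other than input_img, text_input and model_output_text is a hypothetical name, not something shown in the commit.

# Sketch only: hypothetical wiring consistent with the inputs/outputs above.
with gr.Blocks(css=css) as demo:
    input_img = gr.Image(type="pil", label="Input image")
    text_input = gr.Textbox(label="Question")
    system_prompt = gr.Textbox(label="System prompt", value="You are a helpful assistant.")
    model_selector = gr.Dropdown(choices=list(models.keys()),
                                 value="Qwen/Qwen2-VL-2B-Instruct", label="Model")
    model_output_text = gr.Textbox(label="Model output", elem_id="output")
    submit_btn = gr.Button("Run")

    gr.Examples(
        examples=[["assets/comics.jpeg", "Describe the scene"]],
        inputs=[input_img, text_input],
        outputs=[model_output_text],
    )

    submit_btn.click(run_example,
                     inputs=[input_img, text_input, system_prompt, model_selector],
                     outputs=[model_output_text])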
 
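The commit registers CLIP and BLIP checkpoints next to the Qwen2-VL models, but run_example still drives its processor through the Qwen chat-template and vision-info path, which CLIPProcessor and BlipProcessor do not provide. A minimal sketch of how those two new registry entries could be exercised on their own, using standard transformers calls (not code from this commit):

# Sketch only: exercising the CLIP and BLIP entries added to the registries.
import torch
from PIL import Image

def caption_with_blip(image: Image.Image) -> str:
    # BLIP: unconditional image captioning.
    processor = processors["Salesforce/blip-image-captioning-base"]
    model = models["Salesforce/blip-image-captioning-base"]
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(out[0], skip_special_tokens=True)

def score_with_clip(image: Image.Image, labels: list) -> dict:
    # CLIP: zero-shot scoring of candidate labels against the image.
    processor = processors["openai/clip-vit-base-patch32"]
    model = models["openai/clip-vit-base-patch32"]
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        probs = model(**inputs).logits_per_image.softmax(dim=-1)[0]
    return dict(zip(labels, probs.tolist()))

# Example usage with one of the demo assets:
# img = Image.open("assets/comics.jpeg")
# print(caption_with_blip(img))
# print(score_with_clip(img, ["a comic strip", "a photograph"]))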