nhatipoglu committed on
Commit b251db3
1 Parent(s): 8677efd

add app files2
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+
+ .idea
.idea/misc.xml CHANGED
@@ -1,5 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
+ <component name="Black">
+ <option name="sdkName" value="Python 3.12 (linkedIn_auto_jobs_applier_with_AI)" />
+ </component>
<component name="GithubDefaultAccount">
<option name="defaultAccountId" value="16dd0ba3-f1ec-4fdf-9c62-48bd69c3904d" />
</component>
README.md CHANGED
@@ -1,45 +1,9 @@
- ### README.md for Multi-Model Object Detection Demo
-
- ---
-
- # Multi-Model Object Detection Demo
-
- This repository provides a demo application that uses multiple state-of-the-art vision-language models for various tasks such as object detection, image captioning, visual question answering, and image-text matching. The demo is built using Gradio for the user interface and leverages Hugging Face's `transformers` library to load and run various pre-trained models.
-
- ## Available Models
-
- The following models are available in the demo:
-
- **Qwen2-VL (7B, 2B, 5B, 1B):** Vision-language models optimized for object detection, question-answering, and image description tasks.
- **BLIP:** Specialized in image captioning and visual question-answering.
- **CLIP:** Uses contrastive learning for image-text matching.
-
- ## Usage
-
- To use the demo:
-
- 1. **Input an Image:** Upload an image that you want to analyze.
- 2. **Select a Model:** Choose a model from the dropdown list to perform the desired task.
- 3. **Provide a System Prompt:** Optionally, enter a system prompt to guide the model's behavior.
- 4. **Enter a User Prompt:** Describe the object or task you want the model to perform.
- 5. **Submit:** Click the "Submit" button to run the model and display the results.
-
- ## Getting Started
-
-
- ### Example Inputs
-
- The demo provides some pre-configured examples to try:
-
- **Image 1:** Detect goats in an image.
- **Image 2:** Find a blue button in the image.
- **Image 3:** Describe a person on a bike.
- **Image 4:** Solve questions from a screenshot.
- **Image 5:** Describe various images such as landscapes, animals, or objects.
-
- ## Available Functions
-
- `run_example`: Core function to process the input image and prompts, run the selected model, and return the results.
- `image_to_base64`: Converts an image to a base64 encoded string for model processing.
- `draw_bounding_boxes`: Draws bounding boxes around detected objects in the image.
- `rescale_bounding_boxes`: Rescales bounding boxes to the original image dimensions.
+ title: Qwen2 VL Localization
+ emoji: 📉
+ colorFrom: pink
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.42.0
+ app_file: app.py
+ pinned: false
+ license: mit
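Note: the new header above configures the repository as a Gradio Space (`sdk: gradio`, `app_file: app.py`). As a usage sketch, such a Space can be queried programmatically with `gradio_client` (pinned below as `gradio_client==1.3.0`); the Space id, the endpoint name, and the abbreviated system prompt are assumptions rather than values taken from this commit, so confirm them with `client.view_api()`.

```python
# Sketch: calling the deployed Space from Python with gradio_client.
# Assumptions (not in this commit): the Space id placeholder and the
# "/run_example" endpoint name; verify both with client.view_api().
from gradio_client import Client, handle_file

client = Client("<user>/<space-name>")  # hypothetical Space id
result = client.predict(
    handle_file("local_image.jpg"),      # maps to the gr.Image input
    "detect goats",                      # User Prompt
    "Return bounding boxes as [xmin, ymin, xmax, ymax] scaled to 1000x1000.",  # abbreviated stand-in for app.py's default_system_prompt
    "Qwen/Qwen2-VL-7B-Instruct",         # Model dropdown value
    api_name="/run_example",             # assumed; Gradio derives it from the function name
)
print(result)  # expected: (model text, parsed boxes, annotated image path)
```

The argument order follows the `submit_btn.click` inputs in app.py: image, user prompt, system prompt, model id.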
app.py CHANGED
@@ -1,7 +1,6 @@
import gradio as gr
import spaces
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, \
- BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import base64
@@ -9,29 +8,15 @@ from PIL import Image, ImageDraw
from io import BytesIO
import re

- models = {
- "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-1B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-1B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-5B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-5B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
- "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
- "Salesforce/blip-image-captioning-base"),
-
+
+ models = {
+ "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
+ "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
}

processors = {
"Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
- "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
- "Qwen/Qwen2-VL-1B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-1B-Instruct"),
- "Qwen/Qwen2-VL-5B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-5B-Instruct"),
- "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
- "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
-
+ "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
}


@@ -97,21 +82,18 @@ def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Ins

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
- output_text = processor.batch_decode(generated_ids_trimmed,
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False)
-
+ output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
print(output_text)
pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
matches = re.findall(pattern, str(output_text))
parsed_boxes = [[int(num) for num in match] for match in matches]
scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-
return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)

-
css = """
#output {
height: 500px;
@@ -119,34 +101,20 @@ css = """
border: 1px solid #ccc;
}
"""
- default_system_prompt = ("You are a helpfull assistant to detect objects in images. "
- "When asked to detect elements based on a description you return bounding boxes for all "
- "elements in the form of [xmin, ymin, xmax, ymax] whith the "
- "values beeing scaled to 1000 by 1000 pixels. When there are more than one result, "
- "answer with a list of bounding boxes in the form of"
- " [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")
+ default_system_prompt = "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."

with gr.Blocks(css=css) as demo:
gr.Markdown(
- """
- # Multi-Model Object Detection Demo
- This demo uses various state-of-the-art models for object detection and image-text alignment tasks.
-
- **Available Models**:
- **Qwen2-VL (7B, 2B, 5B, 1B)**: Vision-language models optimized for various tasks.
- **BLIP**: Image captioning and visual question answering.
- **CLIP**: Contrastive learning for image-text matching.
- **Flamingo**: Few-shot learning for various visual tasks.
- **LLaVA**: Balanced performance in visual understanding and interactive AI tasks.
-
- **Usage**: Input an image and a description of the target object you want to detect.
- """
- )
- with gr.Tab(label="Input"):
+ """
+ # Qwen2-VL Object Detection Demo
+ Use the Qwen2-VL models to detect objects in an image. The 7B variant seems to work much better.
+ **Usage**: Use the keyword "detect" and a description of the target (see examples below).
+ """)
+ with gr.Tab(label="Qwen2-VL Input"):
with gr.Row():
with gr.Column():
input_img = gr.Image(label="Input Image", type="pil")
- model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-2B-Instruct")
+ model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt)
text_input = gr.Textbox(label="User Prompt")
submit_btn = gr.Button(value="Submit")
@@ -157,15 +125,9 @@ with gr.Blocks(css=css) as demo:

gr.Examples(
examples=[
- ["images/2024_09_10_10_56_40.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_10_58_23.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_10_58_40.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_11_07_31.png", "Describe the questions and write python code", default_system_prompt],
- ["images/IMG_3644", "Describe the image", default_system_prompt],
- ["images/IMG_3658", "Describe the image", default_system_prompt],
- ["images/IMG_4028", "Describe the image", default_system_prompt],
- ["images/IMG_4070", "Describe the image", default_system_prompt],
- ["images/comics.jpeg", "Describe the image", default_system_prompt],
+ ["assets/image1.jpg", "detect goats", default_system_prompt],
+ ["assets/image2.jpg", "detect blue button", default_system_prompt],
+ ["assets/image3.jpg", "detect person on bike", default_system_prompt],
],
inputs=[input_img, text_input, system_prompt],
outputs=[model_output_text, parsed_boxes, annotated_image],
@@ -174,7 +136,6 @@ with gr.Blocks(css=css) as demo:
label="Try examples"
)

- submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector],
- [model_output_text, parsed_boxes, annotated_image])
+ submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector], [model_output_text, parsed_boxes, annotated_image])

- demo.launch(debug=True)
+ demo.launch(debug=True)
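Note: `run_example` calls `rescale_bounding_boxes` and `draw_bounding_boxes`, which are defined elsewhere in app.py and untouched by this commit. For orientation, here is a minimal sketch of what such helpers could look like given the prompt's convention of box coordinates on a 1000x1000 grid; the bodies below are illustrative assumptions, not the repository's actual code.

```python
# Illustrative sketch only -- app.py's real implementations are not shown in this diff.
# Assumes the model returns [xmin, ymin, xmax, ymax] boxes on a 1000x1000 grid,
# as requested by default_system_prompt.
from PIL import ImageDraw


def rescale_bounding_boxes(boxes, image_width, image_height, scale=1000):
    # Map 1000x1000-grid coordinates back to pixel coordinates of the original image.
    return [
        [
            xmin * image_width / scale,
            ymin * image_height / scale,
            xmax * image_width / scale,
            ymax * image_height / scale,
        ]
        for xmin, ymin, xmax, ymax in boxes
    ]


def draw_bounding_boxes(image, boxes, color="red", width=3):
    # Draw each box on a copy of the image and return it for display in gr.Image.
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    for box in boxes:
        draw.rectangle(box, outline=color, width=width)
    return annotated
```

The regex in `run_example` extracts every `[x1, y1, x2, y2]` quadruple from the decoded text, so a reply like `[[120, 45, 430, 610]]` parses to one box that helpers of this kind would then rescale and draw.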
ex.py DELETED
@@ -1,87 +0,0 @@
- from PIL import Image
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
- from qwen_vl_utils import process_vision_info
-
- # %%
-
- model = Qwen2VLForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2-VL-2B-Instruct",
- torch_dtype="auto",
- device_map="auto")
-
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-
-
- # %%
- def rescale_image_dimensions(original_width, original_height, max_size=1000):
- # Rescale if the original dimensions exceed 1000 pixels
- if original_width > max_size or original_height > max_size:
- aspect_ratio = original_width / original_height
-
- if aspect_ratio > 1: # Width is greater than height
- scaled_width = max_size
- scaled_height = int(max_size / aspect_ratio)
- else: # Height is greater than or equal to width
- scaled_height = max_size
- scaled_width = int(max_size * aspect_ratio)
- else:
- # The original dimensions are already within bounds
- scaled_width = original_width
- scaled_height = original_height
-
- return scaled_width, scaled_height
-
-
- # %%
- messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image",
- "image": "/home/nuh-hatipoglu/Desktop/NewMind/demo-vit/images/IMG_3644.JPG",
- },
- {"type": "text", "text": "Describe image"},
- ],
- }
- ]
- # %%%
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
- # %%
- image_inputs, video_inputs = process_vision_info(messages)
-
- # %%
-
- original_width, original_height = image_inputs[0].size
- new_width, new_height = rescale_image_dimensions(original_width, original_height)
- rescaled_image = image_inputs[0].resize((new_width, new_height), Image.Resampling.LANCZOS)
- image_inputs = rescaled_image
-
- #%%
-
- image_inputs[0].show()
- # %%
- inputs = processor(text=[text],
- images=image_inputs,
- videos=video_inputs,
- padding=True,
- return_tensors="pt", )
- inputs = inputs.to("cuda")
- # %%
- # Open the image
- image_path = "your_image_path.jpg" # Path to the image file
- image = Image.open(image_path)
-
- # Get the original dimensions
- original_width, original_height = image.size
-
- # Compute the new dimensions
- new_width, new_height = rescale_image_dimensions(original_width, original_height)
-
- # Resize the image
- rescaled_image = image.resize((new_width, new_height), Image.ANTIALIAS)
-
- # Save the resized image
- rescaled_image.save("rescaled_" + image_path)
-
- print(f"Image resized successfully: {new_width}x{new_height}")
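One observation on the deleted script: Pillow removed the `Image.ANTIALIAS` constant in version 10.0, so under the `pillow==10.4.0` pin below the final resize call would raise an `AttributeError`; the `Image.Resampling.LANCZOS` filter already used earlier in the same file is the current spelling. A minimal sketch of the updated call (the path is the script's own placeholder, and the target size stands in for the output of `rescale_image_dimensions`):

```python
# Sketch: Pillow >= 10 spelling of the resize call from the deleted ex.py.
from PIL import Image

image = Image.open("your_image_path.jpg")                       # placeholder path from ex.py
rescaled = image.resize((800, 600), Image.Resampling.LANCZOS)   # replaces the removed Image.ANTIALIAS
rescaled.save("rescaled_your_image_path.jpg")
```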
images/2024_09_10_10_56_40.png DELETED
Binary file (19.9 kB)
 
images/2024_09_10_10_58_23.png DELETED
Binary file (9.04 kB)
 
images/2024_09_10_10_58_40.png DELETED
Binary file (19.6 kB)
 
images/2024_09_10_11_07_31.png DELETED
Binary file (45.9 kB)
 
images/comics.jpeg DELETED
Binary file (69.3 kB)
 
requirements.txt CHANGED
@@ -1,106 +0,0 @@
- accelerate==0.30.0
- aiofiles==23.2.1
- altair==5.4.1
- annotated-types==0.7.0
- anyio==4.4.0
- attrs==24.2.0
- blinker==1.8.2
- cachetools==5.5.0
- certifi==2024.8.30
- charset-normalizer==3.3.2
- click==8.1.7
- contourpy==1.3.0
- cycler==0.12.1
- fastapi==0.114.1
- ffmpy==0.4.0
- filelock==3.16.0
- fonttools==4.53.1
- fsspec==2024.9.0
- gitdb==4.0.11
- GitPython==3.1.43
- gradio==4.44.0
- gradio_client==1.3.0
- h11==0.14.0
- httpcore==1.0.5
- httpx==0.27.2
- huggingface-hub==0.24.6
- idna==3.8
- importlib_resources==6.4.5
- Jinja2==3.1.4
- jsonschema==4.23.0
- jsonschema-specifications==2023.12.1
- kiwisolver==1.4.7
- markdown-it-py==3.0.0
- MarkupSafe==2.1.5
- matplotlib==3.9.2
- mdurl==0.1.2
- mpmath==1.3.0
- narwhals==1.6.4
- networkx==3.3
- numpy==2.1.1
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==9.1.0.70
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.20.5
- nvidia-nvjitlink-cu12==12.6.68
- nvidia-nvtx-cu12==12.1.105
- orjson==3.10.7
- packaging==24.1
- pandas==2.2.2
- pillow==10.4.0
- protobuf==5.28.0
- psutil==5.9.8
- pyarrow==17.0.0
- pydantic==2.9.1
- pydantic_core==2.23.3
- pydeck==0.9.1
- pydub==0.25.1
- Pygments==2.18.0
- pyparsing==3.1.4
- python-dateutil==2.9.0.post0
- python-multipart==0.0.9
- pytz==2024.1
- PyYAML==6.0.2
- qwen-vl-utils==0.0.4
- referencing==0.35.1
- regex==2024.7.24
- requests==2.32.3
- rich==13.8.1
- rpds-py==0.20.0
- ruff==0.6.4
- safetensors==0.4.5
- semantic-version==2.10.0
- setuptools==74.1.2
- shellingham==1.5.4
- six==1.16.0
- smmap==5.0.1
- sniffio==1.3.1
- spaces==0.30.2
- starlette==0.38.5
- streamlit==1.38.0
- sympy==1.13.2
- tenacity==8.5.0
- tokenizers==0.19.1
- toml==0.10.2
- tomlkit==0.12.0
- torch==2.4.1
- torchvision==0.19.1
- tornado==6.4.1
- tqdm==4.66.5
- transformers @ git+https://github.com/huggingface/transformers.git@f38590dade57c1f8cf8a67e9409dae8935f8c478
- triton==3.0.0
- typer==0.12.5
- typing_extensions==4.12.2
- tzdata==2024.1
- urllib3==2.2.2
- uvicorn==0.30.6
- watchdog==4.0.2
- websockets==12.0
- yarl==1.7.0
- transformers~=4.45.0.dev0