nhatipoglu committed on
Commit b251db3
1 Parent(s): 8677efd

add app files2
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+
+ .idea
.idea/misc.xml CHANGED
@@ -1,5 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
+ <component name="Black">
+ <option name="sdkName" value="Python 3.12 (linkedIn_auto_jobs_applier_with_AI)" />
+ </component>
<component name="GithubDefaultAccount">
<option name="defaultAccountId" value="16dd0ba3-f1ec-4fdf-9c62-48bd69c3904d" />
</component>
README.md CHANGED
@@ -1,45 +1,9 @@
- ### README.md for Multi-Model Object Detection Demo
-
- ---
-
- # Multi-Model Object Detection Demo
-
- This repository provides a demo application that uses multiple state-of-the-art vision-language models for various tasks such as object detection, image captioning, visual question answering, and image-text matching. The demo is built using Gradio for the user interface and leverages Hugging Face's `transformers` library to load and run various pre-trained models.
-
- ## Available Models
-
- The following models are available in the demo:
-
- **Qwen2-VL (7B, 2B, 5B, 1B):** Vision-language models optimized for object detection, question-answering, and image description tasks.
- **BLIP:** Specialized in image captioning and visual question-answering.
- **CLIP:** Uses contrastive learning for image-text matching.
-
- ## Usage
-
- To use the demo:
-
- 1. **Input an Image:** Upload an image that you want to analyze.
- 2. **Select a Model:** Choose a model from the dropdown list to perform the desired task.
- 3. **Provide a System Prompt:** Optionally, enter a system prompt to guide the model's behavior.
- 4. **Enter a User Prompt:** Describe the object or task you want the model to perform.
- 5. **Submit:** Click the "Submit" button to run the model and display the results.
-
- ## Getting Started
-
-
- ### Example Inputs
-
- The demo provides some pre-configured examples to try:
-
- **Image 1:** Detect goats in an image.
- **Image 2:** Find a blue button in the image.
- **Image 3:** Describe a person on a bike.
- **Image 4:** Solve questions from a screenshot.
- **Image 5:** Describe various images such as landscapes, animals, or objects.
-
- ## Available Functions
-
- `run_example`: Core function to process the input image and prompts, run the selected model, and return the results.
- `image_to_base64`: Converts an image to a base64 encoded string for model processing.
- `draw_bounding_boxes`: Draws bounding boxes around detected objects in the image.
- `rescale_bounding_boxes`: Rescales bounding boxes to the original image dimensions.
+ title: Qwen2 VL Localization
+ emoji: 📉
+ colorFrom: pink
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.42.0
+ app_file: app.py
+ pinned: false
+ license: mit
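Note: the new header above configures the repository as a Gradio Space (`sdk: gradio`, `app_file: app.py`). As a usage sketch, such a Space can be queried programmatically with `gradio_client` (pinned below as `gradio_client==1.3.0`); the Space id, the endpoint name, and the abbreviated system prompt are assumptions rather than values taken from this commit, so confirm them with `client.view_api()`.

```python
# Sketch: calling the deployed Space from Python with gradio_client.
# Assumptions (not in this commit): the Space id placeholder and the
# "/run_example" endpoint name; verify both with client.view_api().
from gradio_client import Client, handle_file

client = Client("<user>/<space-name>")  # hypothetical Space id
result = client.predict(
    handle_file("local_image.jpg"),      # maps to the gr.Image input
    "detect goats",                      # User Prompt
    "Return bounding boxes as [xmin, ymin, xmax, ymax] scaled to 1000x1000.",  # abbreviated stand-in for app.py's default_system_prompt
    "Qwen/Qwen2-VL-7B-Instruct",         # Model dropdown value
    api_name="/run_example",             # assumed; Gradio derives it from the function name
)
print(result)  # expected: (model text, parsed boxes, annotated image path)
```

The argument order follows the `submit_btn.click` inputs in app.py: image, user prompt, system prompt, model id.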
app.py CHANGED
@@ -1,7 +1,6 @@
import gradio as gr
import spaces
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, \
- BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import base64
@@ -9,29 +8,15 @@ from PIL import Image, ImageDraw
from io import BytesIO
import re

- models = {
- "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-1B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-1B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "Qwen/Qwen2-VL-5B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-5B-Instruct",
- torch_dtype="auto", device_map="auto"),
- "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
- "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
- "Salesforce/blip-image-captioning-base"),
-
+
+ models = {
+ "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"),
+ "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
}

processors = {
"Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
- "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
- "Qwen/Qwen2-VL-1B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-1B-Instruct"),
- "Qwen/Qwen2-VL-5B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-5B-Instruct"),
- "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
- "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
-
+ "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
}


@@ -97,21 +82,18 @@ def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Ins

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
- output_text = processor.batch_decode(generated_ids_trimmed,
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False)
-
+ output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
print(output_text)
pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
matches = re.findall(pattern, str(output_text))
parsed_boxes = [[int(num) for num in match] for match in matches]
scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-
return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)

-
css = """
#output {
height: 500px;
@@ -119,34 +101,20 @@ css = """
border: 1px solid #ccc;
}
"""
- default_system_prompt = ("You are a helpfull assistant to detect objects in images. "
- "When asked to detect elements based on a description you return bounding boxes for all "
- "elements in the form of [xmin, ymin, xmax, ymax] whith the "
- "values beeing scaled to 1000 by 1000 pixels. When there are more than one result, "
- "answer with a list of bounding boxes in the form of"
- " [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")
+ default_system_prompt = "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."

with gr.Blocks(css=css) as demo:
gr.Markdown(
- """
- # Multi-Model Object Detection Demo
- This demo uses various state-of-the-art models for object detection and image-text alignment tasks.
-
- **Available Models**:
- **Qwen2-VL (7B, 2B, 5B, 1B)**: Vision-language models optimized for various tasks.
- **BLIP**: Image captioning and visual question answering.
- **CLIP**: Contrastive learning for image-text matching.
- **Flamingo**: Few-shot learning for various visual tasks.
- **LLaVA**: Balanced performance in visual understanding and interactive AI tasks.
-
- **Usage**: Input an image and a description of the target object you want to detect.
- """
- )
- with gr.Tab(label="Input"):
+ """
+ # Qwen2-VL Object Detection Demo
+ Use the Qwen2-VL models to detect objects in an image. The 7B variant seems to work much better.
+ **Usage**: Use the keyword "detect" and a description of the target (see examples below).
+ """)
+ with gr.Tab(label="Qwen2-VL Input"):
with gr.Row():
with gr.Column():
input_img = gr.Image(label="Input Image", type="pil")
- model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-2B-Instruct")
+ model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt)
text_input = gr.Textbox(label="User Prompt")
submit_btn = gr.Button(value="Submit")
@@ -157,15 +125,9 @@ with gr.Blocks(css=css) as demo:

gr.Examples(
examples=[
- ["images/2024_09_10_10_56_40.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_10_58_23.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_10_58_40.png", "solve the questions in Turkish", default_system_prompt],
- ["images/2024_09_10_11_07_31.png", "Describe the questions and write python code", default_system_prompt],
- ["images/IMG_3644", "Describe the image", default_system_prompt],
- ["images/IMG_3658", "Describe the image", default_system_prompt],
- ["images/IMG_4028", "Describe the image", default_system_prompt],
- ["images/IMG_4070", "Describe the image", default_system_prompt],
- ["images/comics.jpeg", "Describe the image", default_system_prompt],
+ ["assets/image1.jpg", "detect goats", default_system_prompt],
+ ["assets/image2.jpg", "detect blue button", default_system_prompt],
+ ["assets/image3.jpg", "detect person on bike", default_system_prompt],
],
inputs=[input_img, text_input, system_prompt],
outputs=[model_output_text, parsed_boxes, annotated_image],
@@ -174,7 +136,6 @@ with gr.Blocks(css=css) as demo:
label="Try examples"
)

- submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector],
- [model_output_text, parsed_boxes, annotated_image])
+ submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector], [model_output_text, parsed_boxes, annotated_image])

- demo.launch(debug=True)
+ demo.launch(debug=True)
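Note: `run_example` calls `rescale_bounding_boxes` and `draw_bounding_boxes`, which are defined elsewhere in app.py and untouched by this commit. For orientation, here is a minimal sketch of what such helpers could look like given the prompt's convention of box coordinates on a 1000x1000 grid; the bodies below are illustrative assumptions, not the repository's actual code.

```python
# Illustrative sketch only -- app.py's real implementations are not shown in this diff.
# Assumes the model returns [xmin, ymin, xmax, ymax] boxes on a 1000x1000 grid,
# as requested by default_system_prompt.
from PIL import ImageDraw


def rescale_bounding_boxes(boxes, image_width, image_height, scale=1000):
    # Map 1000x1000-grid coordinates back to pixel coordinates of the original image.
    return [
        [
            xmin * image_width / scale,
            ymin * image_height / scale,
            xmax * image_width / scale,
            ymax * image_height / scale,
        ]
        for xmin, ymin, xmax, ymax in boxes
    ]


def draw_bounding_boxes(image, boxes, color="red", width=3):
    # Draw each box on a copy of the image and return it for display in gr.Image.
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    for box in boxes:
        draw.rectangle(box, outline=color, width=width)
    return annotated
```

The regex in `run_example` extracts every `[x1, y1, x2, y2]` quadruple from the decoded text, so a reply like `[[120, 45, 430, 610]]` parses to one box that helpers of this kind would then rescale and draw.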
ex.py DELETED
@@ -1,87 +0,0 @@
- from PIL import Image
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
- from qwen_vl_utils import process_vision_info
-
- # %%
-
- model = Qwen2VLForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2-VL-2B-Instruct",
- torch_dtype="auto",
- device_map="auto")
-
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-
-
- # %%
- def rescale_image_dimensions(original_width, original_height, max_size=1000):
- # Rescale if the original dimensions exceed 1000 pixels
- if original_width > max_size or original_height > max_size:
- aspect_ratio = original_width / original_height
-
- if aspect_ratio > 1: # Width is greater than height
- scaled_width = max_size
- scaled_height = int(max_size / aspect_ratio)
- else: # Height is greater than or equal to width
- scaled_height = max_size
- scaled_width = int(max_size * aspect_ratio)
- else:
- # The original dimensions are already within bounds
- scaled_width = original_width
- scaled_height = original_height
-
- return scaled_width, scaled_height
-
-
- # %%
- messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image",
- "image": "/home/nuh-hatipoglu/Desktop/NewMind/demo-vit/images/IMG_3644.JPG",
- },
- {"type": "text", "text": "Describe image"},
- ],
- }
- ]
- # %%%
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
- # %%
- image_inputs, video_inputs = process_vision_info(messages)
-
- # %%
-
- original_width, original_height = image_inputs[0].size
- new_width, new_height = rescale_image_dimensions(original_width, original_height)
- rescaled_image = image_inputs[0].resize((new_width, new_height), Image.Resampling.LANCZOS)
- image_inputs = rescaled_image
-
- #%%
-
- image_inputs[0].show()
- # %%
- inputs = processor(text=[text],
- images=image_inputs,
- videos=video_inputs,
- padding=True,
- return_tensors="pt", )
- inputs = inputs.to("cuda")
- # %%
- # Open the image
- image_path = "your_image_path.jpg" # Path to the image file
- image = Image.open(image_path)
-
- # Get the original dimensions
- original_width, original_height = image.size
-
- # Compute the new dimensions
- new_width, new_height = rescale_image_dimensions(original_width, original_height)
-
- # Resize the image
- rescaled_image = image.resize((new_width, new_height), Image.ANTIALIAS)
-
- # Save the resized image
- rescaled_image.save("rescaled_" + image_path)
-
- print(f"Image resized successfully: {new_width}x{new_height}")
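One observation on the deleted script: Pillow removed the `Image.ANTIALIAS` constant in version 10.0, so under the `pillow==10.4.0` pin below the final resize call would raise an `AttributeError`; the `Image.Resampling.LANCZOS` filter already used earlier in the same file is the current spelling. A minimal sketch of the updated call (the path is the script's own placeholder, and the target size stands in for the output of `rescale_image_dimensions`):

```python
# Sketch: Pillow >= 10 spelling of the resize call from the deleted ex.py.
from PIL import Image

image = Image.open("your_image_path.jpg")                       # placeholder path from ex.py
rescaled = image.resize((800, 600), Image.Resampling.LANCZOS)   # replaces the removed Image.ANTIALIAS
rescaled.save("rescaled_your_image_path.jpg")
```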
images/2024_09_10_10_56_40.png DELETED
Binary file (19.9 kB)
 
images/2024_09_10_10_58_23.png DELETED
Binary file (9.04 kB)
 
images/2024_09_10_10_58_40.png DELETED
Binary file (19.6 kB)
 
images/2024_09_10_11_07_31.png DELETED
Binary file (45.9 kB)
 
images/comics.jpeg DELETED
Binary file (69.3 kB)
 
requirements.txt CHANGED
@@ -1,106 +0,0 @@
- accelerate==0.30.0
- aiofiles==23.2.1
- altair==5.4.1
- annotated-types==0.7.0
- anyio==4.4.0
- attrs==24.2.0
- blinker==1.8.2
- cachetools==5.5.0
- certifi==2024.8.30
- charset-normalizer==3.3.2
- click==8.1.7
- contourpy==1.3.0
- cycler==0.12.1
- fastapi==0.114.1
- ffmpy==0.4.0
- filelock==3.16.0
- fonttools==4.53.1
- fsspec==2024.9.0
- gitdb==4.0.11
- GitPython==3.1.43
- gradio==4.44.0
- gradio_client==1.3.0
- h11==0.14.0
- httpcore==1.0.5
- httpx==0.27.2
- huggingface-hub==0.24.6
- idna==3.8
- importlib_resources==6.4.5
- Jinja2==3.1.4
- jsonschema==4.23.0
- jsonschema-specifications==2023.12.1
- kiwisolver==1.4.7
- markdown-it-py==3.0.0
- MarkupSafe==2.1.5
- matplotlib==3.9.2
- mdurl==0.1.2
- mpmath==1.3.0
- narwhals==1.6.4
- networkx==3.3
- numpy==2.1.1
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==9.1.0.70
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.20.5
- nvidia-nvjitlink-cu12==12.6.68
- nvidia-nvtx-cu12==12.1.105
- orjson==3.10.7
- packaging==24.1
- pandas==2.2.2
- pillow==10.4.0
- protobuf==5.28.0
- psutil==5.9.8
- pyarrow==17.0.0
- pydantic==2.9.1
- pydantic_core==2.23.3
- pydeck==0.9.1
- pydub==0.25.1
- Pygments==2.18.0
- pyparsing==3.1.4
- python-dateutil==2.9.0.post0
- python-multipart==0.0.9
- pytz==2024.1
- PyYAML==6.0.2
- qwen-vl-utils==0.0.4
- referencing==0.35.1
- regex==2024.7.24
- requests==2.32.3
- rich==13.8.1
- rpds-py==0.20.0
- ruff==0.6.4
- safetensors==0.4.5
- semantic-version==2.10.0
- setuptools==74.1.2
- shellingham==1.5.4
- six==1.16.0
- smmap==5.0.1
- sniffio==1.3.1
- spaces==0.30.2
- starlette==0.38.5
- streamlit==1.38.0
- sympy==1.13.2
- tenacity==8.5.0
- tokenizers==0.19.1
- toml==0.10.2
- tomlkit==0.12.0
- torch==2.4.1
- torchvision==0.19.1
- tornado==6.4.1
- tqdm==4.66.5
- transformers @ git+https://github.com/huggingface/transformers.git@f38590dade57c1f8cf8a67e9409dae8935f8c478
- triton==3.0.0
- typer==0.12.5
- typing_extensions==4.12.2
- tzdata==2024.1
- urllib3==2.2.2
- uvicorn==0.30.6
- watchdog==4.0.2
- websockets==12.0
- yarl==1.7.0
- transformers~=4.45.0.dev0