Update app.py

app.py CHANGED
@@ -3,7 +3,6 @@ from transformers import AutoTokenizer, VisionEncoderDecoderModel, AutoImageProc
 from PIL import Image
 from torchvision.transforms.functional import crop
 import gradio as gr
-import json
 import base64
 import io
 from huggingface_hub import hf_hub_download
@@ -101,20 +100,15 @@ def process_image(image):
         caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)
         captions.append(caption)
 
-        # Prepare the result for visualization
-        detection_results = []
+        # Prepare the result for visualization as a formatted string
+        detection_results = ""
         for i, (label, box, score, caption) in enumerate(zip(labels, boxes, scores, captions)):
-            detection_results.append({
-                "label": label,
-                "caption": caption,
-                "bounding_box": [float(coord) for coord in box],  # Convert to float
-                "confidence_score": float(score)  # Convert to float
-            })
+            detection_results += f"Object {i + 1}: {label} - Caption: {caption}\n"
 
         # Render image with bounding boxes
         result_image = results.render()[0]
 
-        # Return the image with detections and the caption
+        # Return the image with detections, formatted captions, and the whole image caption
         return result_image, detection_results, original_caption
 
     except Exception as e:
@@ -129,7 +123,7 @@ interface = gr.Interface(
     inputs=gr.Image(type="pil"),  # Input: Image upload
     outputs=[
         gr.Image(type="pil", label="Detected Objects"),  # Output 1: Image with bounding boxes
-        gr.JSON(label="Detected Objects Info"),  # Output 2: Detection results
+        gr.Textbox(label="Object Captions & Bounding Boxes", lines=10),  # Output 2: Formatted captions
         gr.Textbox(label="Whole Image Caption")  # Output 3: Caption for the whole image
     ],
     live=True
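For reference, a minimal self-contained sketch of the interface wiring this commit arrives at. The process_image body below is a hypothetical stand-in for the Space's real detection-and-captioning pipeline; only the inputs and outputs mirror the diff above.

import gradio as gr
from PIL import Image

def process_image(image: Image.Image):
    # Hypothetical stand-in: the real Space runs object detection and
    # per-crop captioning here, building detection_results as the
    # formatted string introduced by this commit.
    detection_results = "Object 1: dog - Caption: a dog lying on grass\n"
    original_caption = "a dog lying on grass in a backyard"
    return image, detection_results, original_caption

interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),  # Input: Image upload
    outputs=[
        gr.Image(type="pil", label="Detected Objects"),  # Output 1: Image with bounding boxes
        gr.Textbox(label="Object Captions & Bounding Boxes", lines=10),  # Output 2: Formatted captions
        gr.Textbox(label="Whole Image Caption")  # Output 3: Caption for the whole image
    ],
    live=True
)

if __name__ == "__main__":
    interface.launch()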