motheecreator committed
Commit d00962d · verified · 1 Parent(s): 8064ded

Update app.py

Files changed (1)
  1. app.py +5 -11
app.py CHANGED
@@ -3,7 +3,6 @@ from transformers import AutoTokenizer, VisionEncoderDecoderModel, AutoImageProc
 from PIL import Image
 from torchvision.transforms.functional import crop
 import gradio as gr
-import json
 import base64
 import io
 from huggingface_hub import hf_hub_download
@@ -101,20 +100,15 @@ def process_image(image):
             caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)
             captions.append(caption)
 
-        # Prepare the result for visualization
-        detection_results = []
+        # Prepare the result for visualization as a formatted string
+        detection_results = ""
         for i, (label, box, score, caption) in enumerate(zip(labels, boxes, scores, captions)):
-            detection_results.append({
-                "label": label,
-                "caption": caption,
-                "bounding_box": [float(coord) for coord in box],  # Convert to float
-                "confidence_score": float(score)  # Convert to float
-            })
+            detection_results += f"Object {i + 1}: {label} - Caption: {caption}\n"
 
         # Render image with bounding boxes
         result_image = results.render()[0]
 
-        # Return the image with detections and the caption
+        # Return the image with detections, formatted captions, and the whole image caption
         return result_image, detection_results, original_caption
 
     except Exception as e:
@@ -129,7 +123,7 @@ interface = gr.Interface(
     inputs=gr.Image(type="pil"),  # Input: Image upload
     outputs=[
         gr.Image(type="pil", label="Detected Objects"),  # Output 1: Image with bounding boxes
-        gr.JSON(label="Object Captions & Bounding Boxes"),  # Output 2: JSON results for each object
+        gr.Textbox(label="Object Captions & Bounding Boxes", lines=10),  # Output 2: Formatted captions
         gr.Textbox(label="Whole Image Caption")  # Output 3: Caption for the whole image
     ],
     live=True
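
As a quick illustration of the change, here is a minimal, self-contained sketch of the new formatting loop. The detection values below are hypothetical placeholders, not output from the Space:

# Sketch of the new string-based result formatting (same loop as in the commit).
# The labels, boxes, scores, and captions are made-up placeholder data.
labels = ["dog", "ball"]
boxes = [[12.0, 30.5, 200.0, 180.2], [210.0, 90.0, 260.0, 140.0]]
scores = [0.92, 0.81]
captions = ["a brown dog sitting on grass", "a red ball on the ground"]

detection_results = ""
for i, (label, box, score, caption) in enumerate(zip(labels, boxes, scores, captions)):
    detection_results += f"Object {i + 1}: {label} - Caption: {caption}\n"

print(detection_results)
# Object 1: dog - Caption: a brown dog sitting on grass
# Object 2: ball - Caption: a red ball on the ground

Because process_image now returns this plain string rather than a list of dicts, the gr.JSON output component is swapped for a gr.Textbox; lines=10 simply gives the box room to show several detections at once.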