Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,723 Bytes
3ea53a0 f830549 3ea53a0 f830549 3ea53a0 cc582e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
import spaces
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
from huggingface_hub import hf_hub_download
import cv2
import tempfile
import numpy as np
def download_model(model_filename):
    """Fetch a weights file from the ``atalaydenknalbant/Yolov13`` Hub repo.

    Args:
        model_filename: Name of the file inside the repo (e.g. ``"yolov13n.pt"``).

    Returns:
        Whatever ``hf_hub_download`` returns for that file; callers in this
        file pass it straight to ``YOLO(...)`` as the model path.
    """
    weights_repo = "atalaydenknalbant/Yolov13"
    local_path = hf_hub_download(repo_id=weights_repo, filename=model_filename)
    return local_path
def _placeholder_image(message, width=640, height=480):
    """Return a white RGB canvas with *message* centered in black text."""
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=40)
    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
    text_x = (width - (right - left)) / 2
    text_y = (height - (bottom - top)) / 2
    draw.text((text_x, text_y), message, fill="black", font=font)
    return canvas


def _new_temp_mp4_path():
    """Create an empty persistent ``.mp4`` temp file and return its path."""
    # The handle is closed before returning so the video writer can open the path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        return handle.name


def _placeholder_video(message):
    """Write a one-frame, 1 fps video showing *message* and return its path."""
    canvas = _placeholder_image(message)
    output_path = _new_temp_mp4_path()
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    # PIL's ``size`` is (width, height), the order VideoWriter expects.
    writer = cv2.VideoWriter(output_path, fourcc, 1, canvas.size)
    try:
        writer.write(cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR))
    finally:
        writer.release()
    return output_path


def _annotate_video(model, video, predict_kwargs):
    """Run *model* on every frame of *video* and write the annotated frames.

    Frames are written as they are produced instead of being buffered, so
    memory use does not grow with the length of the video.

    Returns:
        Path of the annotated ``.mp4``, or ``None`` if no frame could be read.
    """
    cap = cv2.VideoCapture(video)
    writer = None
    output_path = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        fps = fps if fps > 0 else 25  # fall back when the container reports no rate
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            rgb_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            results = model.predict(source=rgb_frame, **predict_kwargs)
            for result in results:
                # plot() output is treated as BGR (the image path flips it to
                # RGB for PIL), so it can be handed to the writer unchanged.
                annotated_bgr = result.plot()
                if writer is None:
                    # Output size is taken from the first annotated frame.
                    frame_height, frame_width = annotated_bgr.shape[:2]
                    output_path = _new_temp_mp4_path()
                    writer = cv2.VideoWriter(
                        output_path, fourcc, fps, (frame_width, frame_height)
                    )
                writer.write(annotated_bgr)
    finally:
        # Release native handles even if prediction raises midway.
        cap.release()
        if writer is not None:
            writer.release()
    return output_path


@spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """Run YOLO detection on an image or a video and return the annotated media.

    Args:
        input_type: ``"Image"`` or ``"Video"``; any other value yields
            ``(None, None)``.
        image: Input image (used when ``input_type == "Image"``), or ``None``.
        video: Input video source passed to ``cv2.VideoCapture`` (used when
            ``input_type == "Video"``), or ``None``.
        model_id: Weights filename resolved through ``download_model``.
        conf_threshold: Forwarded to ``model.predict`` as ``conf``.
        iou_threshold: Forwarded to ``model.predict`` as ``iou``.
        max_detection: Forwarded to ``model.predict`` as ``max_det``.

    Returns:
        A ``(annotated_image, annotated_video_path)`` tuple in which exactly
        one element is populated for a successful run. When the selected
        input is missing, a placeholder image/video carrying a message is
        returned instead. ``(None, None)`` is returned for an unknown input
        type or when nothing could be annotated.
    """
    if input_type not in ("Image", "Video"):
        return None, None

    # Handle missing inputs before touching the model so that no weights are
    # downloaded or loaded just to render a placeholder.
    if input_type == "Image" and image is None:
        return _placeholder_image("No image provided"), None
    if input_type == "Video" and video is None:
        return None, _placeholder_video("No video provided")

    model = YOLO(download_model(model_id))
    predict_kwargs = {
        "conf": conf_threshold,
        "iou": iou_threshold,
        "imgsz": 640,
        "max_det": max_detection,
        "show_labels": True,
        "show_conf": True,
    }

    if input_type == "Image":
        results = model.predict(source=image, **predict_kwargs)
        if not results:
            return None, None
        # Reverse the channel axis so PIL receives RGB.
        annotated_bgr = results[-1].plot()
        return Image.fromarray(annotated_bgr[..., ::-1]), None

    return None, _annotate_video(model, video, predict_kwargs)
def update_visibility(input_type):
    """Toggle the image/video widgets to match the selected input type.

    Args:
        input_type: Current value of the input-type selector. ``"Image"``
            shows the image widgets; any other value shows the video widgets.

    Returns:
        Four ``gr.update`` objects, in the order
        (image input, video input, image output, video output).
    """
    show_image = input_type == "Image"
    # The original video branch passed lowercase ``true``, which raises
    # NameError; deriving both flags from one boolean avoids that.
    return (
        gr.update(visible=show_image),
        gr.update(visible=not show_image),
        gr.update(visible=show_image),
        gr.update(visible=not show_image),
    )
def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """Image-only adapter around ``yolo_inference`` for the examples gallery.

    Runs inference in ``"Image"`` mode with no video, discards the video slot
    of the result, and also returns an update that sets the input-type
    selector to ``"Image"``.

    Returns:
        ``(selector_update, annotated_image)``.
    """
    outputs = yolo_inference(
        "Image",
        image,
        None,
        model_id,
        conf_threshold,
        iou_threshold,
        max_detection,
    )
    annotated = outputs[0]
    selector_update = gr.update(value="Image")
    return selector_update, annotated
# UI definition: inputs and settings in the left column, annotated results in
# the right column, followed by event wiring and example inputs.
with gr.Blocks() as app:
    gr.Markdown("# Yolo13: Object Detection")
    gr.Markdown("Upload an image or video for inference using the latest YOLOv13 models.")
    gr.Markdown("π **Note:** Better-trained models will be deployed as they become available.")
    with gr.Accordion("Paper and Citation", open=False):
        gr.Markdown("""
This application is based on the research from the paper: **YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception**.
- **Authors:** Mengqi Lei, Siqi Li, Yihong Wu, et al.
- **Preprint Link:** [https://arxiv.org/abs/2506.17733](https://arxiv.org/abs/2506.17733)
**BibTeX:**
```
@article{yolov13,
title={YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception},
author={Lei, Mengqi and Li, Siqi and Wu, Yihong and et al.},
journal={arXiv preprint arXiv:2506.17733},
year={2025}
}
```
""")
    with gr.Row():
        with gr.Column():
            # Only one of image/video is visible at a time; see update_visibility.
            image = gr.Image(type="pil", label="Image", visible=True)
            video = gr.Video(label="Video", visible=False)
            input_type = gr.Radio(
                choices=["Image", "Video"],
                value="Image",
                label="Input Type",
            )
            model_id = gr.Dropdown(
                label="Model Name",
                choices=[
                    'yolov13n.pt', 'yolov13s.pt', 'yolov13l.pt', 'yolov13x.pt',
                ],
                value="yolov13n.pt",
            )
            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
            infer_button = gr.Button("Detect Objects")
        with gr.Column():
            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
            output_video = gr.Video(label="Annotated Video", visible=False)
            gr.DeepLinkButton()

    # Switch which input/output widgets are shown when the input type changes.
    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )
    infer_button.click(
        fn=yolo_inference,
        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image, output_video],
    )
    # Examples are image-only, so they go through the image adapter, which
    # also sets the input-type selector to "Image".
    gr.Examples(
        examples=[
            ["zidane.jpg", "yolov13s.pt", 0.35, 0.45, 300],
            ["bus.jpg", "yolov13l.pt", 0.35, 0.45, 300],
            ["yolo_vision.jpg", "yolov13x.pt", 0.35, 0.45, 300],
        ],
        fn=yolo_inference_for_examples,
        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[input_type, output_image],
        label="Examples (Images)",
    )

if __name__ == '__main__':
    app.launch()