import spaces
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
from huggingface_hub import hf_hub_download
import cv2
import tempfile
import numpy as np

def download_model(model_filename):
    """Fetch a weights file from the ``atalaydenknalbant/Yolov13`` Hub repository.

    Args:
        model_filename: Name of the file inside the repository
            (e.g. ``"yolov13n.pt"``).

    Returns:
        Whatever ``hf_hub_download`` returns for that file; callers in this
        module pass it straight to ``YOLO(...)`` as the weights location.
    """
    weights_repo = "atalaydenknalbant/Yolov13"
    local_weights = hf_hub_download(repo_id=weights_repo, filename=model_filename)
    return local_weights

def _placeholder_image(message, width=640, height=480):
    """Return a white RGB canvas with *message* centred on it in black text."""
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=40)
    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
    text_x = (width - (right - left)) / 2
    text_y = (height - (bottom - top)) / 2
    draw.text((text_x, text_y), message, fill="black", font=font)
    return canvas


def _new_temp_video_path():
    """Reserve a uniquely named ``.mp4`` file and return its path.

    The handle is closed before returning so no descriptor is left open while
    OpenCV writes to the same path.
    """
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        return handle.name


def _placeholder_video(message):
    """Write a one-frame, 1 fps ``.mp4`` showing *message*; return its path."""
    canvas = _placeholder_image(message)
    path = _new_temp_video_path()
    # PIL's ``size`` is (width, height), which is the order VideoWriter expects.
    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), 1, canvas.size)
    try:
        writer.write(cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR))
    finally:
        writer.release()
    return path


@spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """Run YOLO detection on an image or a video and return the annotated result.

    Args:
        input_type: ``"Image"`` or ``"Video"``; any other value yields
            ``(None, None)``.
        image: Input image (a PIL image when called from the UI) or ``None``.
        video: Path to the input video file, or ``None``.
        model_id: Weights filename, resolved through ``download_model``.
        conf_threshold: Forwarded to ``model.predict`` as ``conf``.
        iou_threshold: Forwarded to ``model.predict`` as ``iou``.
        max_detection: Forwarded to ``model.predict`` as ``max_det``.

    Returns:
        A ``(annotated_image, annotated_video_path)`` tuple in which at most
        one element is not ``None``. When the selected input is missing, a
        placeholder image/video carrying an explanatory message is returned.
        ``(None, None)`` is returned when the video yields no frames.
    """
    if input_type == "Image":
        if image is None:
            # No weights are needed for the placeholder, so skip the download.
            return _placeholder_image("No image provided"), None

        model = YOLO(download_model(model_id))
        results = model.predict(
            source=image,
            conf=conf_threshold,
            iou=iou_threshold,
            imgsz=640,
            max_det=max_detection,
            show_labels=True,
            show_conf=True,
        )
        if not results:
            # Guard against an empty result list instead of hitting an
            # unbound local variable.
            return None, None
        # ``plot()`` yields a BGR array; reverse the channel axis for PIL.
        return Image.fromarray(results[-1].plot()[..., ::-1]), None

    if input_type == "Video":
        if video is None:
            return None, _placeholder_video("No video provided")

        model = YOLO(download_model(model_id))
        capture = cv2.VideoCapture(video)
        writer = None
        output_path = None
        try:
            fps = capture.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                fps = 25  # fall back when the container reports no frame rate

            while True:
                ok, frame = capture.read()
                if not ok:
                    break
                rgb_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                results = model.predict(
                    source=rgb_frame,
                    conf=conf_threshold,
                    iou=iou_threshold,
                    imgsz=640,
                    max_det=max_detection,
                    show_labels=True,
                    show_conf=True,
                )
                # ``plot()`` output is already BGR, i.e. what VideoWriter
                # wants, so no colour round trip is needed. If no result
                # came back, pass the raw frame through rather than reusing
                # a stale annotation.
                annotated = results[-1].plot() if results else frame
                annotated = np.ascontiguousarray(annotated)

                if writer is None:
                    # Open the writer lazily so its size matches the first
                    # annotated frame and no file is created for empty input.
                    height_out, width_out = annotated.shape[:2]
                    output_path = _new_temp_video_path()
                    writer = cv2.VideoWriter(
                        output_path,
                        cv2.VideoWriter_fourcc(*"mp4v"),
                        fps,
                        (width_out, height_out),
                    )
                # Stream each frame to disk instead of buffering the whole
                # clip in memory.
                writer.write(annotated)
        finally:
            capture.release()
            if writer is not None:
                writer.release()

        # ``output_path`` is still None when the capture produced no frames.
        return None, output_path

    return None, None

def update_visibility(input_type):
    """Toggle the image/video widgets to match the selected input type.

    Returns four ``gr.update`` objects, in order: image input, video input,
    image output, video output. The image pair is visible only when
    ``input_type`` is ``"Image"``; otherwise the video pair is visible.
    """
    image_mode = input_type == "Image"
    video_mode = not image_mode
    return (
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
    )

def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """Image-only adapter around ``yolo_inference`` for the examples gallery.

    Calls ``yolo_inference`` with ``input_type="Image"`` and ``video=None``,
    then returns just the annotated image, dropping the video slot.
    """
    inference_kwargs = {
        "input_type": "Image",
        "image": image,
        "video": None,
        "model_id": model_id,
        "conf_threshold": conf_threshold,
        "iou_threshold": iou_threshold,
        "max_detection": max_detection,
    }
    detected_image, _unused_video = yolo_inference(**inference_kwargs)
    return detected_image

# Build the Gradio UI. Components are declared in the order they should appear,
# so the statement order inside this block is significant.
with gr.Blocks() as app:
    # Page header and background information.
    gr.Markdown("# Yolo13: Object Detection")
    gr.Markdown("Upload an image or video for inference using the latest YOLOv13 models.")
    gr.Markdown("📝 **Note:** Better-trained models will be deployed as they become available.")
    with gr.Accordion("Paper and Citation", open=False):
        gr.Markdown("""
        This application is based on the research from the paper: **YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception**.

        - **Authors:** Mengqi Lei, Siqi Li, Yihong Wu, et al.
        - **Preprint Link:** [https://arxiv.org/abs/2506.17733](https://arxiv.org/abs/2506.17733)

        **BibTeX:**
        ```
        @article{yolov13,
          title={YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception},
          author={Lei, Mengqi and Li, Siqi and Wu, Yihong and et al.},
          journal={arXiv preprint arXiv:2506.17733},
          year={2025}
        }
        ```
        """)

    with gr.Row():
        # Left column: inputs and inference settings.
        with gr.Column():
            # Only one of the two input widgets is visible at a time; the
            # image widget is the one shown initially.
            image = gr.Image(type="pil", label="Image", visible=True)
            video = gr.Video(label="Video", visible=False)
            input_type = gr.Radio(
                choices=["Image", "Video"],
                value="Image",
                label="Input Type",
            )
            # The selected value is passed to yolo_inference as ``model_id``.
            model_id = gr.Dropdown(
                label="Model Name",
                choices=[
                    'yolov13n.pt', 'yolov13s.pt', 'yolov13l.pt', 'yolov13x.pt',
                ],
                value="yolov13n.pt",
            )
            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
            infer_button = gr.Button("Detect Objects")
        # Right column: outputs, mirroring the visibility of the inputs.
        with gr.Column():
            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
            output_video = gr.Video(label="Annotated Video", visible=False)
            gr.DeepLinkButton()

    # Switch which input/output pair is shown when the radio selection changes.
    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )

    # Run detection on click; yolo_inference returns (image, video_path).
    infer_button.click(
        fn=yolo_inference,
        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image, output_video],
    )

    # Image-only examples. Each row is
    # [image file, model, confidence, IoU, max detections], matching ``inputs``.
    gr.Examples(
        examples=[
            ["zidane.jpg", "yolov13s.pt", 0.35, 0.45, 300],
            ["bus.jpg", "yolov13l.pt", 0.35, 0.45, 300],
            ["yolo_vision.jpg", "yolov13x.pt", 0.35, 0.45, 300],
        ],
        fn=yolo_inference_for_examples,
        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image],
        label="Examples (Images)",
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == '__main__':
    app.launch()