import tempfile

import gradio
import moviepy
import numpy
import spaces
import supervision
from PIL import Image
from ultralytics import YOLOE


@spaces.GPU
def inference(video):
    model = YOLOE("./model.pt").to("cuda")
    names = ["person", "vehicle"]
    # Restrict YOLOE's open-vocabulary head to the prompted classes.
    model.set_classes(names, model.get_text_pe(names))

    clip = moviepy.VideoFileClip(video)
    results = []
    for frame in clip.iter_frames(fps=1):  # sample one frame per second
        # moviepy yields RGB arrays; wrap the frame in a PIL image so
        # Ultralytics does not misread the channel order as BGR.
        image = Image.fromarray(numpy.uint8(frame))
        result = model.predict(image, imgsz=640, conf=0.25, iou=0.7)
        detections = supervision.Detections.from_ultralytics(result[0])

        # Scale the annotation geometry to the frame resolution.
        resolution_wh = image.size
        thickness = supervision.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
        text_scale = supervision.calculate_optimal_text_scale(resolution_wh=resolution_wh)

        labels = [
            f"{class_name} {confidence:.2f}"
            for class_name, confidence
            in zip(detections["class_name"], detections.confidence)
        ]

        annotated_image = image.copy()
        annotated_image = supervision.MaskAnnotator(
            color_lookup=supervision.ColorLookup.INDEX, opacity=0.4
        ).annotate(scene=annotated_image, detections=detections)
        annotated_image = supervision.BoxAnnotator(
            color_lookup=supervision.ColorLookup.INDEX, thickness=thickness
        ).annotate(scene=annotated_image, detections=detections)
        annotated_image = supervision.LabelAnnotator(
            color_lookup=supervision.ColorLookup.INDEX, text_scale=text_scale, smart_position=True
        ).annotate(scene=annotated_image, detections=detections, labels=labels)
        results.append(annotated_image)

    # Reassemble the annotated frames at the same 1 fps sampling rate so the
    # output keeps the source duration. write_videofile requires a real file
    # path rather than a BytesIO buffer, and gradio.Video likewise expects a
    # path, so render to a temporary .mp4 and return its location.
    frames = [numpy.array(img) for img in results]
    output_clip = moviepy.ImageSequenceClip(frames, fps=1)
    output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    output_clip.write_videofile(output_path, codec="libx264", audio=False)
    clip.close()
    return output_path


if __name__ == "__main__":
    gradio.Interface(
        fn=inference,
        inputs=gradio.Video(),
        outputs=gradio.Video(),
        title="Video Object Detection",
        description="Upload a video to run object detection using YOLOE.",
    ).launch()