# for Zero GPU Spaces compatibility
import spaces
@spaces.GPU
def dummy_gpu():
    """No-op function decorated with ``spaces.GPU``.

    It performs no work; per the comment at the top of this file it is
    present only for Zero GPU Spaces compatibility.
    """
    return None

import gradio as gr
import numpy as np
import cv2
import torch
import onnxruntime as ort
from optimum.onnxruntime import ORTModel
from ultralytics import YOLO
import os
from typing import Tuple, List
import subprocess

def install_cuda_toolkit():
    """Download and install the CUDA toolkit, then export CUDA environment variables.

    Steps, in order:
      1. download the CUDA 12.2.0 runfile installer into ``/tmp`` with ``wget``;
      2. mark it executable;
      3. run it with ``--silent --toolkit``;
      4. set ``CUDA_HOME``, ``PATH``, ``LD_LIBRARY_PATH`` and
         ``TORCH_CUDA_ARCH_LIST`` in ``os.environ`` for the current process.

    Raises:
        RuntimeError: If any of the external commands exits with a non-zero
            status. The environment variables are not modified in that case,
            so the process is never pointed at a toolkit that failed to
            install.
    """
    print("Installing CUDA Toolkit.")
    #CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)

    commands = (
        ["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE],
        ["chmod", "+x", CUDA_TOOLKIT_FILE],
        [CUDA_TOOLKIT_FILE, "--silent", "--toolkit"],
    )
    for command in commands:
        # Stop at the first failing step: running a partially downloaded
        # installer (or exporting paths to a missing toolkit) only hides
        # the real error.
        status = subprocess.call(command)
        if status != 0:
            raise RuntimeError(
                "CUDA toolkit installation step failed with exit code %d: %s"
                % (status, " ".join(command))
            )

    cuda_home = "/usr/local/cuda"
    os.environ["CUDA_HOME"] = cuda_home
    os.environ["PATH"] = "%s/bin:%s" % (cuda_home, os.environ["PATH"])

    # Only append the previous value when there is one; a trailing ':' would
    # add an empty entry to the library search path.
    # NOTE(review): CUDA runfile installs commonly place libraries under
    # `lib64` -- confirm that `lib` is the intended directory.
    previous_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
    if previous_ld_path:
        os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (cuda_home, previous_ld_path)
    else:
        os.environ["LD_LIBRARY_PATH"] = "%s/lib" % cuda_home

    # Workaround recorded by the original author for the error
    # "arch_list[-1] += '+PTX'; IndexError: list index out of range".
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

#install_cuda_toolkit()

# Configuration - UPDATE THESE VALUES
# These module-level constants are read by the conversion, loading,
# pre-processing, post-processing and drawing functions below.
MODEL_PT_PATH = "model.pt"    # Trained PyTorch weights handed to YOLO() for export
MODEL_ONNX_PATH = "model.onnx" # ONNX file that is checked for, produced if missing, and loaded
INPUT_SIZE = 640              # Export image size and side of the square letterbox canvas; must match training size
CLASS_NAMES = ["class0", "class1"]  # Label text indexed by predicted class id; replace with your actual class names
CONF_THRESHOLD = 0.5          # Minimum best-class score for a prediction to be kept (also passed to NMS)
IOU_THRESHOLD = 0.45          # IoU threshold passed to non-maximum suppression
# "cuda" when PyTorch reports an available CUDA device, otherwise "cpu";
# used to choose the ONNX Runtime execution provider.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Session options passed to the ONNX Runtime session when the model is loaded.
session_options = ort.SessionOptions()
# Severity level 0 is ONNX Runtime's most verbose logging level; useful while
# debugging provider setup, noisy otherwise.
session_options.log_severity_level = 0

def convert_pt_to_onnx():
    """Export the PyTorch YOLO weights to ONNX unless the ONNX file already exists.

    Loads ``MODEL_PT_PATH`` with ``YOLO``, exports it with a fixed
    ``INPUT_SIZE`` (opset 12, simplified, static shapes, FP32) and makes sure
    the result ends up at ``MODEL_ONNX_PATH``. Nothing is exported when
    ``MODEL_ONNX_PATH`` is already present.

    Raises:
        RuntimeError: If loading or exporting fails, or if no ONNX file can
            be found after the export. The original exception is chained as
            ``__cause__``.
    """
    print(f'Converting model on {"cuda" if torch.cuda.is_available() else "cpu"}')
    if os.path.exists(MODEL_ONNX_PATH):
        return

    print("Converting PyTorch model to ONNX...")
    try:
        # Load trained YOLO model
        model = YOLO(MODEL_PT_PATH)

        # Export to ONNX with correct parameters
        exported_path = model.export(
            format="onnx",
            imgsz=INPUT_SIZE,
            opset=12,
            simplify=True,
            dynamic=False,
            half=False,  # Disable for maximum compatibility
        )

        # Recent Ultralytics releases are expected to return the exported
        # file path. If this one did not, assume the export was written next
        # to the weights with an ``.onnx`` suffix -- TODO confirm against the
        # installed Ultralytics version.
        if not isinstance(exported_path, (str, os.PathLike)):
            exported_path = os.path.splitext(MODEL_PT_PATH)[0] + ".onnx"
        exported_path = os.fspath(exported_path)

        # Move the export to the configured name only when it is not already
        # there. The previous hard-coded "yolov8n.onnx" rename missed exports
        # of any other weights file and could overwrite a fresh export with
        # an unrelated stray file.
        if not os.path.exists(MODEL_ONNX_PATH):
            if not os.path.exists(exported_path):
                raise FileNotFoundError(
                    f"exported ONNX file not found at {exported_path!r}"
                )
            os.rename(exported_path, MODEL_ONNX_PATH)

        print("ONNX conversion successful!")
    except Exception as e:
        raise RuntimeError(f"ONNX conversion failed: {str(e)}") from e

def load_onnx_model() -> ort.InferenceSession:
    """Create the ONNX Runtime session for ``MODEL_ONNX_PATH``.

    The execution provider is ``CUDAExecutionProvider`` unless ``DEVICE`` is
    ``"cpu"``, in which case ``CPUExecutionProvider`` is used. The
    module-level ``session_options`` are passed through to the loader.

    Returns:
        The object returned by ``ORTModel.load_model``.

    Raises:
        RuntimeError: If the model cannot be loaded; the original exception
            is chained as ``__cause__``.
    """
    print(f'Loading model on {"cuda" if torch.cuda.is_available() else "cpu"}')
    # Decide the provider once instead of repeating the conditional inline
    # (the old, unused `providers` list has been removed).
    provider = "CPUExecutionProvider" if DEVICE == "cpu" else "CUDAExecutionProvider"
    try:
        return ORTModel.load_model(
            MODEL_ONNX_PATH,
            provider=provider,
            session_options=session_options,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load ONNX model: {str(e)}") from e

# Initialize model: export to ONNX if needed, then open the inference session.
convert_pt_to_onnx()
ort_session = load_onnx_model()
# Report the execution providers of the session through the public
# `get_providers()` accessor rather than the private `_providers` attribute.
print("Available Providers: ", ort_session.get_providers())
#assert "CUDAExecutionProvider" in ort_session.get_providers()

def letterbox_image(image: np.ndarray) -> Tuple[np.ndarray, float, Tuple[int, int]]:
    """
    Preprocess image using YOLO's letterboxing method.

    The image is resized to fit inside an ``INPUT_SIZE`` x ``INPUT_SIZE``
    square while keeping its aspect ratio, centred on a gray (114) canvas,
    scaled to ``[0, 1]`` float32 and rearranged to ``(1, 3, H, W)``.

    Args:
        image: Image array indexed as (height, width, channels). The canvas
            has 3 channels, so a 3-channel image is assumed -- TODO confirm
            against the caller.

    Returns:
        - Processed image tensor
        - Scale ratio (original to processed)
        - Padding dimensions (width, height) added on the left/top
    """
    # Get original dimensions
    h, w = image.shape[:2]

    # Calculate scale and new dimensions. Round instead of truncating:
    # `h * scale` can land just below the intended integer because of float
    # error, and truncation would then lose a pixel. Clamp to [1, INPUT_SIZE]
    # so extreme aspect ratios never ask cv2.resize for a 0-pixel side.
    scale = min(INPUT_SIZE / h, INPUT_SIZE / w)
    new_h = min(INPUT_SIZE, max(1, int(round(h * scale))))
    new_w = min(INPUT_SIZE, max(1, int(round(w * scale))))

    # INTER_AREA is the right choice for shrinking; when enlarging it gives
    # blocky results, so use bilinear interpolation there instead.
    interpolation = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
    resized = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

    # Create canvas with 114-gray background
    canvas = np.full((INPUT_SIZE, INPUT_SIZE, 3), 114, dtype=np.uint8)

    # Calculate padding offsets (image is centred; any odd pixel goes to the
    # bottom/right side)
    pad_w = (INPUT_SIZE - new_w) // 2
    pad_h = (INPUT_SIZE - new_h) // 2

    # Paste resized image onto canvas
    canvas[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized

    # Convert to float32 and normalize
    processed = canvas.astype(np.float32) / 255.0

    # Transpose to CHW format and add batch dimension
    processed = processed.transpose(2, 0, 1)[None, ...]

    return processed, scale, (pad_w, pad_h)

def process_detections(
    outputs: np.ndarray,
    scale: float,
    padding: Tuple[int, int],
    orig_shape: Tuple[int, int]
) -> Tuple[List[List[int]], List[float], List[int]]:
    """
    Process raw model outputs into usable detections.

    Args:
        outputs: Model outputs; only ``outputs[0]`` is used. After squeezing
            and transposing, each row is treated as
            ``[cx, cy, w, h, score_class0, score_class1, ...]`` in letterboxed
            pixel coordinates (assumes a raw output shaped
            ``(1, 4 + num_classes, num_candidates)`` -- TODO confirm).
        scale: Resize factor that was applied to the original image.
        padding: ``(pad_w, pad_h)`` offsets that were added by letterboxing.
        orig_shape: ``(height, width)`` of the original image.

    Returns:
        - Bounding boxes ``[x1, y1, x2, y2]`` in original-image pixels
        - Confidence scores
        - Class IDs
        NumPy arrays when at least one detection survives, otherwise three
        empty lists.
    """
    # Transpose and squeeze outputs
    predictions = np.squeeze(outputs[0]).T

    # Filter by confidence threshold (best class score per candidate)
    scores = np.max(predictions[:, 4:], axis=1)
    valid = scores > CONF_THRESHOLD
    predictions = predictions[valid]
    scores = scores[valid]

    if predictions.shape[0] == 0:
        return [], [], []

    class_ids = np.argmax(predictions[:, 4:], axis=1)

    # Convert from center/size to corner coordinates in a fresh array so the
    # prediction matrix is not modified through a view.
    centers = predictions[:, 0:2]
    sizes = predictions[:, 2:4]
    top_left = centers - sizes / 2
    corners = np.concatenate([top_left, top_left + sizes], axis=1)

    # Adjust for letterbox padding and scale
    pad_w, pad_h = padding
    corners[:, [0, 2]] = (corners[:, [0, 2]] - pad_w) / scale
    corners[:, [1, 3]] = (corners[:, [1, 3]] - pad_h) / scale

    # Clip coordinates to image dimensions
    corners[:, [0, 2]] = corners[:, [0, 2]].clip(0, orig_shape[1])
    corners[:, [1, 3]] = corners[:, [1, 3]].clip(0, orig_shape[0])

    # Convert to integer coordinates
    boxes = corners.round().astype(int)

    # cv2.dnn.NMSBoxes expects rectangles as [x, y, width, height]; feeding
    # it corner boxes makes it compute IoU on the wrong rectangles.
    nms_rects = boxes.copy()
    nms_rects[:, 2] -= boxes[:, 0]
    nms_rects[:, 3] -= boxes[:, 1]

    # Apply NMS
    indices = cv2.dnn.NMSBoxes(
        nms_rects.tolist(),
        scores.tolist(),
        CONF_THRESHOLD,
        IOU_THRESHOLD
    )

    # Depending on the OpenCV build the result is an empty tuple, a flat
    # array or an N x 1 array; flatten so it can be used as a 1-D index.
    keep = np.asarray(indices, dtype=int).reshape(-1)
    if keep.size == 0:
        return [], [], []

    # Return filtered results
    return boxes[keep], scores[keep], class_ids[keep]

def draw_detections(
    image: np.ndarray,
    boxes: List[List[int]],
    scores: List[float],
    class_ids: List[int]
) -> np.ndarray:
    """Draw bounding boxes and labels on a copy of the image.

    Args:
        image: Image to annotate; it is copied and left unmodified.
        boxes: Boxes as ``[x1, y1, x2, y2]`` pixel coordinates.
        scores: One confidence score per box.
        class_ids: One class index per box, looked up in ``CLASS_NAMES``.

    Returns:
        The annotated copy of ``image``.
    """
    output = image.copy()
    color = (0, 255, 0)  # Green

    for box, score, class_id in zip(boxes, scores, class_ids):
        # Plain Python ints: some OpenCV builds reject NumPy scalar types in
        # point tuples.
        x1, y1, x2, y2 = (int(v) for v in box)
        class_id = int(class_id)

        # Draw bounding box
        cv2.rectangle(output, (x1, y1), (x2, y2), color, 2)

        # Create label; fall back to the numeric id when the model predicts
        # a class that has no entry in CLASS_NAMES instead of raising.
        if 0 <= class_id < len(CLASS_NAMES):
            class_name = CLASS_NAMES[class_id]
        else:
            class_name = f"class {class_id}"
        label = f"{class_name}: {float(score):.2f}"

        # Get text size
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # The label sits on top of the box. For boxes touching the top edge
        # there is no room above, so push it down to stay inside the image.
        label_bottom = max(y1, th + 4)

        # Draw text background
        cv2.rectangle(
            output,
            (x1, label_bottom - th - 4),
            (x1 + tw, label_bottom),
            color,
            -1
        )

        # Draw text
        cv2.putText(
            output,
            label,
            (x1, label_bottom - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
            cv2.LINE_AA
        )

    return output

def inference_frame(frame: np.ndarray) -> np.ndarray:
    """Full processing pipeline for a single frame.

    Letterboxes the frame, runs the ONNX session, converts the raw outputs
    into boxes and draws them.

    Args:
        frame: Image array for the current frame, or ``None`` when the
            stream has not delivered an image.

    Returns:
        The frame with detections drawn on a copy, the unmodified input
        frame when nothing was detected, or ``None`` when ``frame`` is
        ``None``.
    """
    # A streaming source can fire before a frame is available; without this
    # guard the pipeline would fail on `frame.shape`.
    if frame is None:
        return None

    # Preprocess
    input_tensor, scale, padding = letterbox_image(frame)

    # Inference: feed the tensor to the model's first (only used) input
    input_name = ort_session.get_inputs()[0].name
    outputs = ort_session.run(None, {input_name: input_tensor})

    # Post-process
    boxes, scores, class_ids = process_detections(
        outputs,
        scale,
        padding,
        frame.shape[:2]
    )

    # Draw results
    if len(boxes) > 0:
        frame = draw_detections(frame, boxes, scores, class_ids)

    return frame

# Gradio interface setup
# Builds a two-panel page: a streaming webcam input on the left and the
# annotated output image on the right.
with gr.Blocks() as app:
    gr.Markdown("# Real-Time YOLOv8 Object Detection")
    
    with gr.Row():
        # Input component restricted to the webcam source, in streaming mode.
        webcam = gr.Image(
            sources=["webcam"],
            streaming=True,
            label="Webcam Input"
        )
        # Display-only component that receives the processed frame.
        output = gr.Image(
            label="Detections",
            interactive=False
        )
    
    # Stream event: `inference_frame` is called with the webcam image and its
    # return value is written to `output`; the progress display is hidden.
    webcam.stream(
        fn=inference_frame,
        inputs=webcam,
        outputs=output,
        show_progress="hidden"
    )

# Launch the app only when this file is run as a script; `show_error=True`
# is passed so errors are shown.
if __name__ == "__main__":
    app.launch(show_error=True)

# References (ONNX Runtime CUDAExecutionProvider troubleshooting and a related Ultralytics issue):
# https://discuss.huggingface.co/t/failed-to-create-cudaexecutionprovider/26501
# https://stackoverflow.com/questions/75267445/why-does-onnxruntime-fail-to-create-cudaexecutionprovider-in-linuxubuntu-20
# https://github.com/microsoft/onnxruntime/issues/4292
# https://github.com/ultralytics/ultralytics/issues/664