# for Zero GPU Spaces compatibility
# NOTE: `spaces` is deliberately imported before torch / onnxruntime.
import spaces


@spaces.GPU
def dummy_gpu():
    """Placeholder GPU-decorated function kept for Zero GPU Spaces compatibility."""
    pass


import gradio as gr
import numpy as np
import cv2
import torch
import onnxruntime as ort
from optimum.onnxruntime import ORTModel
from ultralytics import YOLO
import os
from typing import Tuple, List, Optional
import subprocess


def install_cuda_toolkit():
    """Download and silently install the CUDA toolkit, then export CUDA env vars.

    Best-effort helper: the exit codes of the ``wget`` / ``chmod`` / installer
    calls are not checked. It is currently not invoked (see the commented-out
    call below).
    """
    print("Installing CUDA Toolkit.")
    #CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_FILE = f"/tmp/{os.path.basename(CUDA_TOOLKIT_URL)}"
    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["PATH"] = f'{os.environ["CUDA_HOME"]}/bin:{os.environ["PATH"]}'
    os.environ["LD_LIBRARY_PATH"] = (
        f'{os.environ["CUDA_HOME"]}/lib:{os.environ.get("LD_LIBRARY_PATH", "")}'
    )
    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"


#install_cuda_toolkit()

# Configuration - UPDATE THESE VALUES
MODEL_PT_PATH = "model.pt"          # Your trained PyTorch model
MODEL_ONNX_PATH = "model.onnx"      # Output ONNX model name
INPUT_SIZE = 640                    # Must match training size
CLASS_NAMES = ["class0", "class1"]  # Your actual class names
CONF_THRESHOLD = 0.5                # Confidence threshold
IOU_THRESHOLD = 0.45                # NMS IoU threshold

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

session_options = ort.SessionOptions()
session_options.log_severity_level = 0


def convert_pt_to_onnx():
    """Convert the PyTorch model to ONNX format if the ONNX file does not exist.

    Raises:
        RuntimeError: if loading, exporting or renaming the model fails.
    """
    print(f'Converting model on {"cuda" if torch.cuda.is_available() else "cpu"}')
    if os.path.exists(MODEL_ONNX_PATH):
        return

    print("Converting PyTorch model to ONNX...")
    try:
        # Load trained YOLO model
        model = YOLO(MODEL_PT_PATH)

        # Export to ONNX with correct parameters
        exported_path = model.export(
            format="onnx",
            imgsz=INPUT_SIZE,
            opset=12,
            simplify=True,
            dynamic=False,
            half=False,  # Disable for maximum compatibility
        )

        # The export is named after the weights file, so it may already sit at
        # MODEL_ONNX_PATH. Only move a file when the target is still missing,
        # preferring the path reported by the exporter over the legacy
        # hard-coded default name (which could be a stale, unrelated file).
        if not os.path.exists(MODEL_ONNX_PATH):
            for candidate in (exported_path, "yolov8n.onnx"):
                if isinstance(candidate, (str, os.PathLike)) and os.path.exists(candidate):
                    os.rename(candidate, MODEL_ONNX_PATH)
                    break

        print("ONNX conversion successful!")
    except Exception as e:
        raise RuntimeError(f"ONNX conversion failed: {str(e)}") from e


def load_onnx_model() -> ort.InferenceSession:
    """Initialize the ONNX runtime session for MODEL_ONNX_PATH.

    Requests the CUDA execution provider when DEVICE is not "cpu", otherwise
    the CPU execution provider.

    Raises:
        RuntimeError: if the session cannot be created.
    """
    print(f'Loading model on {"cuda" if torch.cuda.is_available() else "cpu"}')
    provider = "CUDAExecutionProvider" if DEVICE != "cpu" else "CPUExecutionProvider"
    try:
        return ORTModel.load_model(
            MODEL_ONNX_PATH,
            provider=provider,
            session_options=session_options,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load ONNX model: {str(e)}") from e


# Initialize model
convert_pt_to_onnx()
ort_session = load_onnx_model()
print("Available Providers: ", ort_session.get_providers())
#assert "CUDAExecutionProvider" in ort_session.get_providers()


def letterbox_image(image: np.ndarray) -> Tuple[np.ndarray, float, Tuple[int, int]]:
    """
    Preprocess image using YOLO's letterboxing method

    Returns:
    - Processed image tensor (float32, NCHW, values scaled to [0, 1])
    - Scale ratio (original to processed)
    - Padding dimensions (width, height)
    """
    # Get original dimensions
    h, w = image.shape[:2]

    # Calculate scale and new dimensions; clamp to 1px so extreme aspect
    # ratios cannot produce a zero-sized resize target.
    scale = min(INPUT_SIZE / h, INPUT_SIZE / w)
    new_h = max(1, int(h * scale))
    new_w = max(1, int(w * scale))

    # Resize with antialiasing
    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Create canvas with 114-gray background
    canvas = np.full((INPUT_SIZE, INPUT_SIZE, 3), 114, dtype=np.uint8)

    # Calculate padding offsets
    pad_w = (INPUT_SIZE - new_w) // 2
    pad_h = (INPUT_SIZE - new_h) // 2

    # Paste resized image onto canvas
    canvas[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized

    # Convert to float32 and normalize
    processed = canvas.astype(np.float32) / 255.0

    # Transpose to CHW format and add batch dimension
    processed = processed.transpose(2, 0, 1)[None, ...]

    return processed, scale, (pad_w, pad_h)


def process_detections(
    outputs: np.ndarray,
    scale: float,
    padding: Tuple[int, int],
    orig_shape: Tuple[int, int]
) -> Tuple[List[List[int]], List[float], List[int]]:
    """
    Process raw model outputs into usable detections

    The first output is squeezed and transposed so that each row is one
    candidate: columns 0-3 are treated as a center-format box (cx, cy, w, h)
    in letterboxed pixels and columns 4+ as per-class scores.

    Args:
    - outputs: raw outputs of the ONNX session (only outputs[0] is used)
    - scale: resize factor returned by letterbox_image
    - padding: (pad_w, pad_h) returned by letterbox_image
    - orig_shape: (height, width) of the original frame

    Returns:
    - List of bounding boxes [x1, y1, x2, y2]
    - List of confidence scores
    - List of class IDs
    """
    # Transpose and squeeze outputs
    predictions = np.squeeze(outputs[0]).T

    # Filter by confidence threshold
    scores = np.max(predictions[:, 4:], axis=1)
    valid = scores > CONF_THRESHOLD
    predictions = predictions[valid]
    scores = scores[valid]

    if predictions.shape[0] == 0:
        return [], [], []

    # Extract boxes and classes
    boxes = predictions[:, :4]
    class_ids = np.argmax(predictions[:, 4:], axis=1)

    # Convert from center to corner coordinates
    boxes[:, [0, 1]] = boxes[:, [0, 1]] - boxes[:, [2, 3]] / 2  # xy top-left
    boxes[:, [2, 3]] = boxes[:, [0, 1]] + boxes[:, [2, 3]]      # xy bottom-right

    # Adjust for letterbox padding and scale
    pad_w, pad_h = padding
    boxes[:, [0, 2]] = (boxes[:, [0, 2]] - pad_w) / scale
    boxes[:, [1, 3]] = (boxes[:, [1, 3]] - pad_h) / scale

    # Clip coordinates to image dimensions
    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, orig_shape[1])
    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, orig_shape[0])

    # Convert to integer coordinates
    boxes = boxes.round().astype(int)

    # cv2.dnn.NMSBoxes expects rectangles as [x, y, width, height], not
    # corner pairs, so build a separate xywh view just for suppression.
    nms_boxes = boxes.copy()
    nms_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
    nms_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]

    # Apply NMS
    indices = cv2.dnn.NMSBoxes(
        nms_boxes.tolist(),
        scores.tolist(),
        CONF_THRESHOLD,
        IOU_THRESHOLD
    )

    # Depending on the OpenCV version the result is an empty tuple, a flat
    # array, or an Nx1 array; normalise to a flat integer index array.
    keep = np.asarray(indices, dtype=int).reshape(-1)
    if keep.size == 0:
        return [], [], []

    # Return filtered results
    return boxes[keep], scores[keep], class_ids[keep]


def draw_detections(
    image: np.ndarray,
    boxes: List[List[int]],
    scores: List[float],
    class_ids: List[int]
) -> np.ndarray:
    """Draw bounding boxes and labels on a copy of the image and return it."""
    output = image.copy()
    color = (0, 255, 0)  # Green

    for box, score, class_id in zip(boxes, scores, class_ids):
        # Plain Python ints keep OpenCV's argument parsing happy.
        x1, y1, x2, y2 = (int(v) for v in box)
        class_id = int(class_id)

        # Draw bounding box
        cv2.rectangle(output, (x1, y1), (x2, y2), color, 2)

        # Create label; fall back to the numeric id when CLASS_NAMES does not
        # cover every class the model predicts.
        if 0 <= class_id < len(CLASS_NAMES):
            class_name = CLASS_NAMES[class_id]
        else:
            class_name = str(class_id)
        label = f"{class_name}: {score:.2f}"

        # Get text size
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # Keep the label inside the image when the box touches the top edge;
        # otherwise this is the usual position just above the box.
        label_top = max(y1 - th - 4, 0)
        label_bottom = label_top + th + 4

        # Draw text background
        cv2.rectangle(
            output,
            (x1, label_top),
            (x1 + tw, label_bottom),
            color,
            -1
        )

        # Draw text
        cv2.putText(
            output,
            label,
            (x1, label_bottom - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
            cv2.LINE_AA
        )

    return output


def inference_frame(frame: Optional[np.ndarray]) -> Optional[np.ndarray]:
    """Full processing pipeline for single frame

    Returns the frame with detections drawn on it, the unchanged frame when
    nothing is detected, or None when no frame was supplied.
    """
    # The streaming callback can fire before the webcam delivers a frame.
    if frame is None:
        return None

    # Preprocess
    input_tensor, scale, padding = letterbox_image(frame)

    # Inference
    outputs = ort_session.run(
        None,
        {ort_session.get_inputs()[0].name: input_tensor}
    )

    # Post-process
    boxes, scores, class_ids = process_detections(
        outputs, scale, padding, frame.shape[:2]
    )

    # Draw results
    if len(boxes) > 0:
        frame = draw_detections(frame, boxes, scores, class_ids)

    return frame


# Gradio interface setup
with gr.Blocks() as app:
    gr.Markdown("# Real-Time YOLOv8 Object Detection")

    with gr.Row():
        webcam = gr.Image(
            sources=["webcam"],
            streaming=True,
            label="Webcam Input"
        )
        output = gr.Image(
            label="Detections",
            interactive=False
        )

    webcam.stream(
        fn=inference_frame,
        inputs=webcam,
        outputs=output,
        show_progress="hidden"
    )

if __name__ == "__main__":
    app.launch(show_error=True)

# https://discuss.huggingface.co/t/failed-to-create-cudaexecutionprovider/26501
# https://stackoverflow.com/questions/75267445/why-does-onnxruntime-fail-to-create-cudaexecutionprovider-in-linuxubuntu-20
# https://github.com/microsoft/onnxruntime/issues/4292
# https://github.com/ultralytics/ultralytics/issues/664