Spaces:

minh9972t12
/

yolocar

Running

App Files Files Community

minh9972t12 committed 14 days ago

Commit

c534101

1 Parent(s): 5cc72f5

Update src/detection.py

Browse files

Files changed (1) hide show

src/detection.py +267 -140

src/detection.py CHANGED Viewed

@@ -1,14 +1,52 @@
 import numpy as np
-from typing import List, Dict, Tuple
 import cv2
 from pathlib import Path
 import yaml
 class YOLOv11Detector:
-    """YOLOv11 detector for car damage detection"""
     def __init__(self, config_path: str = "config.yaml"):
-        """Initialize YOLOv11 detector with configuration"""
         with open(config_path, 'r') as f:
             self.config = yaml.safe_load(f)
@@ -36,8 +74,11 @@ class YOLOv11Detector:
         self.iou_threshold = self.config['model']['iou_threshold']
         self.classes = self.config['detection']['classes']
-        # Load model - Ultralytics YOLO supports both .pt and .onnx
-        self._load_model()
     def _load_pytorch_model(self):
         """Load PyTorch model using Ultralytics"""
@@ -45,43 +86,146 @@ class YOLOv11Detector:
         self.model = YOLO(self.model_path)
         # Set model to appropriate device
-        if self.device == 'cuda:0':
             self.model.to('cuda')
-        print(f"Loaded PyTorch model: {self.model_path}")
-    def _load_onnx_model(self):
-        """Load ONNX model using OpenCV DNN"""
-        self.net = cv2.dnn.readNet(self.model_path)
-        # Set backend based on device
-        if self.device == 'cuda:0':
-            self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
-            self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
-        else:
-            self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
-            self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
-        print(f"Loaded ONNX model: {self.model_path}")
     def detect(self, image: np.ndarray) -> Dict:
         """
-        Perform detection on image
         Args:
             image: Input image as numpy array (BGR format)
         Returns:
             Dictionary containing detection results
         """
-        # Run YOLO inference (works for both PT and ONNX)
         results = self.model(
             image,
             conf=self.confidence,
             iou=self.iou_threshold,
-            device=self.device if not self.model_path.endswith('.onnx') else 'cpu',
             verbose=False
         )
-        # Parse results (same for both PT and ONNX)
         detections = {
             'boxes': [],
             'confidences': [],
@@ -91,21 +235,11 @@ class YOLOv11Detector:
         if len(results) > 0 and results[0].boxes is not None:
             boxes = results[0].boxes
             for box in boxes:
-                # Get box coordinates (xyxy format)
                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-                # Get confidence and class
                 conf = float(box.conf[0].cpu().numpy())
                 cls_id = int(box.cls[0].cpu().numpy())
-                # Map class ID to class name
-                if cls_id < len(self.classes):
-                    cls_name = self.classes[cls_id]
-                else:
-                    cls_name = f"class_{cls_id}"
                 detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
                 detections['confidences'].append(conf)
                 detections['classes'].append(cls_name)
@@ -113,139 +247,132 @@ class YOLOv11Detector:
         return detections
-    def _detect_pytorch(self, image: np.ndarray) -> Dict:
-        """Detection using PyTorch model"""
-        # Run YOLO inference
-        results = self.model(
-            image,
-            conf=self.confidence,
-            iou=self.iou_threshold,
-            device=self.device,
-            verbose=False
-        )
-        # Parse results
-        detections = {
-            'boxes': [],
-            'confidences': [],
-            'classes': [],
-            'class_ids': []
-        }
-        if len(results) > 0 and results[0].boxes is not None:
-            boxes = results[0].boxes
-            for box in boxes:
-                # Get box coordinates (xyxy format)
-                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-                # Get confidence and class
-                conf = float(box.conf[0].cpu().numpy())
-                cls_id = int(box.cls[0].cpu().numpy())
-                # Map class ID to class name
-                if cls_id < len(self.classes):
-                    cls_name = self.classes[cls_id]
-                else:
-                    cls_name = f"class_{cls_id}"
-                detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
-                detections['confidences'].append(conf)
-                detections['classes'].append(cls_name)
-                detections['class_ids'].append(cls_id)
-        return detections
-    def _detect_onnx(self, image: np.ndarray) -> Dict:
-        """Detection using ONNX model (compatible with original code)"""
-        height, width = image.shape[:2]
-        # Preprocess image for ONNX
-        blob = cv2.dnn.blobFromImage(
-            image, 1/255.0, (640, 640),
-            swapRB=True, crop=False
-        )
-        self.net.setInput(blob)
-        preds = self.net.forward()
-        preds = preds.transpose((0, 2, 1))
-        # Extract outputs
-        detections = self._extract_onnx_output(
-            preds=preds,
-            image_shape=(height, width),
-            input_shape=(640, 640)
-        )
-        return detections
-    def _extract_onnx_output(self, preds: np.ndarray, image_shape: Tuple[int, int],
-                            input_shape: Tuple[int, int]) -> Dict:
-        """Extract detection results from ONNX model output"""
-        class_ids, confs, boxes = [], [], []
-        image_height, image_width = image_shape
-        input_height, input_width = input_shape
-        x_factor = image_width / input_width
-        y_factor = image_height / input_height
-        rows = preds[0].shape[0]
-        for i in range(rows):
-            row = preds[0][i]
-            conf = row[4]
-            classes_score = row[4:]
-            _, _, _, max_idx = cv2.minMaxLoc(classes_score)
-            class_id = max_idx[1]
-            if classes_score[class_id] > self.confidence:
-                confs.append(float(conf))
-                label = self.classes[int(class_id)] if int(class_id) < len(self.classes) else f"class_{class_id}"
-                class_ids.append(label)
-                # Extract boxes
-                x, y, w, h = row[0].item(), row[1].item(), row[2].item(), row[3].item()
-                left = int((x - 0.5 * w) * x_factor)
-                top = int((y - 0.5 * h) * y_factor)
-                width = int(w * x_factor)
-                height = int(h * y_factor)
-                box = [left, top, left + width, top + height]
-                boxes.append(box)
         # Apply NMS
-        if len(boxes) > 0:
             indices = cv2.dnn.NMSBoxes(
-                [[b[0], b[1], b[2]-b[0], b[3]-b[1]] for b in boxes],
-                confs, self.confidence, self.iou_threshold
             )
             if len(indices) > 0:
                 indices = indices.flatten()
                 return {
-                    'boxes': [boxes[i] for i in indices],
-                    'confidences': [confs[i] for i in indices],
-                    'classes': [class_ids[i] for i in indices],
-                    'class_ids': list(range(len(indices)))
                 }
         return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
     def detect_batch(self, images: List[np.ndarray]) -> List[Dict]:
-        """Detect on multiple images"""
-        return [self.detect(img) for img in images]
-    def _load_model(self):
-        """Load model using Ultralytics (supports both PT and ONNX)"""
-        from ultralytics import YOLO
-        # Ultralytics YOLO can load both .pt and .onnx files
-        self.model = YOLO(self.model_path)
-        # Set model to appropriate device for PT models
-        # For ONNX, device is handled differently
-        if not self.model_path.endswith('.onnx'):
-            if self.device == 'cuda:0':
-                self.model.to('cuda')
-        model_type = "ONNX" if self.model_path.endswith('.onnx') else "PyTorch"
-        print(f"Loaded {model_type} model: {self.model_path}")

 import numpy as np
+from typing import List, Dict, Tuple
 import cv2
 from pathlib import Path
+import torch
 import yaml
+import onnxruntime as ort
+import os
+import psutil
+def _get_optimal_threads():
+    """Pick the intra-op thread count for this machine.
+
+    Uses the number of physical cores reported by psutil and falls back
+    to 4 when that count is unavailable (None or 0). The detected core
+    counts and the chosen value are printed.
+
+    Returns:
+        The number of intra-op threads to use.
+    """
+    n_physical = psutil.cpu_count(logical=False)
+    n_logical = psutil.cpu_count(logical=True)
+
+    # One thread per physical core; 4 is the fallback for an unknown count.
+    chosen = n_physical or 4
+
+    print(f"System info: {n_physical} physical cores, {n_logical} logical cores")
+    print(f"Using {chosen} intra-op threads for optimal performance")
+    return chosen
+def _preprocess_image_optimized(image: np.ndarray) -> Tuple[np.ndarray, int, int]:
+    """Convert a BGR image into the 1x3x640x640 float32 network input.
+
+    The image is stretched to 640x640 (no letterboxing, so the aspect
+    ratio is not preserved), converted from BGR to RGB, scaled by 1/255,
+    reordered from HWC to CHW and given a leading batch axis.
+
+    Args:
+        image: Input image as numpy array (BGR format).
+    Returns:
+        A tuple ``(batch, height, width)`` where ``batch`` is the
+        preprocessed tensor and ``height``/``width`` are the dimensions of
+        the original image, needed later to scale boxes back.
+    """
+    height, width = image.shape[:2]
+    # Plain resize to the fixed network input size.
+    img_resized = cv2.resize(image, (640, 640), interpolation=cv2.INTER_LINEAR)
+    # The network input is built in RGB channel order; OpenCV images are BGR.
+    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
+    # Convert to float32 and scale by 1/255.
+    img_normalized = img_rgb.astype(np.float32, copy=False) / 255.0
+    # HWC -> CHW, then add the batch dimension.
+    img_transposed = np.transpose(img_normalized, (2, 0, 1))
+    img_batch = np.expand_dims(img_transposed, axis=0)
+    return img_batch, height, width
 class YOLOv11Detector:
+    """YOLOv11 detector optimized for ONNX Runtime v1.19 with opset 21"""
     def __init__(self, config_path: str = "config.yaml"):
+        """Initialize YOLOv11 detector with maximum ONNX Runtime optimizations"""
         with open(config_path, 'r') as f:
             self.config = yaml.safe_load(f)
         self.iou_threshold = self.config['model']['iou_threshold']
         self.classes = self.config['detection']['classes']
+        # Load model based on extension
+        if self.model_path.endswith('.onnx'):
+            self._load_onnx_model_optimized()  # Will use optimizations for ONNX models
+        else:
+            self._load_pytorch_model()  # Keep original PyTorch logic
     def _load_pytorch_model(self):
         """Load PyTorch model using Ultralytics"""
         self.model = YOLO(self.model_path)
         # Set model to appropriate device
+        if self.device == 'cuda:0' and torch.cuda.is_available():
             self.model.to('cuda')
+        else:
+            self.model.to('cpu')
+        print(f"Loaded PyTorch model: {self.model_path} on device: {self.device}")
+    def _load_onnx_model_optimized(self):
+        """Create the ONNX Runtime session for ``self.model_path``.
+
+        Builds the session options (full graph optimization, sequential
+        execution, memory pattern/arena, thread spinning), selects the
+        execution providers, exports OpenMP-related environment variables
+        and then creates the session.
+
+        Sets ``self.session``, ``self.input_name`` and ``self.output_name``
+        (the first input and the first output of the model). Progress is
+        reported with ``print``.
+        """
+        # Get optimal thread count
+        intra_threads = _get_optimal_threads()
+        sess_options = ort.SessionOptions()
+        # === GRAPH OPTIMIZATIONS (Level: ALL) ===
+        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        # === THREADING OPTIMIZATIONS ===
+        # Intra-op parallelism (within operators) - use physical cores
+        sess_options.intra_op_num_threads = intra_threads
+        # Inter-op parallelism (between operators) - keep at 1 for sequential execution
+        sess_options.inter_op_num_threads = 1
+        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        # === MEMORY OPTIMIZATIONS ===
+        sess_options.enable_mem_pattern = True
+        sess_options.enable_cpu_mem_arena = True
+        # === CPU PERFORMANCE OPTIMIZATIONS ===
+        # Allow threads to spin waiting for work (trades CPU for latency)
+        sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        sess_options.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        sess_options.add_session_config_entry("session.intra_op.dynamic_block_base", "4")
+        # For systems with >64 logical cores, use lock-free queues
+        logical_cores = psutil.cpu_count(logical=True)
+        if logical_cores and logical_cores > 64:
+            sess_options.add_session_config_entry("session.use_lock_free_queue", "1")
+            print("Enabled lock-free queues for high-core system")
+        # Profiling stays off
+        sess_options.enable_profiling = False
+        # === EXECUTION PROVIDER CONFIGURATION ===
+        # providers and provider_options are parallel lists: entry i of one
+        # belongs to entry i of the other.
+        providers = []
+        provider_options = []
+        if self.device == 'cuda:0' and ort.get_device() == 'GPU':
+            cuda_options = {
+                'device_id': 0,
+                'arena_extend_strategy': 'kNextPowerOfTwo',
+                'gpu_mem_limit': 2 * 1024 * 1024 * 1024,  # 2GB limit
+                'cudnn_conv_algo_search': 'EXHAUSTIVE',
+                'do_copy_in_default_stream': True
+            }
+            providers.append('CUDAExecutionProvider')
+            provider_options.append(cuda_options)
+            print("CUDA EP configured with optimizations")
+        # The CPU provider is always appended last as the fallback.
+        cpu_options = {
+            'use_arena': True,
+            'arena_extend_strategy': 'kSameAsRequested'
+        }
+        providers.append('CPUExecutionProvider')
+        provider_options.append(cpu_options)
+        # === OPENMP ENVIRONMENT VARIABLES ===
+        # NOTE(review): onnxruntime is already imported at this point, so
+        # these may be read too late to have any effect - confirm.
+        os.environ['OMP_NUM_THREADS'] = str(intra_threads)
+        os.environ['OMP_WAIT_POLICY'] = 'ACTIVE'
+        os.environ['OMP_NESTED'] = '0'
+        os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
+        print(f"OpenMP configuration: threads={intra_threads}, policy=ACTIVE")
+        # === CREATE OPTIMIZED SESSION ===
+        self.session = ort.InferenceSession(
+            self.model_path,
+            sess_options=sess_options,
+            providers=providers,
+            provider_options=provider_options
+        )
+        # Only the first input and the first output are used for inference.
+        self.input_name = self.session.get_inputs()[0].name
+        self.output_name = self.session.get_outputs()[0].name
+        # Model metadata is informational only: a failure here must not stop
+        # the already-created session from being used, but it is reported
+        # instead of being silently swallowed, and KeyboardInterrupt /
+        # SystemExit are no longer caught.
+        try:
+            model_meta = self.session.get_modelmeta()
+            print(f"Model metadata - Domain: {getattr(model_meta, 'domain', 'N/A')}")
+        except Exception as exc:
+            print(f"Model metadata unavailable: {exc}")
+        # The first entry of get_providers() is reported as the active one.
+        provider_used = self.session.get_providers()[0]
+        print(f"✅ ONNX Runtime v{ort.__version__} - Optimized session created")
+        print(f"📈 Provider: {provider_used}")
+        print(f"🧵 Threading: {intra_threads} intra-op threads, sequential execution")
+        print(f"🚀 Optimizations: Graph=ALL, Memory=Enabled, Spinning=Enabled, Dynamic=Enabled")
     def detect(self, image: np.ndarray) -> Dict:
         """
+        Perform detection on image with maximum optimization
+
+        The backend is selected from the model file name: paths ending in
+        '.onnx' are handled by _detect_onnx_optimized, everything else by
+        _detect_pytorch.
+
         Args:
             image: Input image as numpy array (BGR format)
         Returns:
             Dictionary containing detection results
         """
+        # Dispatch purely on the file extension of self.model_path.
+        if self.model_path.endswith('.onnx'):
+            return self._detect_onnx_optimized(image)
+        else:
+            return self._detect_pytorch(image)
+    def _detect_pytorch(self, image: np.ndarray) -> Dict:
+        """Detection using PyTorch model"""
+        from ultralytics import YOLO
         results = self.model(
             image,
             conf=self.confidence,
             iou=self.iou_threshold,
+            device=self.device if 'cuda' in self.device and torch.cuda.is_available() else 'cpu',
             verbose=False
         )
+        # Parse results
         detections = {
             'boxes': [],
             'confidences': [],
         if len(results) > 0 and results[0].boxes is not None:
             boxes = results[0].boxes
             for box in boxes:
                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                 conf = float(box.conf[0].cpu().numpy())
                 cls_id = int(box.cls[0].cpu().numpy())
+                cls_name = self.classes[cls_id] if cls_id < len(self.classes) else f"class_{cls_id}"
                 detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
                 detections['confidences'].append(conf)
                 detections['classes'].append(cls_name)
         return detections
+    def _detect_onnx_optimized(self, image: np.ndarray) -> Dict:
+        """Run one image through the ONNX Runtime session.
+
+        Preprocesses the image, feeds it to ``self.session`` and parses the
+        single requested output into a detection dictionary.
+
+        Args:
+            image: Input image as numpy array (BGR format).
+        Returns:
+            Dictionary containing detection results.
+        """
+        tensor, src_height, src_width = _preprocess_image_optimized(image)
+        # Only self.output_name is requested, so the result list has one entry.
+        raw_outputs = self.session.run([self.output_name], {self.input_name: tensor})
+        return self._parse_yolo_output_optimized(raw_outputs[0], src_height, src_width)
+    def _parse_yolo_output_optimized(self, output: np.ndarray, orig_height: int, orig_width: int) -> Dict:
+        """Turn the raw network output into scaled, NMS-filtered detections.
+
+        Args:
+            output: Raw model output; indexed as output[0] and transposed so
+                that each row is one candidate. Columns 0-3 are read as
+                centre x, centre y, width, height and the following
+                len(self.classes) columns as per-class scores.
+                NOTE(review): the [1, 4 + classes, candidates] layout is
+                assumed, not checked - confirm against the exported model.
+            orig_height: Height of the original image, used to scale boxes
+                back from the 640-pixel network input.
+            orig_width: Width of the original image, used the same way.
+        Returns:
+            Dictionary with 'boxes' ([x1, y1, x2, y2] ints), 'confidences',
+            'classes' (names) and 'class_ids'; all lists are empty when
+            nothing passes the confidence threshold or NMS.
+        """
+        # Drop the batch axis and put one candidate per row.
+        # (e.g. [1, 84, 8400] -> [8400, 84] for an 80-class model)
+        output = output[0].transpose(1, 0)
+        num_classes = len(self.classes)
+        # Scale factors from the 640x640 network input back to the original image.
+        x_factor = orig_width / 640.0
+        y_factor = orig_height / 640.0
+        # Best class score per candidate is used as its confidence.
+        class_scores = output[:, 4:4 + num_classes]
+        max_confidences = np.max(class_scores, axis=1)
+        # Boolean mask of candidates above the confidence threshold.
+        valid_indices = max_confidences > self.confidence
+        if not np.any(valid_indices):
+            return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
+        # Keep only the candidates that passed the threshold.
+        valid_output = output[valid_indices]
+        valid_confidences = max_confidences[valid_indices]
+        valid_class_ids = np.argmax(class_scores[valid_indices], axis=1)
+        # Centre/size boxes scaled to original-image pixels, then corners.
+        cx = valid_output[:, 0] * x_factor
+        cy = valid_output[:, 1] * y_factor
+        w = valid_output[:, 2] * x_factor
+        h = valid_output[:, 3] * y_factor
+        x1 = cx - w / 2
+        y1 = cy - h / 2
+        x2 = cx + w / 2
+        y2 = cy + h / 2
+        # NMSBoxes is given boxes as (x, y, width, height).
+        boxes_for_nms = np.column_stack([x1, y1, w, h])
         # Apply NMS
+        # No class ids are passed, so suppression runs across all classes together.
+        if len(boxes_for_nms) > 0:
             indices = cv2.dnn.NMSBoxes(
+                boxes_for_nms.tolist(),
+                valid_confidences.tolist(),
+                self.confidence,
+                self.iou_threshold
             )
             if len(indices) > 0:
                 indices = indices.flatten()
+                # Gather the surviving candidates as plain Python values.
+                final_boxes = [[int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])] for i in indices]
+                final_confs = [float(valid_confidences[i]) for i in indices]
+                final_class_ids = [int(valid_class_ids[i]) for i in indices]
+                final_classes = [
+                    self.classes[cls_id] if cls_id < num_classes else f"class_{cls_id}"
+                    for cls_id in final_class_ids
+                ]
                 return {
+                    'boxes': final_boxes,
+                    'confidences': final_confs,
+                    'classes': final_classes,
+                    'class_ids': final_class_ids
                 }
         return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
     def detect_batch(self, images: List[np.ndarray]) -> List[Dict]:
+        """Run detection on several images and return one result per image.
+
+        Args:
+            images: Input images as numpy arrays.
+        Returns:
+            List of detection dictionaries, one per input image.
+        """
+        # ONNX models are routed to the dedicated batch helper; any other
+        # model type is processed one image at a time through detect().
+        if self.model_path.endswith('.onnx'):
+            return self._detect_batch_onnx_optimized(images)
+        else:
+            return [self.detect(img) for img in images]
+    def _detect_batch_onnx_optimized(self, images: List[np.ndarray]) -> List[Dict]:
+        """Run ONNX detection over a list of images.
+
+        Each image is passed through ``detect`` on its own, in order. An
+        empty list yields an empty result; the previous chunked loop
+        computed a step of ``min(4, 0) == 0`` for that case and raised
+        ``ValueError`` from ``range``.
+
+        Args:
+            images: Input images as numpy arrays (BGR format).
+        Returns:
+            List of detection dictionaries, in the same order as ``images``.
+        """
+        # Grouping the images into chunks had no effect on the work done,
+        # since every image is still run individually.
+        return [self.detect(img) for img in images]
+    def get_performance_info(self) -> Dict:
+        """Describe the detector's current configuration.
+
+        Returns:
+            Dictionary with the model path and type, the thresholds and the
+            class list. When an ONNX Runtime session exists, the ONNX
+            Runtime version, the session's providers and fixed labels for
+            the enabled optimizations are included as well.
+        """
+        has_session = hasattr(self, 'session')
+        is_onnx = self.model_path.endswith('.onnx')
+        info = {
+            "model_path": self.model_path,
+            "model_type": "ONNX" if is_onnx else "PyTorch",
+            "onnx_version": ort.__version__ if has_session else None,
+            "confidence_threshold": self.confidence,
+            "iou_threshold": self.iou_threshold,
+            "classes": self.classes
+        }
+        if not has_session:
+            return info
+        # Session-specific details are only available on the ONNX path.
+        info["providers"] = self.session.get_providers()
+        info["optimization_level"] = "ORT_ENABLE_ALL"
+        info["memory_optimizations"] = "Enabled"
+        info["threading_optimizations"] = "Enabled"
+        info["dynamic_cost_model"] = "Enabled"
+        return info