# yolocar/src/detection.py
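"""YOLOv11 detection module: ONNX Runtime (CPU/CUDA) and PyTorch (Ultralytics) backends."""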
import os
from pathlib import Path
from typing import Dict, List, Tuple

import cv2
import numpy as np
import onnxruntime as ort
import psutil
import torch
import yaml


def _get_optimal_threads():
    """Calculate the optimal ONNX Runtime thread count for the current system"""
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    # Intra-op parallelism scales best with the number of physical cores;
    # fall back to 4 if psutil cannot determine the core count.
    intra_threads = physical_cores if physical_cores else 4
    print(f"System info: {physical_cores} physical cores, {logical_cores} logical cores")
    print(f"Using {intra_threads} intra-op threads for optimal performance")
    return intra_threads


def _preprocess_image_optimized(image: np.ndarray) -> Tuple[np.ndarray, int, int]:
    """Preprocess a BGR image into a 640x640 NCHW float32 batch with minimal overhead.

    Returns the input tensor plus the original height and width, which are
    needed later to rescale detections back to the source image.
    """
    height, width = image.shape[:2]
    # Resize to the fixed 640x640 network input size
    img_resized = cv2.resize(image, (640, 640), interpolation=cv2.INTER_LINEAR)
    # OpenCV decodes images as BGR; the model expects RGB
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
    # Normalize to [0, 1] without an extra copy, then HWC -> CHW and add batch dim
    img_normalized = img_rgb.astype(np.float32, copy=False) / 255.0
    img_transposed = np.transpose(img_normalized, (2, 0, 1))
    img_batch = np.expand_dims(img_transposed, axis=0)
    return img_batch, height, width
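

# Example: a 1280x720 BGR frame yields a (1, 3, 640, 640) float32 tensor, and
# (720, 1280) is returned so detections can be rescaled to the original frame.

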
class YOLOv11Detector:
"""YOLOv11 detector optimized for ONNX Runtime v1.19 with opset 21"""
def __init__(self, config_path: str = "config.yaml"):
"""Initialize YOLOv11 detector with maximum ONNX Runtime optimizations"""
with open(config_path, 'r') as f:
self.config = yaml.safe_load(f)
model_path = self.config['model']['path']
# Check which model file exists
if not Path(model_path).exists():
# Try to find available model files
model_dir = Path("models")
if (model_dir / "best.pt").exists():
model_path = str(model_dir / "best.pt")
print(f"Using best.pt model from training")
elif (model_dir / "last.pt").exists():
model_path = str(model_dir / "last.pt")
print(f"Using last.pt checkpoint model")
elif (model_dir / "best.onnx").exists():
model_path = str(model_dir / "best.onnx")
print(f"Using best.onnx model")
else:
raise FileNotFoundError(f"No model files found in models/ directory!")
self.model_path = model_path
self.device = self.config['model']['device']
self.confidence = self.config['model']['confidence']
self.iou_threshold = self.config['model']['iou_threshold']
self.classes = self.config['detection']['classes']
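        # The reads above imply a config.yaml shaped roughly like this
        # (hypothetical values for illustration only):
        #   model:
        #     path: models/best.onnx
        #     device: cuda:0        # or "cpu"
        #     confidence: 0.25
        #     iou_threshold: 0.45
        #   detection:
        #     classes: [car, truck, bus]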
# Load model based on extension
if self.model_path.endswith('.onnx'):
self._load_onnx_model_optimized() # Will use optimizations for ONNX models
else:
self._load_pytorch_model() # Keep original PyTorch logic
def _load_pytorch_model(self):
"""Load PyTorch model using Ultralytics"""
from ultralytics import YOLO
self.model = YOLO(self.model_path)
# Set model to appropriate device
if self.device == 'cuda:0' and torch.cuda.is_available():
self.model.to('cuda')
else:
self.model.to('cpu')
print(f"Loaded PyTorch model: {self.model_path} on device: {self.device}")
def _load_onnx_model_optimized(self):
"""Load ONNX model with MAXIMUM optimizations for v1.19 + opset 21"""
# Get optimal thread count
intra_threads = _get_optimal_threads()
# Configure MAXIMUM performance session options
sess_options = ort.SessionOptions()
# === GRAPH OPTIMIZATIONS (Level: ALL) ===
# Enable ALL optimizations including layout optimizations
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# === THREADING OPTIMIZATIONS ===
# Intra-op parallelism (within operators) - use physical cores
sess_options.intra_op_num_threads = intra_threads
# Inter-op parallelism (between operators) - keep at 1 for sequential execution
# Sequential execution often performs better than parallel for single inference
sess_options.inter_op_num_threads = 1
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
# === MEMORY OPTIMIZATIONS ===
# Enable memory pattern optimization (reduces memory allocation overhead)
sess_options.enable_mem_pattern = True
# Enable memory arena optimization (better memory reuse)
sess_options.enable_cpu_mem_arena = True
# === CPU PERFORMANCE OPTIMIZATIONS ===
# Allow threads to spin waiting for work (trades CPU for latency)
sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1")
sess_options.add_session_config_entry("session.inter_op.allow_spinning", "1")
# Dynamic cost model for better load balancing (reduces latency variance)
# Best value for dynamic_block_base is 4 according to docs
sess_options.add_session_config_entry("session.intra_op.dynamic_block_base", "4")
# For systems with >64 logical cores, use lock-free queues
logical_cores = psutil.cpu_count(logical=True)
if logical_cores and logical_cores > 64:
sess_options.add_session_config_entry("session.use_lock_free_queue", "1")
print("Enabled lock-free queues for high-core system")
# Disable profiling in production for best performance
sess_options.enable_profiling = False
# === EXECUTION PROVIDER CONFIGURATION ===
providers = []
provider_options = []
if self.device == 'cuda:0' and ort.get_device() == 'GPU':
# CUDA EP with optimizations
cuda_options = {
'device_id': 0,
'arena_extend_strategy': 'kNextPowerOfTwo', # Better memory allocation
'gpu_mem_limit': 2 * 1024 * 1024 * 1024, # 2GB limit
'cudnn_conv_algo_search': 'EXHAUSTIVE', # Find best conv algorithms
'do_copy_in_default_stream': True # Better stream utilization
}
providers.append('CUDAExecutionProvider')
provider_options.append(cuda_options)
print("CUDA EP configured with optimizations")
# CPU EP with OpenMP optimizations (always fallback)
cpu_options = {
'use_arena': True,
'arena_extend_strategy': 'kSameAsRequested'
}
providers.append('CPUExecutionProvider')
provider_options.append(cpu_options)
# === SET OPENMP ENVIRONMENT VARIABLES FOR OPTIMAL CPU PERFORMANCE ===
# These should ideally be set before importing onnxruntime, but we set them anyway
os.environ['OMP_NUM_THREADS'] = str(intra_threads)
os.environ['OMP_WAIT_POLICY'] = 'ACTIVE' # Don't yield CPU, faster inference
os.environ['OMP_NESTED'] = '0' # Disable nested parallelism
# For Intel CPUs: compact affinity for better cache usage
os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
print(f"OpenMP configuration: threads={intra_threads}, policy=ACTIVE")
# === CREATE OPTIMIZED SESSION ===
self.session = ort.InferenceSession(
self.model_path,
sess_options=sess_options,
providers=providers,
provider_options=provider_options
)
# Get input/output info
self.input_name = self.session.get_inputs()[0].name
self.output_name = self.session.get_outputs()[0].name
# Verify opset version (should be 21 for latest optimizations)
try:
# This might not always be available, but good to check
model_meta = self.session.get_modelmeta()
print(f"Model metadata - Domain: {getattr(model_meta, 'domain', 'N/A')}")
        except Exception:
pass
provider_used = self.session.get_providers()[0]
print(f"✅ ONNX Runtime v{ort.__version__} - Optimized session created")
print(f"📈 Provider: {provider_used}")
print(f"🧵 Threading: {intra_threads} intra-op threads, sequential execution")
print(f"🚀 Optimizations: Graph=ALL, Memory=Enabled, Spinning=Enabled, Dynamic=Enabled")
def detect(self, image: np.ndarray) -> Dict:
"""
Perform detection on image with maximum optimization
Args:
image: Input image as numpy array (BGR format)
Returns:
Dictionary containing detection results
"""
if self.model_path.endswith('.onnx'):
return self._detect_onnx_optimized(image)
else:
return self._detect_pytorch(image)
def _detect_pytorch(self, image: np.ndarray) -> Dict:
"""Detection using PyTorch model"""
results = self.model(
image,
conf=self.confidence,
iou=self.iou_threshold,
device=self.device if 'cuda' in self.device and torch.cuda.is_available() else 'cpu',
verbose=False
)
# Parse results
detections = {
'boxes': [],
'confidences': [],
'classes': [],
'class_ids': []
}
if len(results) > 0 and results[0].boxes is not None:
boxes = results[0].boxes
for box in boxes:
x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
conf = float(box.conf[0].cpu().numpy())
cls_id = int(box.cls[0].cpu().numpy())
cls_name = self.classes[cls_id] if cls_id < len(self.classes) else f"class_{cls_id}"
detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
detections['confidences'].append(conf)
detections['classes'].append(cls_name)
detections['class_ids'].append(cls_id)
return detections
def _detect_onnx_optimized(self, image: np.ndarray) -> Dict:
"""Optimized ONNX detection with minimal overhead"""
# Optimized preprocessing
input_tensor, orig_height, orig_width = _preprocess_image_optimized(image)
# Run inference (optimized session handles the rest)
output = self.session.run([self.output_name], {self.input_name: input_tensor})[0]
# Optimized output parsing
detections = self._parse_yolo_output_optimized(output, orig_height, orig_width)
return detections
def _parse_yolo_output_optimized(self, output: np.ndarray, orig_height: int, orig_width: int) -> Dict:
"""Optimized YOLO output parsing for maximum performance"""
        # Output shape: [1, 4 + num_classes, 8400] (e.g. [1, 84, 8400] for 80
        # classes) -> transpose to [8400, 4 + num_classes]
        output = output[0].transpose(1, 0)
num_classes = len(self.classes)
x_factor = orig_width / 640.0
y_factor = orig_height / 640.0
# Vectorized operations for better performance
class_scores = output[:, 4:4 + num_classes]
max_confidences = np.max(class_scores, axis=1)
# Filter by confidence threshold (vectorized)
valid_indices = max_confidences > self.confidence
if not np.any(valid_indices):
return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
# Extract valid detections
valid_output = output[valid_indices]
valid_confidences = max_confidences[valid_indices]
valid_class_ids = np.argmax(class_scores[valid_indices], axis=1)
# Convert bounding boxes (vectorized)
cx = valid_output[:, 0] * x_factor
cy = valid_output[:, 1] * y_factor
w = valid_output[:, 2] * x_factor
h = valid_output[:, 3] * y_factor
x1 = cx - w / 2
y1 = cy - h / 2
x2 = cx + w / 2
y2 = cy + h / 2
# Prepare for NMS
boxes_for_nms = np.column_stack([x1, y1, w, h])
# Apply NMS
if len(boxes_for_nms) > 0:
indices = cv2.dnn.NMSBoxes(
boxes_for_nms.tolist(),
valid_confidences.tolist(),
self.confidence,
self.iou_threshold
)
if len(indices) > 0:
indices = indices.flatten()
# Final results
final_boxes = [[int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])] for i in indices]
final_confs = [float(valid_confidences[i]) for i in indices]
final_class_ids = [int(valid_class_ids[i]) for i in indices]
final_classes = [
self.classes[cls_id] if cls_id < num_classes else f"class_{cls_id}"
for cls_id in final_class_ids
]
return {
'boxes': final_boxes,
'confidences': final_confs,
'classes': final_classes,
'class_ids': final_class_ids
}
return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
def detect_batch(self, images: List[np.ndarray]) -> List[Dict]:
"""Optimized batch detection"""
if self.model_path.endswith('.onnx'):
return self._detect_batch_onnx_optimized(images)
else:
return [self.detect(img) for img in images]
    def _detect_batch_onnx_optimized(self, images: List[np.ndarray]) -> List[Dict]:
        """Chunked processing for ONNX with bounded memory use"""
        if not images:
            return []
        results = []
        # Each image still runs through single-image inference; chunking only
        # bounds how much work is queued at once, keeping memory use flat.
        batch_size = min(4, len(images))
        for i in range(0, len(images), batch_size):
            for img in images[i:i + batch_size]:
                results.append(self.detect(img))
        return results
def get_performance_info(self) -> Dict:
"""Get current performance configuration info"""
info = {
"model_path": self.model_path,
"model_type": "ONNX" if self.model_path.endswith('.onnx') else "PyTorch",
"onnx_version": ort.__version__ if hasattr(self, 'session') else None,
"confidence_threshold": self.confidence,
"iou_threshold": self.iou_threshold,
"classes": self.classes
}
if hasattr(self, 'session'):
info.update({
"providers": self.session.get_providers(),
"optimization_level": "ORT_ENABLE_ALL",
"memory_optimizations": "Enabled",
"threading_optimizations": "Enabled",
"dynamic_cost_model": "Enabled"
})
return info
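

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumes a config.yaml as read in __init__ and a test
# image at "sample.jpg"; both paths are illustrative, not shipped with this
# module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    detector = YOLOv11Detector("config.yaml")
    frame = cv2.imread("sample.jpg")  # detect() expects a BGR numpy array
    if frame is None:
        raise SystemExit("sample.jpg not found - provide your own test image")
    result = detector.detect(frame)
    for box, conf, name in zip(result['boxes'], result['confidences'], result['classes']):
        print(f"{name}: {conf:.2f} at {box}")
    print(detector.get_performance_info())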