import os
from pathlib import Path
from typing import Dict, List, Tuple

import cv2
import numpy as np
import onnxruntime as ort
import psutil
import torch
import yaml

def _get_optimal_threads():
    """Calculate the optimal thread count for the current system."""
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    # For high-performance inference, intra-op threads should match physical cores;
    # fall back to 4 if the core count cannot be determined
    intra_threads = physical_cores if physical_cores else 4
    print(f"System info: {physical_cores} physical cores, {logical_cores} logical cores")
    print(f"Using {intra_threads} intra-op threads for optimal performance")
    return intra_threads

def _preprocess_image_optimized(image: np.ndarray) -> Tuple[np.ndarray, int, int]:
    """Optimized preprocessing with minimal overhead.

    Returns the NCHW float32 batch plus the original height and width.
    """
    height, width = image.shape[:2]
    # Resize with fast linear interpolation
    img_resized = cv2.resize(image, (640, 640), interpolation=cv2.INTER_LINEAR)
    # BGR -> RGB conversion
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
    # Normalize without an extra copy, then reorder HWC -> CHW
    img_normalized = img_rgb.astype(np.float32, copy=False) / 255.0
    img_transposed = np.transpose(img_normalized, (2, 0, 1))
    img_batch = np.expand_dims(img_transposed, axis=0)
    return img_batch, height, width
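
# Quick shape check for the preprocessor (illustrative; any HxWx3 uint8 BGR
# array works):
#
#   batch, h, w = _preprocess_image_optimized(np.zeros((480, 640, 3), np.uint8))
#   assert batch.shape == (1, 3, 640, 640) and (h, w) == (480, 640)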

class YOLOv11Detector:
    """YOLOv11 detector optimized for ONNX Runtime v1.19 with opset 21."""

    def __init__(self, config_path: str = "config.yaml"):
        """Initialize the YOLOv11 detector with maximum ONNX Runtime optimizations."""
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)
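        # Expected config.yaml layout (illustrative values; the key names are
        # exactly the ones read below):
        #
        #   model:
        #     path: models/best.onnx
        #     device: cpu          # or cuda:0
        #     confidence: 0.25
        #     iou_threshold: 0.45
        #   detection:
        #     classes: [person, car]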
        model_path = self.config['model']['path']
        # Fall back to whichever model file is actually present
        if not Path(model_path).exists():
            model_dir = Path("models")
            if (model_dir / "best.pt").exists():
                model_path = str(model_dir / "best.pt")
                print("Using best.pt model from training")
            elif (model_dir / "last.pt").exists():
                model_path = str(model_dir / "last.pt")
                print("Using last.pt checkpoint model")
            elif (model_dir / "best.onnx").exists():
                model_path = str(model_dir / "best.onnx")
                print("Using best.onnx model")
            else:
                raise FileNotFoundError("No model files found in models/ directory!")
        self.model_path = model_path
        self.device = self.config['model']['device']
        self.confidence = self.config['model']['confidence']
        self.iou_threshold = self.config['model']['iou_threshold']
        self.classes = self.config['detection']['classes']
        # Load the model based on its file extension
        if self.model_path.endswith('.onnx'):
            self._load_onnx_model_optimized()  # full ONNX optimization path
        else:
            self._load_pytorch_model()  # original PyTorch (Ultralytics) path

    def _load_pytorch_model(self):
        """Load a PyTorch model via Ultralytics."""
        from ultralytics import YOLO
        self.model = YOLO(self.model_path)
        # Move the model to the configured device
        if self.device == 'cuda:0' and torch.cuda.is_available():
            self.model.to('cuda')
        else:
            self.model.to('cpu')
        print(f"Loaded PyTorch model: {self.model_path} on device: {self.device}")

    def _load_onnx_model_optimized(self):
        """Load an ONNX model with maximum optimizations for v1.19 + opset 21."""
        # Get the optimal thread count
        intra_threads = _get_optimal_threads()

        # Configure maximum-performance session options
        sess_options = ort.SessionOptions()

        # === GRAPH OPTIMIZATIONS (Level: ALL) ===
        # Enable all optimizations, including layout optimizations
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        # === THREADING OPTIMIZATIONS ===
        # Intra-op parallelism (within operators): use physical cores
        sess_options.intra_op_num_threads = intra_threads
        # Inter-op parallelism (between operators): keep at 1, since sequential
        # execution often beats parallel execution for single-image inference
        sess_options.inter_op_num_threads = 1
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        # === MEMORY OPTIMIZATIONS ===
        # Memory pattern optimization reduces allocation overhead
        sess_options.enable_mem_pattern = True
        # The CPU memory arena improves memory reuse
        sess_options.enable_cpu_mem_arena = True

        # === CPU PERFORMANCE OPTIMIZATIONS ===
        # Allow threads to spin while waiting for work (trades CPU for latency)
        sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1")
        sess_options.add_session_config_entry("session.inter_op.allow_spinning", "1")
        # Dynamic cost model for better load balancing (reduces latency variance);
        # the ONNX Runtime docs suggest 4 as the best value for dynamic_block_base
        sess_options.add_session_config_entry("session.dynamic_block_base", "4")

        # On systems with more than 64 logical cores, use lock-free queues
        logical_cores = psutil.cpu_count(logical=True)
        if logical_cores and logical_cores > 64:
            sess_options.add_session_config_entry("session.use_lock_free_queue", "1")
            print("Enabled lock-free queues for high-core system")

        # Disable profiling in production for best performance
        sess_options.enable_profiling = False

        # === EXECUTION PROVIDER CONFIGURATION ===
        providers = []
        provider_options = []
        if self.device == 'cuda:0' and ort.get_device() == 'GPU':
            # CUDA EP with optimizations
            cuda_options = {
                'device_id': 0,
                'arena_extend_strategy': 'kNextPowerOfTwo',  # better memory allocation
                'gpu_mem_limit': 2 * 1024 * 1024 * 1024,  # 2 GB limit
                'cudnn_conv_algo_search': 'EXHAUSTIVE',  # search for the best conv algorithms
                'do_copy_in_default_stream': True  # better stream utilization
            }
            providers.append('CUDAExecutionProvider')
            provider_options.append(cuda_options)
            print("CUDA EP configured with optimizations")
        # CPU EP as the always-present fallback
        cpu_options = {
            'use_arena': True,
            'arena_extend_strategy': 'kSameAsRequested'
        }
        providers.append('CPUExecutionProvider')
        provider_options.append(cpu_options)
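        # Note (illustrative): the providers actually compiled into the installed
        # wheel can be checked up front via ort.get_available_providers(), e.g.:
        #
        #   if 'CUDAExecutionProvider' not in ort.get_available_providers():
        #       print("CUDA EP not in this build; CPU EP will be used")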
        # === OPENMP ENVIRONMENT VARIABLES FOR OPTIMAL CPU PERFORMANCE ===
        # These should ideally be set before onnxruntime is imported, but we set
        # them here anyway
        os.environ['OMP_NUM_THREADS'] = str(intra_threads)
        os.environ['OMP_WAIT_POLICY'] = 'ACTIVE'  # don't yield the CPU; faster inference
        os.environ['OMP_NESTED'] = '0'  # disable nested parallelism
        # For Intel CPUs: compact affinity for better cache usage
        os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
        print(f"OpenMP configuration: threads={intra_threads}, policy=ACTIVE")
        # === CREATE THE OPTIMIZED SESSION ===
        self.session = ort.InferenceSession(
            self.model_path,
            sess_options=sess_options,
            providers=providers,
            provider_options=provider_options
        )

        # Cache input/output names
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

        # Log model metadata when available (opset 21 enables the latest optimizations)
        try:
            model_meta = self.session.get_modelmeta()
            print(f"Model metadata - Domain: {getattr(model_meta, 'domain', 'N/A')}")
        except Exception:
            pass

        provider_used = self.session.get_providers()[0]
        print(f"✅ ONNX Runtime v{ort.__version__} - Optimized session created")
        print(f"📈 Provider: {provider_used}")
        print(f"🧵 Threading: {intra_threads} intra-op threads, sequential execution")
        print("🚀 Optimizations: Graph=ALL, Memory=Enabled, Spinning=Enabled, Dynamic=Enabled")

    def detect(self, image: np.ndarray) -> Dict:
        """
        Perform detection on an image with maximum optimization.

        Args:
            image: Input image as a numpy array (BGR format)

        Returns:
            Dictionary containing detection results
        """
        if self.model_path.endswith('.onnx'):
            return self._detect_onnx_optimized(image)
        else:
            return self._detect_pytorch(image)

    def _detect_pytorch(self, image: np.ndarray) -> Dict:
        """Detection using the PyTorch model."""
        results = self.model(
            image,
            conf=self.confidence,
            iou=self.iou_threshold,
            device=self.device if 'cuda' in self.device and torch.cuda.is_available() else 'cpu',
            verbose=False
        )
        # Parse results
        detections = {
            'boxes': [],
            'confidences': [],
            'classes': [],
            'class_ids': []
        }
        if len(results) > 0 and results[0].boxes is not None:
            boxes = results[0].boxes
            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                conf = float(box.conf[0].cpu().numpy())
                cls_id = int(box.cls[0].cpu().numpy())
                cls_name = self.classes[cls_id] if cls_id < len(self.classes) else f"class_{cls_id}"
                detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
                detections['confidences'].append(conf)
                detections['classes'].append(cls_name)
                detections['class_ids'].append(cls_id)
        return detections

    def _detect_onnx_optimized(self, image: np.ndarray) -> Dict:
        """Optimized ONNX detection with minimal overhead."""
        # Optimized preprocessing
        input_tensor, orig_height, orig_width = _preprocess_image_optimized(image)
        # Run inference (the optimized session handles the rest)
        output = self.session.run([self.output_name], {self.input_name: input_tensor})[0]
        # Optimized output parsing
        detections = self._parse_yolo_output_optimized(output, orig_height, orig_width)
        return detections
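
    # Illustrative latency check (sketch; `det` is a detector instance and
    # `frame` any BGR frame):
    #
    #   import time
    #   t0 = time.perf_counter()
    #   det.detect(frame)
    #   print(f"inference: {(time.perf_counter() - t0) * 1000:.1f} ms")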

    def _parse_yolo_output_optimized(self, output: np.ndarray, orig_height: int, orig_width: int) -> Dict:
        """Optimized YOLO output parsing for maximum performance."""
        # Output shape: [1, 4 + num_classes, 8400] (84 channels for 80 classes)
        # -> transpose to [8400, 4 + num_classes]
        output = output[0].transpose(1, 0)
        num_classes = len(self.classes)
        x_factor = orig_width / 640.0
        y_factor = orig_height / 640.0

        # Vectorized score extraction: the first 4 channels are the box, the rest are class scores
        class_scores = output[:, 4:4 + num_classes]
        max_confidences = np.max(class_scores, axis=1)

        # Filter by confidence threshold (vectorized)
        valid_indices = max_confidences > self.confidence
        if not np.any(valid_indices):
            return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}

        # Extract valid detections
        valid_output = output[valid_indices]
        valid_confidences = max_confidences[valid_indices]
        valid_class_ids = np.argmax(class_scores[valid_indices], axis=1)

        # Convert bounding boxes from center format to corners (vectorized)
        cx = valid_output[:, 0] * x_factor
        cy = valid_output[:, 1] * y_factor
        w = valid_output[:, 2] * x_factor
        h = valid_output[:, 3] * y_factor
        x1 = cx - w / 2
        y1 = cy - h / 2
        x2 = cx + w / 2
        y2 = cy + h / 2

        # cv2.dnn.NMSBoxes expects boxes as [x, y, w, h] with the top-left corner
        boxes_for_nms = np.column_stack([x1, y1, w, h])

        # Apply NMS
        if len(boxes_for_nms) > 0:
            indices = cv2.dnn.NMSBoxes(
                boxes_for_nms.tolist(),
                valid_confidences.tolist(),
                self.confidence,
                self.iou_threshold
            )
            if len(indices) > 0:
                indices = indices.flatten()
                # Final results
                final_boxes = [[int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])] for i in indices]
                final_confs = [float(valid_confidences[i]) for i in indices]
                final_class_ids = [int(valid_class_ids[i]) for i in indices]
                final_classes = [
                    self.classes[cls_id] if cls_id < num_classes else f"class_{cls_id}"
                    for cls_id in final_class_ids
                ]
                return {
                    'boxes': final_boxes,
                    'confidences': final_confs,
                    'classes': final_classes,
                    'class_ids': final_class_ids
                }
        return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}

    def detect_batch(self, images: List[np.ndarray]) -> List[Dict]:
        """Optimized batch detection."""
        if self.model_path.endswith('.onnx'):
            return self._detect_batch_onnx_optimized(images)
        else:
            return [self.detect(img) for img in images]

    def _detect_batch_onnx_optimized(self, images: List[np.ndarray]) -> List[Dict]:
        """Chunked per-image processing for ONNX with bounded memory use.

        Note: the session still runs once per image; true batched inference
        would require a model exported with a dynamic batch dimension.
        """
        if not images:
            return []
        results = []
        # Process images in small chunks to balance memory and throughput
        batch_size = min(4, len(images))
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i + batch_size]
            batch_results = [self.detect(img) for img in batch_images]
            results.extend(batch_results)
        return results

    def get_performance_info(self) -> Dict:
        """Return the current performance configuration."""
        info = {
            "model_path": self.model_path,
            "model_type": "ONNX" if self.model_path.endswith('.onnx') else "PyTorch",
            "onnx_version": ort.__version__ if hasattr(self, 'session') else None,
            "confidence_threshold": self.confidence,
            "iou_threshold": self.iou_threshold,
            "classes": self.classes
        }
        if hasattr(self, 'session'):
            info.update({
                "providers": self.session.get_providers(),
                "optimization_level": "ORT_ENABLE_ALL",
                "memory_optimizations": "Enabled",
                "threading_optimizations": "Enabled",
                "dynamic_cost_model": "Enabled"
            })
        return info
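

# Minimal usage sketch (illustrative: assumes a config.yaml shaped as documented
# in __init__ and a sample image at the hypothetical path 'test.jpg'):
if __name__ == "__main__":
    detector = YOLOv11Detector("config.yaml")
    frame = cv2.imread("test.jpg")  # hypothetical sample image
    if frame is not None:
        result = detector.detect(frame)
        for box, conf, name in zip(result['boxes'], result['confidences'], result['classes']):
            print(f"{name}: {conf:.2f} at {box}")
    print(detector.get_performance_info())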