import os
from pathlib import Path
from typing import Dict, List, Tuple

import cv2
import numpy as np
import onnxruntime as ort
import psutil
import torch
import yaml


def _get_optimal_threads():
    """Calculate the optimal thread count for the current system."""
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    # Intra-op threads are best matched to physical cores; fall back to 4
    # if the core count cannot be determined.
    intra_threads = physical_cores if physical_cores else 4
    print(f"System info: {physical_cores} physical cores, {logical_cores} logical cores")
    print(f"Using {intra_threads} intra-op threads for optimal performance")
    return intra_threads


def _preprocess_image_optimized(image: np.ndarray) -> Tuple[np.ndarray, int, int]:
    """Optimized preprocessing for minimal overhead.

    Returns the (1, 3, 640, 640) float32 input tensor plus the original
    image height and width, which are needed later to rescale boxes.
    """
    height, width = image.shape[:2]
    # Resize with optimal interpolation
    img_resized = cv2.resize(image, (640, 640), interpolation=cv2.INTER_LINEAR)
    # BGR -> RGB conversion
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
    # Normalize and transpose to CHW, avoiding an extra copy where possible
    img_normalized = img_rgb.astype(np.float32, copy=False) / 255.0
    img_transposed = np.transpose(img_normalized, (2, 0, 1))
    img_batch = np.expand_dims(img_transposed, axis=0)
    return img_batch, height, width


class YOLOv11Detector:
    """YOLOv11 detector optimized for ONNX Runtime v1.19 with opset 21"""

    def __init__(self, config_path: str = "config.yaml"):
        """Initialize YOLOv11 detector with maximum ONNX Runtime optimizations"""
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)
        model_path = self.config['model']['path']
        # If the configured model is missing, fall back to whichever model
        # file is available in the models/ directory.
        if not Path(model_path).exists():
            model_dir = Path("models")
            if (model_dir / "best.pt").exists():
                model_path = str(model_dir / "best.pt")
                print("Using best.pt model from training")
            elif (model_dir / "last.pt").exists():
                model_path = str(model_dir / "last.pt")
                print("Using last.pt checkpoint model")
            elif (model_dir / "best.onnx").exists():
                model_path = str(model_dir / "best.onnx")
                print("Using best.onnx model")
            else:
                raise FileNotFoundError("No model files found in models/ directory!")
        self.model_path = model_path
        self.device = self.config['model']['device']
        self.confidence = self.config['model']['confidence']
        self.iou_threshold = self.config['model']['iou_threshold']
        self.classes = self.config['detection']['classes']
        # Load model based on file extension
        if self.model_path.endswith('.onnx'):
            self._load_onnx_model_optimized()  # Optimized ONNX Runtime path
        else:
            self._load_pytorch_model()  # Original PyTorch/Ultralytics path

    def _load_pytorch_model(self):
        """Load PyTorch model using Ultralytics"""
        from ultralytics import YOLO
        self.model = YOLO(self.model_path)
        # Move the model to the configured device
        if self.device == 'cuda:0' and torch.cuda.is_available():
            self.model.to('cuda')
        else:
            self.model.to('cpu')
        print(f"Loaded PyTorch model: {self.model_path} on device: {self.device}")

    def _load_onnx_model_optimized(self):
        """Load ONNX model with maximum optimizations for ORT v1.19 + opset 21"""
        intra_threads = _get_optimal_threads()
        sess_options = ort.SessionOptions()

        # === GRAPH OPTIMIZATIONS ===
        # Enable all optimizations, including layout optimizations.
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        # === THREADING OPTIMIZATIONS ===
        # Intra-op parallelism (within operators): use physical cores.
        sess_options.intra_op_num_threads = intra_threads
        # Inter-op parallelism (between operators): keep at 1, since
        # sequential execution often beats parallel for single-image inference.
        sess_options.inter_op_num_threads = 1
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        # === MEMORY OPTIMIZATIONS ===
        # Memory pattern optimization reduces allocation overhead;
        # the CPU memory arena improves memory reuse.
        sess_options.enable_mem_pattern = True
        sess_options.enable_cpu_mem_arena = True

        # === CPU PERFORMANCE OPTIMIZATIONS ===
        # Allow threads to spin while waiting for work (trades CPU for latency).
        sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1")
        sess_options.add_session_config_entry("session.inter_op.allow_spinning", "1")
        # Dynamic cost model for better load balancing (reduces latency
        # variance); the ORT docs recommend 4 as the dynamic_block_base value.
        sess_options.add_session_config_entry("session.intra_op.dynamic_block_base", "4")
        # For systems with more than 64 logical cores, use lock-free queues.
        logical_cores = psutil.cpu_count(logical=True)
        if logical_cores and logical_cores > 64:
            sess_options.add_session_config_entry("session.use_lock_free_queue", "1")
            print("Enabled lock-free queues for high-core system")
        # Disable profiling in production for best performance.
        sess_options.enable_profiling = False

        # === EXECUTION PROVIDER CONFIGURATION ===
        providers = []
        provider_options = []
        if self.device == 'cuda:0' and ort.get_device() == 'GPU':
            # CUDA EP with optimizations
            cuda_options = {
                'device_id': 0,
                'arena_extend_strategy': 'kNextPowerOfTwo',  # Better memory allocation
                'gpu_mem_limit': 2 * 1024 * 1024 * 1024,     # 2 GB limit
                'cudnn_conv_algo_search': 'EXHAUSTIVE',      # Search for the best conv algorithms
                'do_copy_in_default_stream': True            # Better stream utilization
            }
            providers.append('CUDAExecutionProvider')
            provider_options.append(cuda_options)
            print("CUDA EP configured with optimizations")
        # CPU EP is always appended as a fallback.
        cpu_options = {
            'use_arena': True,
            'arena_extend_strategy': 'kSameAsRequested'
        }
        providers.append('CPUExecutionProvider')
        provider_options.append(cpu_options)

        # === OPENMP ENVIRONMENT VARIABLES FOR CPU PERFORMANCE ===
        # These only take effect if set before onnxruntime initializes its
        # thread pools, so ideally export them before the process starts;
        # they are set here as a best effort.
        os.environ['OMP_NUM_THREADS'] = str(intra_threads)
        os.environ['OMP_WAIT_POLICY'] = 'ACTIVE'  # Keep threads spinning instead of yielding
        os.environ['OMP_NESTED'] = '0'            # Disable nested parallelism
        # On Intel CPUs, compact affinity improves cache usage.
        os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
        print(f"OpenMP configuration: threads={intra_threads}, policy=ACTIVE")

        # === CREATE OPTIMIZED SESSION ===
        self.session = ort.InferenceSession(
            self.model_path,
            sess_options=sess_options,
            providers=providers,
            provider_options=provider_options
        )
        # Cache input/output names for inference calls
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name
        # Report model metadata when available (opset 21 models benefit
        # most from the optimizations above).
        try:
            model_meta = self.session.get_modelmeta()
            print(f"Model metadata - Domain: {getattr(model_meta, 'domain', 'N/A')}")
        except Exception:
            pass
        provider_used = self.session.get_providers()[0]
        print(f"ONNX Runtime v{ort.__version__} - Optimized session created")
        print(f"Provider: {provider_used}")
        print(f"Threading: {intra_threads} intra-op threads, sequential execution")
        print("Optimizations: Graph=ALL, Memory=Enabled, Spinning=Enabled, Dynamic=Enabled")

    def detect(self, image: np.ndarray) -> Dict:
        """
        Perform detection on an image with maximum optimization.

        Args:
            image: Input image as a numpy array (BGR format)

        Returns:
            Dictionary containing detection results
        """
        if self.model_path.endswith('.onnx'):
            return self._detect_onnx_optimized(image)
        else:
            return self._detect_pytorch(image)

    def _detect_pytorch(self, image: np.ndarray) -> Dict:
        """Detection using PyTorch model"""
        results = self.model(
            image,
            conf=self.confidence,
            iou=self.iou_threshold,
            device=self.device if 'cuda' in self.device and torch.cuda.is_available() else 'cpu',
            verbose=False
        )
        # Parse results into a plain dictionary
        detections = {
            'boxes': [],
            'confidences': [],
            'classes': [],
            'class_ids': []
        }
        if len(results) > 0 and results[0].boxes is not None:
            boxes = results[0].boxes
            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                conf = float(box.conf[0].cpu().numpy())
                cls_id = int(box.cls[0].cpu().numpy())
                cls_name = self.classes[cls_id] if cls_id < len(self.classes) else f"class_{cls_id}"
                detections['boxes'].append([int(x1), int(y1), int(x2), int(y2)])
                detections['confidences'].append(conf)
                detections['classes'].append(cls_name)
                detections['class_ids'].append(cls_id)
        return detections

    def _detect_onnx_optimized(self, image: np.ndarray) -> Dict:
        """Optimized ONNX detection with minimal overhead"""
        # Preprocess, run the optimized session, then parse the raw output
        input_tensor, orig_height, orig_width = _preprocess_image_optimized(image)
        output = self.session.run([self.output_name], {self.input_name: input_tensor})[0]
        return self._parse_yolo_output_optimized(output, orig_height, orig_width)

    def _parse_yolo_output_optimized(self, output: np.ndarray, orig_height: int, orig_width: int) -> Dict:
        """Optimized YOLO output parsing for maximum performance"""
        # Output shape: [1, 84, 8400] -> transpose to [8400, 84]
        output = output[0].transpose(1, 0)
        num_classes = len(self.classes)
        x_factor = orig_width / 640.0
        y_factor = orig_height / 640.0
        # Vectorized confidence computation
        class_scores = output[:, 4:4 + num_classes]
        max_confidences = np.max(class_scores, axis=1)
        # Filter by confidence threshold (vectorized)
        valid_indices = max_confidences > self.confidence
        if not np.any(valid_indices):
            return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
        # Extract valid detections
        valid_output = output[valid_indices]
        valid_confidences = max_confidences[valid_indices]
        valid_class_ids = np.argmax(class_scores[valid_indices], axis=1)
        # Convert center-based boxes to corner coordinates, rescaled to the
        # original image size (vectorized)
        cx = valid_output[:, 0] * x_factor
        cy = valid_output[:, 1] * y_factor
        w = valid_output[:, 2] * x_factor
        h = valid_output[:, 3] * y_factor
        x1 = cx - w / 2
        y1 = cy - h / 2
        x2 = cx + w / 2
        y2 = cy + h / 2
        # NMS expects boxes in [x, y, w, h] format
        boxes_for_nms = np.column_stack([x1, y1, w, h])
        if len(boxes_for_nms) > 0:
            indices = cv2.dnn.NMSBoxes(
                boxes_for_nms.tolist(),
                valid_confidences.tolist(),
                self.confidence,
                self.iou_threshold
            )
            if len(indices) > 0:
                # NMSBoxes may return a nested array depending on the
                # OpenCV version; flatten defensively.
                indices = np.array(indices).flatten()
                final_boxes = [[int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])] for i in indices]
                final_confs = [float(valid_confidences[i]) for i in indices]
                final_class_ids = [int(valid_class_ids[i]) for i in indices]
                final_classes = [
                    self.classes[cls_id] if cls_id < num_classes else f"class_{cls_id}"
                    for cls_id in final_class_ids
                ]
                return {
                    'boxes': final_boxes,
                    'confidences': final_confs,
                    'classes': final_classes,
                    'class_ids': final_class_ids
                }
        return {'boxes': [], 'confidences': [], 'classes': [], 'class_ids': []}
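
    # Worked example of the rescaling above (illustrative numbers only):
    # for a 1280x720 source image, x_factor = 1280 / 640 = 2.0 and
    # y_factor = 720 / 640 = 1.125, so a detection centered at cx = 320 in
    # 640-space maps back to 320 * 2.0 = 640 px horizontally. Note that the
    # plain resize in preprocessing does not preserve aspect ratio; boxes
    # are stretched back by the same per-axis factors, so the mapping stays
    # consistent end to end.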

    def detect_batch(self, images: List[np.ndarray]) -> List[Dict]:
        """Run detection over a list of images"""
        if self.model_path.endswith('.onnx'):
            return self._detect_batch_onnx_optimized(images)
        else:
            return [self.detect(img) for img in images]

    def _detect_batch_onnx_optimized(self, images: List[np.ndarray]) -> List[Dict]:
        """Chunked ONNX processing with bounded memory use.

        Images are still inferred one at a time (the session takes a
        single-image tensor); chunking merely bounds peak memory when long
        lists are passed in.
        """
        results = []
        batch_size = min(4, len(images))  # Limit chunk size for memory efficiency
        for i in range(0, len(images), batch_size):
            for img in images[i:i + batch_size]:
                results.append(self.detect(img))
        return results

    def get_performance_info(self) -> Dict:
        """Get current performance configuration info"""
        info = {
            "model_path": self.model_path,
            "model_type": "ONNX" if self.model_path.endswith('.onnx') else "PyTorch",
            "onnx_version": ort.__version__ if hasattr(self, 'session') else None,
            "confidence_threshold": self.confidence,
            "iou_threshold": self.iou_threshold,
            "classes": self.classes
        }
        if hasattr(self, 'session'):
            info.update({
                "providers": self.session.get_providers(),
                "optimization_level": "ORT_ENABLE_ALL",
                "memory_optimizations": "Enabled",
                "threading_optimizations": "Enabled",
                "dynamic_cost_model": "Enabled"
            })
        return info
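

if __name__ == "__main__":
    # Minimal usage sketch. The config path and image path below are
    # placeholders, not files shipped with this module -- substitute your own.
    detector = YOLOv11Detector(config_path="config.yaml")  # placeholder config path
    sample = cv2.imread("sample.jpg")                      # placeholder test image
    if sample is not None:
        result = detector.detect(sample)
        print(f"Detected {len(result['boxes'])} objects: {result['classes']}")
        print(detector.get_performance_info())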