# import onnxruntime
import numpy as np
import cv2
from typing import Tuple, List, Union

from .base_onnx import BaseONNX


class COMMON_DETECTION_ONNX(BaseONNX):
    """Generic ONNX object detector: letterbox preprocessing, score filter, NMS.

    The model is expected to return two outputs: a ``(1, N, 5)`` array of
    ``x1, y1, x2, y2, score`` rows and a ``(1, N)`` array of label indices
    (this is how ``run_inference`` slices them).
    """

    def __init__(self,
                 model_path,
                 labels: List[str],
                 # Network input size.
                 input_size=(640, 640),  # (w, h)
                 iou_threshold: float = 0.5,
                 score_threshold: float = 0.2,
                 ):
        """Create the detector.

        Args:
            model_path: Path of the ONNX model, forwarded to ``BaseONNX``.
            labels: Class names; a label index returned by the model is used
                as an index into this list.
            input_size: Network input size as ``(w, h)``, forwarded to
                ``BaseONNX``.
            iou_threshold: Boxes overlapping a kept box by more than this IoU
                are suppressed.
            score_threshold: Detections scoring below this are dropped.
        """
        super().__init__(model_path, input_size)

        self.labels = labels
        # One random color per label. The components are converted to plain
        # Python ints because OpenCV drawing functions can reject NumPy
        # integer scalars as a color.
        self.label_colors = [
            tuple(int(channel) for channel in np.random.randint(0, 255, size=3))
            for _ in labels
        ]

        self.iou_threshold = iou_threshold
        self.score_threshold = score_threshold

    def preprocess_image(self, image: np.ndarray, to_rgb: bool = True
                         ) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """Letterbox, normalize and batch an image for the network.

        The image is resized with its aspect ratio preserved so that it fits
        inside ``self.input_size``, padded on the bottom/right with gray
        (114), normalized per channel and converted to ``(1, C, H, W)``.

        Args:
            image: ``(H, W, 3)`` image array.
            to_rgb: When true, the first and last channels are swapped with
                ``cv2.COLOR_BGR2RGB`` before any other step.
                NOTE(review): the mean/std below are listed in BGR order
                (matching the mmdet pipeline per the original comment), so
                the swap is only appropriate when the caller's image is RGB;
                confirm against callers.

        Returns:
            ``(input_tensor, scale, ori_shape)`` where ``scale`` is the resize
            factor that was applied and ``ori_shape`` is the input ``(H, W)``.
        """
        if to_rgb:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # input_size is documented as (w, h); unpack it in that order so that
        # non-square network inputs are scaled and padded correctly.
        target_w, target_h = self.input_size
        ori_shape = image.shape[:2]
        h, w = ori_shape

        # 1. Resize with keep_ratio=True.
        scale = min(target_h / h, target_w / w)
        new_h, new_w = int(h * scale), int(w * scale)
        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # 2. Pad to the network input size. Padding goes only on the
        #    bottom/right, so box coordinates need no offset when mapped back.
        padded = cv2.copyMakeBorder(
            resized,
            0, target_h - new_h,
            0, target_w - new_w,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        # 3. Normalize (BGR format, matching mmdet pipeline).
        mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
        std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
        normalized = (padded.astype(np.float32) - mean) / std

        # 4. Convert to (C, H, W) and add the batch dimension.
        input_tensor = normalized.transpose(2, 0, 1)[np.newaxis, ...]

        return input_tensor, scale, ori_shape

    def post_bbox(self, boxes, origin_shape, scale):
        """Map boxes from network-input space back to the original image.

        This is the inverse of the resize done in ``preprocess_image``.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``; ``None`` or an
                empty array is returned unchanged.
            origin_shape: ``(H, W)`` of the original image.
            scale: Resize factor returned by ``preprocess_image``. When
                ``None`` it is recomputed from ``origin_shape`` and
                ``self.input_size``.

        Returns:
            A new ``(N, 4)`` array of ``x1, y1, x2, y2`` clipped to the
            original image bounds. The input array is not modified.
        """
        if boxes is None or len(boxes) == 0:
            return boxes

        h, w = origin_shape

        if scale is None:
            # Recompute the letterbox factor from the configured input size
            # (w, h) rather than assuming a fixed 640x640 network input.
            target_w, target_h = self.input_size
            scale = min(target_h / h, target_w / w)

        # Out-of-place division: leaves the caller's array untouched and also
        # works when the boxes have an integer dtype.
        boxes = boxes / scale

        # Clip to the original image bounds.
        boxes[:, 0] = np.clip(boxes[:, 0], 0, w)  # x1
        boxes[:, 1] = np.clip(boxes[:, 1], 0, h)  # y1
        boxes[:, 2] = np.clip(boxes[:, 2], 0, w)  # x2
        boxes[:, 3] = np.clip(boxes[:, 3], 0, h)  # y2

        return boxes

    def filter_results(self,
                       boxes: np.ndarray,
                       scores: np.ndarray,
                       labels: np.ndarray,
                       iou_threshold: float,
                       score_threshold: float
                       ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Drop low-score detections, then apply non-maximum suppression.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence scores.
            labels: ``(N,)`` label indices.
            iou_threshold: IoU threshold passed to ``nms``.
            score_threshold: Minimum score (inclusive) for a detection.

        Returns:
            ``(boxes, scores, labels)`` of the surviving detections, ordered
            by descending score.
        """
        # 1. Keep only detections whose score reaches score_threshold.
        mask_score = scores >= score_threshold
        target_boxes = boxes[mask_score]
        target_scores = scores[mask_score]
        target_labels = labels[mask_score]

        # 2. Suppress boxes that overlap a higher-scoring box too much.
        #    NMS is applied across all labels together.
        keep = self.nms(target_boxes, target_scores, iou_threshold)

        return target_boxes[keep], target_scores[keep], target_labels[keep]

    def nms(self, boxes: np.ndarray, scores: np.ndarray, iou_threshold: float) -> np.ndarray:
        """Greedy non-maximum suppression.

        Among boxes whose IoU exceeds ``iou_threshold``, only the one with the
        highest score is kept.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence scores.
            iou_threshold: A box is suppressed when its IoU with an already
                kept box is strictly greater than this value.

        Returns:
            ``int32`` array of kept indices into ``boxes``, ordered by
            descending score.
        """
        if len(boxes) == 0:
            return np.array([], dtype=np.int32)

        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        # The "+ 1" treats coordinates as inclusive pixel indices.
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)

        # Candidate indices sorted by score, highest first.
        order = np.argsort(scores)[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            rest = order[1:]

            # Intersection of the current box with every remaining candidate.
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h

            iou = inter / (areas[i] + areas[rest] - inter)

            # Continue only with candidates that do not overlap too much.
            order = rest[iou <= iou_threshold]

        return np.array(keep, dtype=np.int32)

    def run_inference(self, image: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the ONNX session on a preprocessed input tensor.

        Args:
            image: Input tensor as produced by ``preprocess_image``.

        Returns:
            boxes: ``(N, 4)`` ``x1, y1, x2, y2`` in network-input space.
            scores: ``(N,)``
            labels: ``(N,)``
        """
        ort_outs = self.session.run(None, {self.input_name: image})

        # First output packs box coordinates and score per detection; the
        # second output holds the label indices. Only batch item 0 is read.
        boxes_scores, labels = ort_outs[0], ort_outs[1]

        boxes = boxes_scores[0, :, :4]
        scores = boxes_scores[0, :, 4]
        labels = labels[0]

        return boxes, scores, labels

    def pred(self, image: Union[np.ndarray, str], to_rgb: bool = False
             ) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Detect objects in an image.

        Args:
            image: Image array, or a path that is read with ``cv2.imread``.
            to_rgb: Forwarded to ``preprocess_image``.

        Returns:
            ``(boxes, scores, label_names)``: boxes are ``(N, 4)``
            ``x1, y1, x2, y2`` in original-image coordinates, ``scores`` is
            ``(N,)`` and ``label_names`` holds the matching entries of
            ``self.labels``.

        Raises:
            ValueError: If ``image`` is a path that cannot be read.
        """
        if isinstance(image, str):
            img = cv2.imread(image)
            if img is None:
                # cv2.imread signals failure by returning None; fail here
                # with the path instead of with an obscure error later on.
                raise ValueError(f"Failed to read image: {image}")
        else:
            img = image.copy()

        input_tensor, scale, ori_shape = self.preprocess_image(img, to_rgb)

        boxes, scores, labels = self.run_inference(input_tensor)

        # Score filter + NMS.
        filtered_boxes, filtered_scores, filtered_labels = self.filter_results(
            boxes, scores, labels, self.iou_threshold, self.score_threshold)

        # Back to original-image coordinates.
        origin_boxes = self.post_bbox(filtered_boxes, ori_shape, scale)

        label_names = [self.labels[label] for label in filtered_labels]

        return origin_boxes, filtered_scores, label_names

    def draw_pred(self,
                  image: np.ndarray,
                  boxes: np.ndarray,
                  scores: np.ndarray,
                  labels: List[str]) -> np.ndarray:
        """Draw predicted boxes and ``"<label> <score>"`` captions on an image.

        The image is modified in place and also returned.

        Args:
            image: Image to draw on.
            boxes: ``(N, 4)`` ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence scores.
            labels: Label names, each of which must be present in
                ``self.labels``.

        Returns:
            The same ``image`` object, with the annotations drawn.
        """
        # Each label has its own color.
        colors = self.label_colors

        for box, score, label in zip(boxes, scores, labels):
            x1, y1, x2, y2 = (int(v) for v in box)
            color = colors[self.labels.index(label)]

            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
            cv2.putText(image, f"{label} {score:.2f}", (x1, y1),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        return image