Spaces:
Sleeping
Sleeping
# import onnxruntime | |
import numpy as np | |
import cv2 | |
from typing import Tuple, List, Union | |
from .base_onnx import BaseONNX | |
class COMMON_DETECTION_ONNX(BaseONNX):
    """Generic ONNX object detector: letterbox preprocessing, inference, NMS.

    The class reads ``self.session``, ``self.input_name`` and
    ``self.input_size``; none of them is assigned here, so they are expected
    to be provided by ``BaseONNX.__init__`` (not visible in this file —
    confirm against ``base_onnx``).

    Typical use is ``boxes, scores, names = detector.pred(image_or_path)``
    followed optionally by ``detector.draw_pred(image, boxes, scores, names)``.
    """

    def __init__(self,
                 model_path,
                 labels: List[str],
                 # network input size
                 input_size=(640, 640),  # (w, h)
                 iou_threshold: float = 0.5,
                 score_threshold: float = 0.2,
                 ):
        """Store detection settings and pick one random colour per label.

        Args:
            model_path: Forwarded unchanged to ``BaseONNX.__init__``.
            labels: Class names; a model label id ``k`` maps to ``labels[k]``.
            input_size: Network input size as ``(width, height)``.
            iou_threshold: Boxes whose IoU with a higher-scoring kept box
                exceeds this value are suppressed in :meth:`nms`.
            score_threshold: Detections scoring below this value are dropped.
        """
        super().__init__(model_path, input_size)
        self.labels = labels
        # Cast to built-in int: some OpenCV builds reject numpy integer
        # scalars as a colour argument. The three draws per label happen in
        # the same order as before, so a seeded RNG yields the same colours.
        self.label_colors = [
            tuple(int(np.random.randint(0, 255)) for _ in range(3))
            for _ in labels
        ]
        self.iou_threshold = iou_threshold
        self.score_threshold = score_threshold

    def preprocess_image(self, image: cv2.UMat, to_rgb: bool = True) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """Letterbox, normalise and batch an image for the network.

        The image is resized with its aspect ratio preserved so that it fits
        inside ``self.input_size``, then padded on the bottom/right with the
        constant value 114 up to the full input size. Because padding is only
        added at the bottom/right, box coordinates in network space map back
        to the original image by a plain division by ``scale``.

        Args:
            image: H x W x 3 image array in BGR channel order.
            to_rgb: Convert BGR to RGB before normalisation.

        Returns:
            ``(input_tensor, scale, ori_shape)`` where ``input_tensor`` is a
            float32 array of shape ``(1, 3, target_h, target_w)``, ``scale``
            is the resize factor that was applied, and ``ori_shape`` is the
            original ``(height, width)``.
        """
        if to_rgb:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # input_size is documented as (w, h); unpack it in that order so
        # non-square network inputs are scaled and padded on the right axes.
        target_w, target_h = self.input_size
        h, w = image.shape[:2]
        ori_shape = (h, w)

        # 1. Resize, keeping the aspect ratio.
        scale = min(target_h / h, target_w / w)
        # Clamp to 1 px so extreme aspect ratios never request a zero-sized
        # resize, which cv2.resize rejects.
        new_h = max(1, int(h * scale))
        new_w = max(1, int(w * scale))
        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # 2. Pad bottom/right up to the network input size.
        padded = cv2.copyMakeBorder(
            resized, 0, target_h - new_h, 0, target_w - new_w,
            cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        # 3. Normalise. The constants are listed in BGR order; when the image
        #    has been converted to RGB they are reversed so that every channel
        #    is normalised with its own statistic.
        mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
        std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
        if to_rgb:
            mean, std = mean[::-1], std[::-1]
        normalized = (padded.astype(np.float32) - mean) / std

        # 4. HWC -> CHW and add the batch dimension.
        input_tensor = normalized.transpose(2, 0, 1)[np.newaxis, ...]
        return input_tensor, scale, ori_shape

    def post_bbox(self, boxes, origin_shape, scale):
        """Map boxes from network-input space back to the original image.

        This is the inverse of the resize done in :meth:`preprocess_image`.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``; may be ``None`` or
                empty, in which case it is returned unchanged.
            origin_shape: ``(H, W)`` of the original image.
            scale: Resize factor returned by :meth:`preprocess_image`. When
                ``None`` it is recomputed from ``origin_shape`` and
                ``self.input_size``.

        Returns:
            A new ``(N, 4)`` array of ``x1, y1, x2, y2`` in original-image
            pixels, clipped to the image bounds. The input is not modified.
        """
        if boxes is None or len(boxes) == 0:
            return boxes

        h, w = origin_shape
        if scale is None:
            # Recompute the letterbox factor from the configured (w, h) input
            # size rather than assuming a fixed 640 x 640 network.
            target_w, target_h = self.input_size
            scale = min(target_h / h, target_w / w)

        # Out-of-place division: yields a fresh array (the caller's boxes are
        # untouched) and also works when the boxes have an integer dtype.
        boxes = np.asarray(boxes) / scale

        # Clip to the original image bounds.
        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, w)  # x1, x2
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, h)  # y1, y2
        return boxes

    def filter_results(self, boxes: np.ndarray, scores: np.ndarray, labels: np.ndarray, iou_threshold: float, score_threshold: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Drop low-score detections, then apply non-maximum suppression.

        Suppression only looks at boxes and scores; ``labels`` is not
        consulted, so overlapping boxes of different classes can suppress
        each other.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence scores.
            labels: ``(N,)`` label ids.
            iou_threshold: Passed to :meth:`nms`.
            score_threshold: Minimum score (inclusive) a detection must have.

        Returns:
            ``(boxes, scores, labels)`` of the surviving detections, ordered
            by descending score.
        """
        # 1. Keep detections whose score reaches score_threshold.
        score_mask = scores >= score_threshold
        boxes, scores, labels = boxes[score_mask], scores[score_mask], labels[score_mask]

        # 2. Keep the indices that survive NMS.
        keep = self.nms(boxes, scores, iou_threshold)
        return boxes[keep], scores[keep], labels[keep]

    def nms(self, boxes: np.ndarray, scores: np.ndarray, iou_threshold: float) -> np.ndarray:
        """Greedy non-maximum suppression.

        Boxes are visited from highest to lowest score; each visited box is
        kept and every remaining box whose IoU with it is greater than
        ``iou_threshold`` is discarded.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence scores.
            iou_threshold: Maximum IoU (inclusive) a box may have with a kept
                box and still survive.

        Returns:
            int32 array of indices into ``boxes``, ordered by descending
            score. Empty when ``boxes`` is empty.
        """
        if len(boxes) == 0:
            return np.array([], dtype=np.int32)

        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        # Widths and heights use the inclusive-pixel "+1" convention, both
        # here and for the intersection below.
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)

        # Candidate indices, highest score first.
        order = np.argsort(scores)[::-1]

        keep = []
        while order.size > 0:
            best = order[0]
            rest = order[1:]
            keep.append(best)

            # Intersection rectangle of the best box with every other one.
            xx1 = np.maximum(x1[best], x1[rest])
            yy1 = np.maximum(y1[best], y1[rest])
            xx2 = np.minimum(x2[best], x2[rest])
            yy2 = np.minimum(y2[best], y2[rest])

            inter_w = np.maximum(0.0, xx2 - xx1 + 1)
            inter_h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = inter_w * inter_h

            iou = inter / (areas[best] + areas[rest] - inter)

            # Continue only with the boxes that do not overlap too much.
            order = rest[iou <= iou_threshold]

        return np.array(keep, dtype=np.int32)

    def run_inference(self, image: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the ONNX session on a preprocessed input tensor.

        Args:
            image: Network input tensor, as produced by
                :meth:`preprocess_image`.

        Returns:
            boxes: (N, 4) x1, y1, x2, y2
            scores: (N,)
            labels: (N,)
        """
        ort_outs = self.session.run(None, {self.input_name: image})
        # Output 0 is read as (batch, N, 5) = x1, y1, x2, y2, score and
        # output 1 as (batch, N) label ids; only the first batch item is
        # used. NOTE(review): layout inferred from the indexing below —
        # confirm against the exported model.
        boxes_scores, labels = ort_outs[0], ort_outs[1]
        boxes = boxes_scores[0, :, :4]
        scores = boxes_scores[0, :, 4]
        labels = labels[0]
        return boxes, scores, labels

    def pred(self, image: Union[cv2.UMat, str], to_rgb: bool = False) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Detect objects in an image.

        Args:
            image: Either an image array in BGR channel order or a path to an
                image file.
            to_rgb: Forwarded to :meth:`preprocess_image`.

        Returns:
            ``(boxes, scores, label_names)``: ``boxes`` is ``(N, 4)``
            ``x1, y1, x2, y2`` in original-image pixels, ``scores`` is
            ``(N,)`` and ``label_names`` holds the ``N`` matching entries of
            ``self.labels``.

        Raises:
            ValueError: If ``image`` is a path that ``cv2.imread`` cannot
                load.
        """
        if isinstance(image, str):
            img = cv2.imread(image)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise ValueError(f"cv2.imread could not load an image from: {image!r}")
        else:
            img = image.copy()

        input_tensor, scale, ori_shape = self.preprocess_image(img, to_rgb)
        boxes, scores, labels = self.run_inference(input_tensor)

        # Score filtering + NMS.
        filtered_boxes, filtered_scores, filtered_labels = self.filter_results(
            boxes, scores, labels, self.iou_threshold, self.score_threshold
        )

        # Back to original-image coordinates.
        origin_boxes = self.post_bbox(filtered_boxes, ori_shape, scale)

        # Label ids -> label names.
        label_names = [self.labels[int(label)] for label in filtered_labels]
        return origin_boxes, filtered_scores, label_names

    def draw_pred(self, image: cv2.UMat, boxes: np.ndarray, scores: np.ndarray, labels: List[str]) -> cv2.UMat:
        """Draw boxes and ``"<label> <score>"`` captions onto ``image``.

        The image is modified in place and also returned.

        Args:
            image: Image to draw on.
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2`` in image pixels.
            scores: ``(N,)`` confidence scores.
            labels: ``N`` label names; each must be present in
                ``self.labels`` (``list.index`` raises ``ValueError``
                otherwise).

        Returns:
            The same ``image`` object, annotated.
        """
        for box, score, label in zip(boxes, scores, labels):
            x1, y1, x2, y2 = (int(v) for v in box)
            # Each label has its own colour, chosen in __init__.
            color = self.label_colors[self.labels.index(label)]
            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
            cv2.putText(image, f"{label} {score:.2f}", (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        return image