Spaces:

yolo12138
/

Poker_Detection_hf

Sleeping

File size: 8,156 Bytes

efafe9b

# import onnxruntime
import numpy as np
import cv2

from typing import Tuple, List, Union
from .base_onnx import BaseONNX

class COMMON_DETECTION_ONNX(BaseONNX):

    def __init__(self, 
                 model_path,
                 labels: List[str],
                 # input image size
                 input_size=(640, 640), # (w, h)
                 iou_threshold: float = 0.5,
                 score_threshold: float = 0.2,
                 ):
        """Set up the detector and its post-processing thresholds.

        Args:
            model_path: Forwarded unchanged to ``BaseONNX.__init__``.
            labels: Class names; a predicted class index is used to index
                into this list.
            input_size: Network input size, documented as ``(w, h)``.
                Forwarded unchanged to ``BaseONNX.__init__``.
            iou_threshold: Default IoU threshold handed to NMS by ``pred``.
            score_threshold: Default minimum score used by ``pred``.
        """
        super().__init__(model_path, input_size)

        self.iou_threshold = iou_threshold
        self.score_threshold = score_threshold

        self.labels = labels
        # One random colour triple per label, drawn once at construction so
        # a label keeps its colour for the lifetime of the instance.
        # np.random.randint excludes the upper bound, so channels are 0..254.
        self.label_colors = [
            (
                np.random.randint(0, 255),
                np.random.randint(0, 255),
                np.random.randint(0, 255),
            )
            for _ in range(len(labels))
        ]

    def preprocess_image(self, image: np.ndarray, to_rgb: bool = True) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """Letterbox and normalize an image into a network input tensor.

        The image is resized with its aspect ratio preserved so that it fits
        inside ``self.input_size``, padded on the bottom/right with gray
        (114) up to the full input size, normalized per channel, and laid
        out as ``(1, C, H, W)`` float32.

        Args:
            image: ``(H, W, C)`` image array.
            to_rgb: When True the image is first converted with
                ``cv2.COLOR_BGR2RGB``.

        Returns:
            A tuple ``(input_tensor, scale, ori_shape)`` where ``scale`` is
            the resize factor that was applied and ``ori_shape`` is the
            ``(H, W)`` of the input image. Because padding is only added on
            the bottom/right, dividing network-space coordinates by
            ``scale`` maps them back to the original image.
        """
        if to_rgb:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # input_size is documented as (w, h) in __init__, so unpack it in
        # that order. (Indexing it as (h, w) only works for square inputs.)
        target_w, target_h = self.input_size
        h, w = image.shape[:2]
        ori_shape = (h, w)

        # 1. Resize with the aspect ratio kept.
        scale = min(target_h / h, target_w / w)
        # int() truncates; clamp to 1 so an extreme aspect ratio cannot ask
        # cv2.resize for a zero-sized output.
        new_h = max(1, int(h * scale))
        new_w = max(1, int(w * scale))
        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # 2. Pad bottom/right up to the network input size.
        padded = cv2.copyMakeBorder(
            resized, 0, target_h - new_h, 0, target_w - new_w,
            cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        # 3. Normalize per channel. The original author notes these values
        #    are in BGR order, matching the mmdet pipeline.
        # NOTE(review): the constants are applied by channel position
        # regardless of to_rgb; with to_rgb=True channel 0 is R, so confirm
        # this is what the exported model expects.
        mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
        std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
        normalized = (padded.astype(np.float32) - mean) / std

        # 4. (H, W, C) -> (C, H, W), then add the batch dimension.
        input_tensor = normalized.transpose(2, 0, 1)[np.newaxis, ...]

        return input_tensor, scale, ori_shape
    
    def post_bbox(self, boxes, origin_shape, scale):
        """Map boxes from network-input space back to the original image.

        This undoes the resize done in ``preprocess_image`` (padding there
        is bottom/right only, so no offset is needed) and clips the result
        to the original image bounds.

        Args:
            boxes: ``(N, 4)`` array of ``x1, y1, x2, y2``. ``None`` or an
                empty array is returned unchanged.
            origin_shape: ``(H, W)`` of the original image.
            scale: Resize factor returned by ``preprocess_image``. When
                ``None`` it is recomputed from ``origin_shape`` and
                ``self.input_size`` (documented as ``(w, h)``).

        Returns:
            A new ``(N, 4)`` floating-point array; the input is not modified.
        """
        if boxes is None or len(boxes) == 0:
            return boxes

        # Always work on a floating-point copy: in-place true division on an
        # integer array raises, and the caller's array must stay untouched.
        boxes = np.asarray(boxes)
        if np.issubdtype(boxes.dtype, np.floating):
            boxes = boxes.copy()
        else:
            boxes = boxes.astype(np.float32)

        h, w = origin_shape

        # No scale supplied: derive it from the configured input size rather
        # than assuming a fixed 640x640 input.
        if scale is None:
            target_w, target_h = self.input_size
            scale = min(target_h / h, target_w / w)

        # Resized-image coordinates -> original-image coordinates.
        boxes /= scale

        # Clip to the original image bounds.
        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, w)  # x1, x2
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, h)  # y1, y2

        return boxes


    def filter_results(self, boxes: np.ndarray, scores: np.ndarray, labels: np.ndarray, iou_threshold: float, score_threshold: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Filter the boxes based on the iou_threshold and score_threshold.
        """
        mask_score = scores >= score_threshold
        
        
        # 1. 过滤掉 score 小于 score_threshold 的 boxes
        target_boxes = boxes[mask_score]
        target_scores = scores[mask_score]
        target_labels = labels[mask_score]
        
        # 2. 过滤掉 iou 小于 iou_threshold 的 boxes
        mask_iou = self.nms(target_boxes, target_scores, iou_threshold)
        
        target_boxes = target_boxes[mask_iou]
        target_scores = target_scores[mask_iou]
        target_labels = target_labels[mask_iou]
        
        return target_boxes, target_scores, target_labels

    def nms(self, boxes: np.ndarray, scores: np.ndarray, iou_threshold: float) -> np.ndarray:
        """Greedy non-maximum suppression.

        Boxes are visited from highest to lowest score. Each visited box is
        kept, and every remaining box whose IoU with it exceeds
        ``iou_threshold`` is dropped.

        Args:
            boxes: ``(N, 4)`` boxes as ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence per box.
            iou_threshold: Boxes with IoU strictly above this are suppressed.

        Returns:
            int32 array of kept indices into ``boxes``, highest score first.
        """
        if len(boxes) == 0:
            return np.array([], dtype=np.int32)

        left, top = boxes[:, 0], boxes[:, 1]
        right, bottom = boxes[:, 2], boxes[:, 3]

        # Widths and heights use the pixel-inclusive "+1" convention.
        box_areas = (right - left + 1) * (bottom - top + 1)

        # Candidate indices, best score first.
        remaining = np.argsort(scores)[::-1]

        selected = []
        while remaining.size > 0:
            best, rest = remaining[0], remaining[1:]
            selected.append(best)

            # Intersection of the current best box with every other candidate.
            ix1 = np.maximum(left[best], left[rest])
            iy1 = np.maximum(top[best], top[rest])
            ix2 = np.minimum(right[best], right[rest])
            iy2 = np.minimum(bottom[best], bottom[rest])
            overlap = np.maximum(0.0, ix2 - ix1 + 1) * np.maximum(0.0, iy2 - iy1 + 1)

            iou = overlap / (box_areas[best] + box_areas[rest] - overlap)

            # Only candidates that do not overlap too much go to the next round.
            remaining = rest[iou <= iou_threshold]

        return np.array(selected, dtype=np.int32)

    def run_inference(self, image: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the ONNX session on a preprocessed tensor.

        Only the first item of the batch is read from the outputs.

        Args:
            image (np.ndarray): Input tensor fed to the session under
                ``self.input_name``.

        Returns:
            boxes: (N, 4) x1, y1, x2, y2
            scores: (N,)
            labels: (N,)
        """
        outputs = self.session.run(None, {self.input_name: image})

        # The indexing below treats output 0 as (batch, N, >=5) with the box
        # in columns 0-3 and the score in column 4, and output 1 as
        # (batch, N) class indices — presumably the exported detector's
        # "dets"/"labels" pair; verify against the model.
        detections = outputs[0][0]
        class_ids = outputs[1][0]

        return detections[:, :4], detections[:, 4], class_ids

    def pred(self, image: Union[np.ndarray, str], to_rgb: bool = False) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Detect objects in an image and return them in original-image coordinates.

        Args:
            image: Either an image array or a path that is loaded with
                ``cv2.imread``.
            to_rgb: Forwarded to ``preprocess_image``.

        Returns:
            A tuple ``(boxes, scores, label_names)``:
            ``boxes`` is ``(N, 4)`` ``x1, y1, x2, y2`` in the coordinates of
            the original image, ``scores`` is ``(N,)``, and ``label_names``
            is a list of ``N`` entries taken from ``self.labels``.

        Raises:
            ValueError: If ``image`` is a path that ``cv2.imread`` cannot
                read or decode.
        """
        if isinstance(image, str):
            img = cv2.imread(image)
            # cv2.imread signals failure by returning None instead of
            # raising; fail here with a clear message rather than crashing
            # later inside preprocessing.
            if img is None:
                raise ValueError(f"Image could not be read or decoded: {image}")
        else:
            # preprocess_image never writes into its input (every OpenCV
            # call there returns a new array), so no defensive copy is made.
            img = image

        input_tensor, scale, ori_shape = self.preprocess_image(img, to_rgb)

        boxes, scores, labels = self.run_inference(input_tensor)

        # Score threshold + NMS, still in network-input coordinates.
        filtered_boxes, filtered_scores, filtered_labels = self.filter_results(
            boxes, scores, labels, self.iou_threshold, self.score_threshold
        )

        # Back to original-image coordinates.
        origin_boxes = self.post_bbox(filtered_boxes, ori_shape, scale)

        # Class indices -> class names; int() so any numeric label dtype
        # coming out of the model can index the list.
        label_names = [self.labels[int(label)] for label in filtered_labels]

        return origin_boxes, filtered_scores, label_names
    
    def draw_pred(self, image: np.ndarray, boxes: np.ndarray, scores: np.ndarray, labels: List[str]) -> np.ndarray:
        """Draw boxes and ``"<label> <score>"`` captions onto ``image``.

        The drawing is done in place on ``image``, which is also returned.

        Args:
            image: Image to draw on.
            boxes: ``(N, 4)`` boxes as ``x1, y1, x2, y2``.
            scores: ``(N,)`` confidence per box.
            labels: ``N`` label names; each must be present in
                ``self.labels`` (``list.index`` raises ``ValueError``
                otherwise).

        Returns:
            The same ``image`` object, with the detections drawn.
        """
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.5
        thickness = 2

        for box, score, label in zip(boxes, scores, labels):
            x1, y1, x2, y2 = (int(v) for v in box)

            # Each label has a fixed colour chosen in __init__.
            color = self.label_colors[self.labels.index(label)]

            cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

            caption = f"{label} {score:.2f}"
            # putText anchors the text at its bottom-left corner, so a box
            # touching the top edge would push the caption off the image.
            # In that case draw the caption just inside the box instead.
            (_, text_h), baseline = cv2.getTextSize(caption, font, font_scale, thickness)
            text_y = y1 if y1 >= text_h else y1 + text_h + baseline
            cv2.putText(image, caption, (x1, text_y), font, font_scale, color, thickness)

        return image