Return and show bounding box confidence
- rtmo_demo.py        +2 -2
- rtmo_demo_batch.py  +6 -4
- rtmo_gpu.py         +44 -17
rtmo_demo.py CHANGED

@@ -36,7 +36,7 @@ if __name__ == "__main__":
         if not success:
             break
 
-        frame_out, bboxes, keypoints, scores = body(frame)
+        frame_out, bboxes, bboxes_scores, keypoints, scores = body(frame)
 
         if keypoints is not None:
             if frame_idx % args.batch_size == 0 and frame_idx:
@@ -56,7 +56,7 @@ if __name__ == "__main__":
                 scores,
                 kpt_thr=0.3,
                 line_width=2)
-            img_show = draw_bbox(img_show, bboxes)
+            img_show = draw_bbox(img_show, bboxes, bboxes_scores)
             img_show = resize_to_fit_screen(img_show, 720, 480)
             cv2.putText(img_show, f'{fps:.1f}', (10, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv2.LINE_AA)
             cv2.imshow(f'{model}', img_show)
rtmo_demo_batch.py CHANGED

@@ -24,7 +24,7 @@ def process_video(video_path, body_estimator, batch_size=4):
         # Process the batch when it's full
         if len(batch_frames) == batch_size:
             s = time.time()
-            batch_bboxes, batch_keypoints, batch_scores = body_estimator.__batch_call__(batch_frames)
+            batch_bboxes, batch_bboxes_scores, batch_keypoints, batch_scores = body_estimator.__batch_call__(batch_frames)
             det_time = time.time() - s
             fps = round(batch_size / det_time, 1)
             print(f'Batch det: {fps} FPS')
@@ -33,9 +33,10 @@ def process_video(video_path, body_estimator, batch_size=4):
                 scores = batch_scores[i]
                 frame = batch_frames[i]
                 bboxes = batch_bboxes[i]
+                bboxes_scores = batch_bboxes_scores[i]
                 img_show = frame.copy()
                 img_show = draw_skeleton(img_show, keypoints, scores, kpt_thr=0.3, line_width=2)
-                img_show = draw_bbox(img_show, bboxes)
+                img_show = draw_bbox(img_show, bboxes, bboxes_scores)
                 img_show = resize_to_fit_screen(img_show, 720, 480)
                 cv2.putText(img_show, f'{fps:.1f}', (10, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv2.LINE_AA)
                 cv2.imshow(f'{video_path}', img_show)
@@ -54,14 +55,15 @@ def process_video(video_path, body_estimator, batch_size=4):
 
         # Option 2: Duplicate the last frame
         batch_frames.append(batch_frames[-1])
-        batch_bboxes, batch_keypoints, batch_scores = body_estimator.__batch_call__(batch_frames)
+        batch_bboxes, batch_bboxes_scores, batch_keypoints, batch_scores = body_estimator.__batch_call__(batch_frames)
         for i, keypoints in enumerate(batch_keypoints):
             scores = batch_scores[i]
             frame = batch_frames[i]
             bboxes = batch_bboxes[i]
+            bboxes_scores = batch_bboxes_scores[i]
             img_show = frame.copy()
             img_show = draw_skeleton(img_show, keypoints, scores, kpt_thr=0.3, line_width=2)
-            img_show = draw_bbox(img_show, bboxes)
+            img_show = draw_bbox(img_show, bboxes, bboxes_scores)
             img_show = resize_to_fit_screen(img_show, 720, 480)
             cv2.imshow(f'{video_path}', img_show)
             #cv2.waitKey(10)
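
__batch_call__ now returns four parallel lists (boxes, box scores, keypoints, keypoint scores), one entry per frame in the batch. A minimal consumption sketch, not part of the commit, where body_estimator and frames stand in for an RTMO_GPU_Batch instance and a list of batch_size BGR images:

# Hypothetical usage sketch (not part of this commit).
# `body_estimator` and `frames` are placeholders.
batch_bboxes, batch_bboxes_scores, batch_keypoints, batch_scores = \
    body_estimator.__batch_call__(frames)
for bboxes, bboxes_scores in zip(batch_bboxes, batch_bboxes_scores):
    # one confidence value per detected person in each frame
    for bbox, score in zip(bboxes, bboxes_scores):
        print(f'box {bbox.astype(int)} confidence {score:.2f}')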
rtmo_gpu.py CHANGED

@@ -207,12 +207,32 @@ def draw_mmpose(img,
 
     return img
 
-def draw_bbox(img, bboxes, color=(0, 255, 0)):
-    for bbox in bboxes:
-        img = cv2.rectangle(img, (int(bbox[0]), int(bbox[1])),
-                            (int(bbox[2]), int(bbox[3])), color, 2)
-    return img
-
+def draw_bbox(img, bboxes, bboxes_scores=None, color=None):
+    for i, bbox in enumerate(bboxes):
+        # Determine the color based on the score if no color is given
+        if color is None and bboxes_scores is not None:
+            # Scale the score to a color range (green to red)
+            score = bboxes_scores[i]
+            green = int((1 - score) * 255)
+            red = int(score * 255)
+            box_color = (0, green, red)
+        else:
+            box_color = color if color is not None else (0, 255, 0)
+
+        # Draw the bounding box
+        img = cv2.rectangle(img, (int(bbox[0]), int(bbox[1])),
+                            (int(bbox[2]), int(bbox[3])), box_color, 1)
+
+        # Display the score at the top-right corner of the bounding box
+        if bboxes_scores is not None:
+            score_text = f'{bboxes_scores[i]:.2f}'
+            text_size, _ = cv2.getTextSize(score_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+            text_x = int(bbox[2]) - text_size[0]
+            text_y = int(bbox[1]) + text_size[1]
+            img = cv2.putText(img, score_text, (text_x, text_y),
+                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 1, cv2.LINE_AA)
+    return img
+
 # with simplification to use onnxruntime only
 def draw_skeleton(img,
                   keypoints,
@@ -333,6 +353,8 @@ class RTMO_GPU(object):
         tuple:
             - final_boxes (np.ndarray): Final bounding boxes.
             - final_scores (np.ndarray): Final scores.
+            - final keypoints
+            - final keypoints scores
         """
 
         if not self.is_yolo_nas_pose:
@@ -346,6 +368,7 @@ class RTMO_GPU(object):
             isscore = final_scores > 0.3
             isbbox = [i for i in isscore]
            final_boxes = final_boxes[isbbox]
+            final_boxes_scores = final_scores[isbbox]
 
             # decode pose outputs
             keypoints, scores = pose_outputs[0, :, :, :2], pose_outputs[0, :, :, 2]
@@ -359,14 +382,15 @@ class RTMO_GPU(object):
             if flat_predictions.shape[0] > 0: # at least one person found
                 mask = flat_predictions[:, 0] == 0
                 final_boxes = flat_predictions[mask, 1:5]
+                final_boxes_scores = flat_predictions[mask, 5]
                 pred_joints = flat_predictions[mask, 6:].reshape((len(final_boxes), -1, 3))
                 keypoints, scores = pred_joints[:,:,:2], pred_joints[:,:,-1]
                 keypoints = keypoints / ratio
                 final_boxes = final_boxes / ratio
             else: # no detection
-                final_boxes, keypoints, scores = np.zeros((0, 4)),np.zeros((0, 17, 2)), np.zeros((0, 17))
+                final_boxes, final_boxes_scores, keypoints, scores = np.zeros((0, 4)),np.zeros((0, 1)),np.zeros((0, 17, 2)), np.zeros((0, 17))
 
-        return final_boxes, keypoints, scores
+        return final_boxes, final_boxes_scores, keypoints, scores
 
     def inference(self, img: np.ndarray):
         """Inference model.
@@ -425,9 +449,9 @@ class RTMO_GPU(object):
 
         outputs = self.inference(image)
 
-        bboxes, keypoints, scores = self.postprocess(outputs, ratio)
+        bboxes, bboxes_scores, keypoints, scores = self.postprocess(outputs, ratio)
 
-        return bboxes, keypoints, scores
+        return bboxes, bboxes_scores, keypoints, scores
 
     def __init__(self,
                  model: str = None,
@@ -569,22 +593,24 @@ class RTMO_GPU_Batch(RTMO_GPU):
         batch_keypoints = []
         batch_scores = []
         batch_bboxes = []
+        batch_bboxes_scores = []
 
         b_dets, b_keypoints = outputs
         for i, ratio in enumerate(ratios):
             output = [np.expand_dims(b_dets[i], axis=0), np.expand_dims(b_keypoints[i],axis=0)]
-            bboxes, keypoints, scores = super().postprocess(output, ratio)
+            bboxes, bboxes_scores, keypoints, scores = super().postprocess(output, ratio)
             batch_keypoints.append(keypoints)
             batch_scores.append(scores)
             batch_bboxes.append(bboxes)
+            batch_bboxes_scores.append(bboxes_scores)
 
-        return batch_bboxes, batch_keypoints, batch_scores
+        return batch_bboxes, batch_bboxes_scores, batch_keypoints, batch_scores
 
     def __batch_call__(self, images: List[np.ndarray]):
         batch_img, ratios = self.preprocess_batch(images)
         outputs = self.inference(batch_img)
-        bboxes, keypoints, scores = self.postprocess_batch(outputs, ratios)
-        return bboxes, keypoints, scores
+        bboxes, bboxes_scores, keypoints, scores = self.postprocess_batch(outputs, ratios)
+        return bboxes, bboxes_scores, keypoints, scores
 
     def __call__(self, image: np.array, camera_id = 0):
 
@@ -600,18 +626,19 @@ class RTMO_GPU_Batch(RTMO_GPU):
         in_queue.put(image)
 
         if len(self.buffers[camera_id]) == self.batch_size:
-            b_bboxes, b_keypoints, b_scores = self.__batch_call__(self.buffers[camera_id])
+            b_bboxes, b_bboxes_scores, b_keypoints, b_scores = self.__batch_call__(self.buffers[camera_id])
             for i, (keypoints, scores) in enumerate(zip(b_keypoints, b_scores)):
                 bboxes = b_bboxes[i]
-                out_queue.put((bboxes, keypoints, scores))
+                bboxes_scores = b_bboxes_scores[i]
+                out_queue.put((bboxes, bboxes_scores, keypoints, scores))
             self.buffers[camera_id] = []
 
-        frame, bboxes, keypoints, scores = None, None, None, None
+        frame, bboxes, bboxes_scores, keypoints, scores = None, None, None, None, None
        if not out_queue.empty():
-            bboxes, keypoints, scores = out_queue.get()
+            bboxes, bboxes_scores, keypoints, scores = out_queue.get()
             frame = in_queue.get()
 
-        return frame, bboxes, keypoints, scores
+        return frame, bboxes, bboxes_scores, keypoints, scores
 
 
     def __init__(self,