Implement pycuda backend for inference with TensorRT engine,
in addition to the original polygraphy backend.
The default is polygraphy; set TRT_BACKEND to 'PYCUDA' to choose the pycuda backend.
- rtmo_gpu.py +125 -19
rtmo_gpu.py
CHANGED
@@ -5,6 +5,7 @@ import onnxruntime as ort
|
|
5 |
import cv2
|
6 |
from queue import Queue
|
7 |
os.environ['ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS']='libmmdeploy_tensorrt_ops.so'
|
|
|
8 |
|
9 |
# dictionary from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
|
10 |
coco17 = dict(name='coco17',
|
@@ -442,17 +443,39 @@ class RTMO_GPU(object):
|
|
442 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
443 |
|
444 |
else: # 'engine'
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
|
452 |
return outputs
|
453 |
|
454 |
def __exit__(self):
|
455 |
-
if self.model_format == 'engine':
|
456 |
if self.session.is_active:
|
457 |
self.session.deactivate()
|
458 |
|
@@ -471,7 +494,11 @@ class RTMO_GPU(object):
|
|
471 |
mean: tuple = None,
|
472 |
std: tuple = None,
|
473 |
device: str = 'cuda',
|
474 |
-
is_yolo_nas_pose = False
|
|
|
|
|
|
|
|
|
475 |
|
476 |
if not os.path.exists(model):
|
477 |
# If the file does not exist, raise FileNotFoundError
|
@@ -499,10 +526,62 @@ class RTMO_GPU(object):
|
|
499 |
providers=providers[device])
|
500 |
|
501 |
else: # 'engine'
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
506 |
|
507 |
self.model_input_size = self.input_shape[2:4] # B, C, H, W,
|
508 |
self.mean = mean
|
@@ -510,6 +589,9 @@ class RTMO_GPU(object):
|
|
510 |
self.device = device
|
511 |
self.is_yolo_nas_pose = is_yolo_nas_pose
|
512 |
|
|
|
|
|
|
|
513 |
class RTMO_GPU_Batch(RTMO_GPU):
|
514 |
def preprocess_batch(self, imgs: List[np.ndarray]) -> Tuple[np.ndarray, List[float]]:
|
515 |
"""Process a batch of images for RTMPose model inference.
|
@@ -571,12 +653,34 @@ class RTMO_GPU_Batch(RTMO_GPU):
|
|
571 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
572 |
|
573 |
else: # 'engine'
|
|
|
|
|
|
|
574 |
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
580 |
|
581 |
return outputs
|
582 |
|
@@ -651,14 +755,16 @@ class RTMO_GPU_Batch(RTMO_GPU):
|
|
651 |
std: tuple = None,
|
652 |
device: str = 'cuda',
|
653 |
is_yolo_nas_pose = False,
|
|
|
654 |
batch_size: int = 1):
|
655 |
super().__init__(model,
|
656 |
mean,
|
657 |
std,
|
658 |
device,
|
659 |
-
is_yolo_nas_pose
|
|
|
|
|
660 |
|
661 |
-
self.batch_size = batch_size
|
662 |
self.in_queues = dict()
|
663 |
self.out_queues = dict()
|
664 |
self.buffers = dict()
|
|
|
5 |
import cv2
|
6 |
from queue import Queue
|
7 |
os.environ['ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS']='libmmdeploy_tensorrt_ops.so'
|
8 |
+
TRT_BACKEND='POLYGRAPHY'
|
9 |
|
10 |
# dictionary from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
|
11 |
coco17 = dict(name='coco17',
|
|
|
443 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
444 |
|
445 |
else: # 'engine'
|
446 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
447 |
+
if not self.session.is_active:
|
448 |
+
self.session.activate()
|
449 |
+
|
450 |
+
outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
|
451 |
+
outputs = [output for output in outputs.values()]
|
452 |
+
else: # PYCUDA
|
453 |
+
import pycuda.driver as cuda
|
454 |
+
# Set the input shape dynamically
|
455 |
+
input_shape = input.shape
|
456 |
+
self.context.set_binding_shape(0, input_shape)
|
457 |
+
|
458 |
+
# Ensure input_data matches the expected shape
|
459 |
+
np.copyto(self.inputs[0]['host'], input.ravel())
|
460 |
+
cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
|
461 |
+
|
462 |
+
# Run inference
|
463 |
+
self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
|
464 |
+
|
465 |
+
# Transfer predictions back from the GPU
|
466 |
+
for output in self.outputs:
|
467 |
+
cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
|
468 |
+
|
469 |
+
# Synchronize the stream
|
470 |
+
self.stream.synchronize()
|
471 |
+
|
472 |
+
# Return only the output values (in their original shapes)
|
473 |
+
outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
|
474 |
|
475 |
return outputs
|
476 |
|
477 |
def __exit__(self, exc_type=None, exc_value=None, traceback=None):
    """Release inference resources when leaving a ``with`` block.

    Bug fix: ``__exit__`` previously took only ``self``, so using the
    object as a context manager raised TypeError (Python always passes
    exc_type, exc_value and traceback). The three standard parameters
    are added with ``None`` defaults, which keeps any existing direct
    ``obj.__exit__()`` calls working.

    Only the polygraphy backend holds an activatable session to tear
    down here.
    # NOTE(review): the PYCUDA backend's device buffers allocated in
    # __init__ are not freed here — presumably reclaimed when the CUDA
    # context from pycuda.autoinit is torn down; confirm this is acceptable.

    Returns None, so any exception raised inside the ``with`` block
    propagates normally.
    """
    if self.model_format == 'engine' and TRT_BACKEND == 'POLYGRAPHY':
        if self.session.is_active:
            self.session.deactivate()
|
481 |
|
|
|
494 |
mean: tuple = None,
|
495 |
std: tuple = None,
|
496 |
device: str = 'cuda',
|
497 |
+
is_yolo_nas_pose = False,
|
498 |
+
batch_size = 1,
|
499 |
+
plugin_path = 'libmmdeploy_tensorrt_ops.so'):
|
500 |
+
|
501 |
+
self.batch_size = batch_size
|
502 |
|
503 |
if not os.path.exists(model):
|
504 |
# If the file does not exist, raise FileNotFoundError
|
|
|
526 |
providers=providers[device])
|
527 |
|
528 |
else: # 'engine'
|
529 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
530 |
+
from polygraphy.backend.common import BytesFromPath
|
531 |
+
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
|
532 |
+
engine = EngineFromBytes(BytesFromPath(model))
|
533 |
+
self.session = TrtRunner(engine)
|
534 |
+
else: # PYCUDA
|
535 |
+
import tensorrt as trt
|
536 |
+
import ctypes
|
537 |
+
import pycuda.autoinit
|
538 |
+
import pycuda.driver as cuda
|
539 |
+
self.TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
540 |
+
self.trt_model_path = model
|
541 |
+
self.plugin_path = plugin_path
|
542 |
+
|
543 |
+
# Load the custom plugin library
|
544 |
+
ctypes.CDLL(self.plugin_path)
|
545 |
+
|
546 |
+
# Load the TensorRT engine
|
547 |
+
with open(self.trt_model_path, 'rb') as f:
|
548 |
+
engine_data = f.read()
|
549 |
+
|
550 |
+
self.runtime = trt.Runtime(self.TRT_LOGGER)
|
551 |
+
self.engine = self.runtime.deserialize_cuda_engine(engine_data)
|
552 |
+
|
553 |
+
if self.engine is None:
|
554 |
+
raise RuntimeError("Failed to load the engine.")
|
555 |
+
|
556 |
+
self.context = self.engine.create_execution_context()
|
557 |
+
|
558 |
+
self.inputs = []
|
559 |
+
self.outputs = []
|
560 |
+
self.bindings = []
|
561 |
+
self.stream = cuda.Stream()
|
562 |
+
|
563 |
+
# Allocate memory for inputs and outputs
|
564 |
+
for binding in self.engine:
|
565 |
+
binding_index = self.engine.get_binding_index(binding)
|
566 |
+
shape = self.engine.get_binding_shape(binding_index)
|
567 |
+
if shape[0] == -1:
|
568 |
+
# Handle dynamic batch size by setting max_batch_size
|
569 |
+
shape[0] = self.batch_size
|
570 |
+
size = trt.volume(shape)
|
571 |
+
dtype = trt.nptype(self.engine.get_binding_dtype(binding))
|
572 |
+
|
573 |
+
# Allocate host and device buffers
|
574 |
+
host_mem = cuda.pagelocked_empty(size, dtype)
|
575 |
+
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
576 |
+
|
577 |
+
# Append the device buffer to device bindings.
|
578 |
+
self.bindings.append(int(device_mem))
|
579 |
+
|
580 |
+
# Append to the appropriate list.
|
581 |
+
if self.engine.binding_is_input(binding):
|
582 |
+
self.inputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
|
583 |
+
else:
|
584 |
+
self.outputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
|
585 |
|
586 |
self.model_input_size = self.input_shape[2:4] # B, C, H, W,
|
587 |
self.mean = mean
|
|
|
589 |
self.device = device
|
590 |
self.is_yolo_nas_pose = is_yolo_nas_pose
|
591 |
|
592 |
+
print(f'[I] Detected \'{self.model_format.upper()}\' model', end='')
|
593 |
+
print(f', \'{TRT_BACKEND.upper()}\' backend is chosen for inference' if self.model_format == 'engine' else '')
|
594 |
+
|
595 |
class RTMO_GPU_Batch(RTMO_GPU):
|
596 |
def preprocess_batch(self, imgs: List[np.ndarray]) -> Tuple[np.ndarray, List[float]]:
|
597 |
"""Process a batch of images for RTMPose model inference.
|
|
|
653 |
outputs = [output.numpy() for output in io_binding.get_outputs()]
|
654 |
|
655 |
else: # 'engine'
|
656 |
+
if TRT_BACKEND == 'POLYGRAPHY':
|
657 |
+
if not self.session.is_active:
|
658 |
+
self.session.activate()
|
659 |
|
660 |
+
outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
|
661 |
+
outputs = [output for output in outputs.values()]
|
662 |
+
else: # PYCUDA
|
663 |
+
import pycuda.driver as cuda
|
664 |
+
# Set the input shape dynamically
|
665 |
+
input_shape = input.shape
|
666 |
+
self.context.set_binding_shape(0, input_shape)
|
667 |
+
|
668 |
+
# Ensure input_data matches the expected shape
|
669 |
+
np.copyto(self.inputs[0]['host'], input.ravel())
|
670 |
+
cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
|
671 |
+
|
672 |
+
# Run inference
|
673 |
+
self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
|
674 |
+
|
675 |
+
# Transfer predictions back from the GPU
|
676 |
+
for output in self.outputs:
|
677 |
+
cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
|
678 |
+
|
679 |
+
# Synchronize the stream
|
680 |
+
self.stream.synchronize()
|
681 |
+
|
682 |
+
# Return only the output values (in their original shapes)
|
683 |
+
outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
|
684 |
|
685 |
return outputs
|
686 |
|
|
|
755 |
std: tuple = None,
|
756 |
device: str = 'cuda',
|
757 |
is_yolo_nas_pose = False,
|
758 |
+
plugin_path = 'libmmdeploy_tensorrt_ops.so',
|
759 |
batch_size: int = 1):
|
760 |
super().__init__(model,
|
761 |
mean,
|
762 |
std,
|
763 |
device,
|
764 |
+
is_yolo_nas_pose,
|
765 |
+
batch_size,
|
766 |
+
plugin_path)
|
767 |
|
|
|
768 |
self.in_queues = dict()
|
769 |
self.out_queues = dict()
|
770 |
self.buffers = dict()
|