Trt x86 #2

Open · wants to merge 4 commits into main

Changes from all commits
28 changes: 19 additions & 9 deletions inference/inference.py
@@ -6,6 +6,7 @@
import os
from adaptive_object_detection.utils.parse_label_map import create_category_index_dict
from tools.visualization_utils import visualize_poses

logging.basicConfig(level=logging.INFO)


@@ -25,9 +26,15 @@ def inference(args):
if device == "x86":
from models.x86_pose_estimator import X86PoseEstimator
pose_estimator = X86PoseEstimator(detector_thresh=detector_thresh,
detector_input_size=(detector_input_height, detector_input_width),
pose_input_size=(pose_input_height, pose_input_width),
heatmap_size=(heatmap_height, heatmap_width))
elif device == "trt":
from models.trt_x86_pose_estimator import TRTPoseEstimator
pose_estimator = TRTPoseEstimator(detector_thresh=detector_thresh,
detector_input_size=(detector_input_height, detector_input_width),
pose_input_size=(pose_input_height, pose_input_width),
heatmap_size=(heatmap_height, heatmap_width))
else:
        raise ValueError("device should be 'x86' or 'trt' but you provided {0}".format(device))
video_uri = args.input_video
@@ -40,8 +47,9 @@ def inference(args):

file_name = ".".join((video_uri.split("/")[-1]).split(".")[:-1])
input_cap = cv.VideoCapture(video_uri)
fourcc = cv.VideoWriter_fourcc(*'XVID')
out_cap = cv.VideoWriter(os.path.join(args.out_dir, file_name + "_neuralet_pose.avi"), fourcc, 25,
(args.out_width, args.out_height))
if (input_cap.isOpened()):
print('opened video ', video_uri)
else:
@@ -69,17 +77,19 @@ def inference(args):
out_cap.write(out_frame)
frame_number += 1
if frame_number % 100 == 0:
average_fps = round(1 / (sum(times) / len(times)), 2)
times = []
logging.info("processed {0} frames, average FPS: {1}".format(frame_number, average_fps))


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="This script runs the inference of pose estimation models")
parser.add_argument("--device", type=str, default="x86", help="we only support x86 for now")
parser.add_argument("--device", type=str, default="x86",
help="supports x86 and trt which is running on x86 device")
parser.add_argument("--input_video", type=str, required=True, help="input video path")
parser.add_argument("--out_dir", type=str, required=True, help="directory to store output video")
parser.add_argument("--detector_model_path", type=str, help="path to the detector model files, if not provided the default COCO models will be used")
parser.add_argument("--detector_model_path", type=str,
help="path to the detector model files, if not provided the default COCO models will be used")
parser.add_argument("--label_map", type=str, help="path to the label map file")
parser.add_argument("--detector_threshold", type=float, default=0.1, help="detection's score threshold")
parser.add_argument("--detector_input_width", type=int, default=300, help="width of the detector's input")
@@ -95,5 +105,5 @@ def inference(args):

if (vars(args)["detector_model_path"]) and (not vars(args)["label_map"]):
        parser.error('If you pass detector_model_path you should also pass label_map')

inference(args)
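
For reference, a typical invocation of the new TensorRT path, run from the repository root, might look like this (the video and output paths are placeholders; --detector_model_path and --label_map are optional and fall back to the default COCO models):

    python inference/inference.py --device trt --input_video /path/to/video.mp4 --out_dir /path/to/output
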
239 changes: 239 additions & 0 deletions models/trt_x86_pose_estimator.py
@@ -0,0 +1,239 @@
import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import logging
from adaptive_object_detection.detectors.x86_detector_trt import X86TrtDetector
from .base_pose_estimator import BasePoseEstimator
from tools.convert_results_format import prepare_detection_results
from tools.bbox import box_to_center_scale, center_scale_to_box
from tools.transformations import im_to_tensor
from tools.pose_nms import pose_nms
from tools.transformations import get_affine_transform, get_max_pred
import numpy as np
import cv2


class TRTPoseEstimator(BasePoseEstimator):
def __init__(self,
detector_thresh,
detector_input_size=(300, 300),
pose_input_size=(256, 192),
heatmap_size=(64, 48),
):
super().__init__(detector_thresh)
self.detector_height, self.detector_width = detector_input_size
self.pose_input_size = pose_input_size
self.heatmap_size = heatmap_size

        self.h_input = None
        self.d_input = None
        self.h_output = None
        self.d_output = None
        self.stream = None

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
self.trt_runtime = trt.Runtime(TRT_LOGGER)
self.model = None
self.detector = None

def load_model(self, detector_path, detector_label_map):
self.detector = X86TrtDetector(width=self.detector_width, height=self.detector_height, thresh=self.detector_thresh)
self.detector.load_model(detector_path, detector_label_map)
self._init_cuda_stuff()

def inference(self, preprocessed_image):
raw_detections = self.detector.inference(preprocessed_image)
detections = prepare_detection_results(raw_detections, self.detector_width, self.detector_height)
inps, cropped_boxes, boxes, scores, ids = self.transform_detections(preprocessed_image, detections)
if inps.shape[0] == 0:
return (None, None, None, None, None)
        result = np.zeros((np.shape(inps)[0], 64, 48, 17))  # [batch_size, 64, 48, 17] TODO: dynamic shape on trt
for i, sample in enumerate(inps):
single_img = np.array(sample, dtype=np.float32, order='C')
self._load_images_to_buffer(single_img)
with self.model.create_execution_context() as context:
# Transfer input data to the GPU.
cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream)
context.execute(batch_size=1, bindings=[int(self.d_input), int(self.d_output)])
                cuda.memcpy_dtoh_async(self.h_output, self.d_output, self.stream)
                # Wait for the asynchronous copies to finish before reading the host buffer.
                self.stream.synchronize()
                out = self.h_output.reshape((64, 48, 17))  # TODO: it only works for fastpose
result[i, :] = out
return (result, cropped_boxes, boxes, scores, ids)

def preprocess(self, raw_image):
return self.detector.preprocess(raw_image)

def post_process(self, hm, cropped_boxes, boxes, scores, ids):
if hm is None:
return
assert hm.ndim == 4
pose_coords = []
pose_scores = []
for i in range(hm.shape[0]):
hm_size = self.heatmap_size
eval_joints = list(range(17))
bbox = cropped_boxes[i].tolist()
pose_coord, pose_score = self.heatmap_to_coord(hm[i, :, :, eval_joints], bbox, hm_shape=hm_size,
norm_type=None)

pose_coords.append(pose_coord)
pose_scores.append(pose_score)

preds_img = np.array(pose_coords)
preds_scores = np.array(pose_scores)

boxes, scores, ids, preds_img, preds_scores, pick_ids = \
pose_nms(boxes, scores, ids, preds_img, preds_scores, 0)
_result = []
for k in range(len(scores)):
            if np.ndim(preds_scores[k]) == 2:
                preds_scores[k] = preds_scores[k][:, 0].reshape([17, 1])
_result.append(
{
'keypoints': preds_img[k],
'kp_score': preds_scores[k],
'proposal_score': np.mean(preds_scores[k]) + scores[k] + 1.25 * max(preds_scores[k]),
'idx': ids[k],
'bbox': [boxes[k][0], boxes[k][1], boxes[k][2], boxes[k][3]]
}
)
return _result

def transform_detections(self, image, dets):
input_size = self.pose_input_size
        if isinstance(dets, int):
            # No detections were produced; return an empty batch so the caller's
            # inps.shape[0] == 0 check handles it.
            return np.zeros([0, int(input_size[0]), int(input_size[1]), 3]), None, None, None, None
dets = dets[dets[:, 0] == 0]
boxes = dets[:, 1:5]
scores = dets[:, 5:6]
ids = np.zeros(scores.shape)
inps = np.zeros([boxes.shape[0], int(input_size[0]), int(input_size[1]), 3])
cropped_boxes = np.zeros([boxes.shape[0], 4])
for i, box in enumerate(boxes):
inps[i], cropped_box = self.transform_single_detection(image, box, input_size)
cropped_boxes[i] = np.float32(cropped_box)
inps = im_to_tensor(inps)
return inps, cropped_boxes, boxes, scores, ids

@staticmethod
def transform_single_detection(image, bbox, input_size):
aspect_ratio = input_size[1] / input_size[0]
xmin, ymin, xmax, ymax = bbox
center, scale = box_to_center_scale(
xmin, ymin, xmax - xmin, ymax - ymin, aspect_ratio)
scale = scale * 1.0

        inp_h, inp_w = input_size

        trans = get_affine_transform(center, scale, 0, [inp_w, inp_h])
img = cv2.warpAffine(image, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR)
bbox = center_scale_to_box(center, scale)
        img = img / 255.0
        # Per-channel mean subtraction.
        img[..., 0] = img[..., 0] - 0.406
        img[..., 1] = img[..., 1] - 0.457
        img[..., 2] = img[..., 2] - 0.480
return img, bbox

def heatmap_to_coord(self, hms, bbox, hms_flip=None, **kwargs):
if hms_flip is not None:
hms = (hms + hms_flip) / 2
if not isinstance(hms, np.ndarray):
hms = hms.cpu().data.numpy()
coords, maxvals = get_max_pred(hms)

hm_h = hms.shape[1]
hm_w = hms.shape[2]

# post-processing
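        # Refine each joint by shifting the argmax a quarter of a pixel toward the larger neighbouring activation.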
for p in range(coords.shape[0]):
hm = hms[p]
px = int(round(float(coords[p][0])))
py = int(round(float(coords[p][1])))
if 1 < px < hm_w - 1 and 1 < py < hm_h - 1:
diff = np.array((hm[py][px + 1] - hm[py][px - 1],
hm[py + 1][px] - hm[py - 1][px]))
coords[p] += np.sign(diff) * .25

preds = np.zeros_like(coords)

# transform bbox to scale
xmin, ymin, xmax, ymax = bbox
w = xmax - xmin
h = ymax - ymin
center = np.array([xmin + w * 0.5, ymin + h * 0.5])
scale = np.array([w, h])
# Transform back
for i in range(coords.shape[0]):
preds[i] = self.transform_preds(coords[i], center, scale,
[hm_w, hm_h])

return preds, maxvals

def transform_preds(self, coords, center, scale, output_size):
target_coords = np.zeros(coords.shape)
trans = get_affine_transform(center, scale, 0, output_size, inv=1)
target_coords[0:2] = self.affine_transform(coords[0:2], trans)
return target_coords

@staticmethod
def affine_transform(pt, t):
new_pt = np.array([pt[0], pt[1], 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2]

def _load_images_to_buffer(self, img):
preprocessed = np.asarray(img).ravel()
np.copyto(self.h_input, preprocessed)

def _load_engine(self):
base_dir = "models/data/"
model_file = "TRT_fastpose_resnet50.trt"
model_path = os.path.join(base_dir, model_file)
        if not os.path.isfile(model_path):
            logging.error('TensorRT engine file does not exist under: {}'.format(model_path))
            raise FileNotFoundError(model_path)
        with open(model_path, 'rb') as f:
            engine_data = f.read()
        engine = self.trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

def _init_cuda_stuff(self):
self.model = self._load_engine()
        self.h_input, self.d_input, self.h_output, self.d_output, self.stream = self._allocate_buffers(self.model, 1,
                                                                                                        trt.float32)

@staticmethod
def _allocate_buffers(engine, batch_size, data_type):
"""
This is the function to allocate buffers for input and output in the device
Args:
engine : The path to the TensorRT engine.
batch_size : The batch size for execution time.
data_type: The type of the data for input and output, for example trt.float32.

Output:
h_input_1: Input in the host.
d_input_1: Input in the device.
h_output_1: Output in the host.
d_output_1: Output in the device.
stream: CUDA stream.

"""
# Determine dimensions and create page-locked memory buffers (which won't be swapped to disk) to hold host inputs/outputs.
h_input_1 = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(0)),
dtype=trt.nptype(data_type))
h_output = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(1)),
dtype=trt.nptype(data_type))
# Allocate device memory for inputs and outputs.
d_input_1 = cuda.mem_alloc(h_input_1.nbytes)

d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()
return h_input_1, d_input_1, h_output, d_output, stream
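
As a usage note, the sketch below shows how the buffers returned by _allocate_buffers are typically consumed for a single synchronous inference, mirroring TRTPoseEstimator.inference above; estimator (an initialized TRTPoseEstimator) and single_img (one preprocessed pose crop) are placeholders, not part of this PR:

    import numpy as np
    import pycuda.driver as cuda

    flat = np.asarray(single_img, dtype=np.float32, order='C').ravel()
    np.copyto(estimator.h_input, flat)  # stage the input in page-locked host memory
    with estimator.model.create_execution_context() as context:
        cuda.memcpy_htod_async(estimator.d_input, estimator.h_input, estimator.stream)  # host -> device
        context.execute(batch_size=1, bindings=[int(estimator.d_input), int(estimator.d_output)])
        cuda.memcpy_dtoh_async(estimator.h_output, estimator.d_output, estimator.stream)  # device -> host
        estimator.stream.synchronize()  # wait for the async copies to complete
    heatmaps = estimator.h_output.reshape((64, 48, 17))  # one FastPose heatmap per joint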