diff --git a/run.py b/run.py
index b47139f..bfb1b25 100644
--- a/run.py
+++ b/run.py
@@ -5,10 +5,31 @@
 PALM_MODEL_PATH = "models/palm_detection_without_custom_op.tflite"
 LANDMARK_MODEL_PATH = "models/hand_landmark.tflite"
 ANCHORS_PATH = "models/anchors.csv"
-
+MULTIHAND = True
+HULL = True
 POINT_COLOR = (0, 255, 0)
 CONNECTION_COLOR = (255, 0, 0)
+HULL_COLOR = (0, 0, 255)
 THICKNESS = 2
+HULL_THICKNESS = 2
+
+
+def drawpointstoframe(points, frame):
+    if points is not None:
+        for point in points:
+            x, y = point
+            cv2.circle(frame, (int(x), int(y)), THICKNESS * 2, POINT_COLOR, HULL_THICKNESS)
+        for connection in connections:
+            x0, y0 = points[connection[0]]
+            x1, y1 = points[connection[1]]
+            cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), CONNECTION_COLOR, THICKNESS)
+        if HULL:
+            for hull_connection in hull_connections:
+                x0, y0 = points[hull_connection[0]]
+                x1, y1 = points[hull_connection[1]]
+                cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), HULL_COLOR, HULL_THICKNESS)
+
+
 
 cv2.namedWindow(WINDOW)
 capture = cv2.VideoCapture(0)
@@ -31,14 +52,18 @@
 #       \    \   /
 #        ------0-
 connections = [
-    (0, 1), (1, 2), (2, 3), (3, 4),
+
     (5, 6), (6, 7), (7, 8),
     (9, 10), (10, 11), (11, 12),
     (13, 14), (14, 15), (15, 16),
-    (17, 18), (18, 19), (19, 20),
-    (0, 5), (5, 9), (9, 13), (13, 17), (0, 17)
+    (0, 5), (5, 9), (9, 13), (13, 17), (0, 9), (0, 13)
 ]
-
+pseudo_hull_connections = [(0, 17), (17, 18), (18, 19), (19, 20), (0, 1), (1, 2), (2, 3), (3, 4)]
+hull_connections = [(4, 8), (8, 12), (12, 16), (16, 20)]
+if HULL:
+    hull_connections += pseudo_hull_connections
+else:
+    connections += pseudo_hull_connections
 detector = HandTracker(
     PALM_MODEL_PATH,
     LANDMARK_MODEL_PATH,
@@ -47,17 +72,15 @@
     box_enlarge=1.3
 )
+
+
+
 
 while hasFrame:
     image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    points, _ = detector(image)
-    if points is not None:
-        for point in points:
-            x, y = point
-            cv2.circle(frame, (int(x), int(y)), THICKNESS * 2, POINT_COLOR, THICKNESS)
-        for connection in connections:
-            x0, y0 = points[connection[0]]
-            x1, y1 = points[connection[1]]
-            cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), CONNECTION_COLOR, THICKNESS)
+    points1, points2, _, _ = detector(image)
+    drawpointstoframe(points1, frame)
+    if MULTIHAND:
+        drawpointstoframe(points2, frame)
     cv2.imshow(WINDOW, frame)
     hasFrame, frame = capture.read()
     key = cv2.waitKey(1)
diff --git a/src/hand_tracker.py b/src/hand_tracker.py
index 550e2bf..b2b93ca 100644
--- a/src/hand_tracker.py
+++ b/src/hand_tracker.py
@@ -25,7 +25,7 @@ class HandTracker():
     """
 
     def __init__(self, palm_model, joint_model, anchors_path,
-                box_enlarge=1.5, box_shift=0.2):
+                 box_enlarge=1.5, box_shift=0.2):
         self.box_shift = box_shift
         self.box_enlarge = box_enlarge
 
@@ -51,21 +51,21 @@ def __init__(self, palm_model, joint_model, anchors_path,
         self.out_idx_joint = self.interp_joint.get_output_details()[0]['index']
 
         # 90° rotation matrix used to create the alignment trianlge
-        self.R90 = np.r_[[[0,1],[-1,0]]]
+        self.R90 = np.r_[[[0, 1], [-1, 0]]]
 
         # trianlge target coordinates used to move the detected hand
         # into the right position
         self._target_triangle = np.float32([
-                        [128, 128],
-                        [128,   0],
-                        [  0, 128]
-                    ])
+            [128, 128],
+            [128, 0],
+            [0, 128]
+        ])
         self._target_box = np.float32([
-                        [  0,   0, 1],
-                        [256,   0, 1],
-                        [256, 256, 1],
-                        [  0, 256, 1],
-                    ])
+            [0, 0, 1],
+            [256, 0, 1],
+            [256, 256, 1],
+            [0, 256, 1],
+        ])
 
     def _get_triangle(self, kp0, kp2, dist=1):
         """get a triangle used to calculate Affine transformation matrix"""
@@ -74,7 +74,7 @@ def _get_triangle(self, kp0, kp2, dist=1):
         dir_v /= np.linalg.norm(dir_v)
 
         dir_v_r = dir_v @ self.R90.T
-        return np.float32([kp2, kp2+dir_v*dist, kp2 + dir_v_r*dist])
+        return np.float32([kp2, kp2 + dir_v * dist, kp2 + dir_v_r * dist])
 
     @staticmethod
     def _triangle_to_bbox(source):
@@ -84,37 +84,36 @@ def _triangle_to_bbox(source):
             [source[1] + source[0] - source[2]],
             [3 * source[0] - source[1] - source[2]],
             [source[2] - source[1] + source[0]],
-        ].reshape(-1,2)
+        ].reshape(-1, 2)
         return bbox
 
     @staticmethod
     def _im_normalize(img):
-        return np.ascontiguousarray(
-            2 * ((img / 255) - 0.5
-        ).astype('float32'))
+        return np.ascontiguousarray(
+            2 * ((img / 255) - 0.5
+                 ).astype('float32'))
 
     @staticmethod
     def _sigm(x):
-        return 1 / (1 + np.exp(-x) )
+        return 1 / (1 + np.exp(-x))
 
     @staticmethod
     def _pad1(x):
-        return np.pad(x, ((0,0),(0,1)), constant_values=1, mode='constant')
-
+        return np.pad(x, ((0, 0), (0, 1)), constant_values=1, mode='constant')
 
     def predict_joints(self, img_norm):
         self.interp_joint.set_tensor(
-            self.in_idx_joint, img_norm.reshape(1,256,256,3))
+            self.in_idx_joint, img_norm.reshape(1, 256, 256, 3))
         self.interp_joint.invoke()
         joints = self.interp_joint.get_tensor(self.out_idx_joint)
-        return joints.reshape(-1,2)
+        return joints.reshape(-1, 2)
 
 
-    def detect_hand(self, img_norm):
-        assert -1 <= img_norm.min() and img_norm.max() <= 1,\
-        "img_norm should be in range [-1, 1]"
-        assert img_norm.shape == (256, 256, 3),\
-        "img_norm shape must be (256, 256, 3)"
+    def detect_hand(self, img_norm, index=0):
+        assert -1 <= img_norm.min() and img_norm.max() <= 1, \
+            "img_norm should be in range [-1, 1]"
+        assert img_norm.shape == (256, 256, 3), \
+            "img_norm shape must be (256, 256, 3)"
 
         # predict hand location and 7 initial landmarks
         self.interp_palm.set_tensor(self.in_idx, img_norm[None])
@@ -130,7 +129,7 @@ def detect_hand(self, img_norm):
         out_clf shape is [number of anchors]
         it is the classification score if there is a hand for each anchor box
         """
-        out_clf = self.interp_palm.get_tensor(self.out_clf_idx)[0,:,0]
+        out_clf = self.interp_palm.get_tensor(self.out_clf_idx)[0, :, 0]
 
         # finding the best prediction
         probabilities = self._sigm(out_clf)
@@ -150,16 +149,17 @@ def detect_hand(self, img_norm):
         box_ids = non_max_suppression_fast(moved_candidate_detect[:, :4], probabilities)
 
         # Pick the first detected hand. Could be adapted for multi hand recognition
-        box_ids = box_ids[0]
-
+        # print(box_ids1)
+        if len(box_ids) > index:
+            box_ids1 = box_ids[index]
+        else:
+            return None, None, None
         # bounding box offsets, width and height
-        dx,dy,w,h = candidate_detect[box_ids, :4]
-        center_wo_offst = candidate_anchors[box_ids,:2] * 256
-
+        dx, dy, w, h = candidate_detect[box_ids1, :4]
+        center_wo_offst = candidate_anchors[box_ids1, :2] * 256
         # 7 initial keypoints
-        keypoints = center_wo_offst + candidate_detect[box_ids,4:].reshape(-1,2)
-        side = max(w,h) * self.box_enlarge
-
+        keypoints = center_wo_offst + candidate_detect[box_ids1, 4:].reshape(-1, 2)
+        side = max(w, h) * self.box_enlarge
+
         # now we need to move and rotate the detected hand for it to occupy a
         # 256x256 square
         # line from wrist keypoint to middle finger keypoint
@@ -171,7 +171,7 @@ def detect_hand(self, img_norm):
         debug_info = {
             "detection_candidates": candidate_detect,
             "anchor_candidates": candidate_anchors,
-            "selected_box_id": box_ids,
+            "selected_box_id": box_ids1,
         }
 
         return source, keypoints, debug_info
@@ -182,7 +182,7 @@ def preprocess_img(self, img):
         pad = (shape.max() - shape[:2]).astype('uint32') // 2
         img_pad = np.pad(
             img,
-            ((pad[0],pad[0]), (pad[1],pad[1]), (0,0)),
+            ((pad[0], pad[0]), (pad[1], pad[1]), (0, 0)),
             mode='constant')
         img_small = cv2.resize(img_pad, (256, 256))
         img_small = np.ascontiguousarray(img_small)
@@ -190,38 +190,45 @@ def preprocess_img(self, img):
         img_norm = self._im_normalize(img_small)
         return img_pad, img_norm, pad
 
-
-    def __call__(self, img):
-        img_pad, img_norm, pad = self.preprocess_img(img)
-
-        source, keypoints, _ = self.detect_hand(img_norm)
-        if source is None:
-            return None, None
-
-        # calculating transformation from img_pad coords
-        # to img_landmark coords (cropped hand image)
-        scale = max(img.shape) / 256
+    def generate_orig(self, scale, source, img_pad, img_norm, pad):
         Mtr = cv2.getAffineTransform(
             source * scale,
             self._target_triangle
         )
 
         img_landmark = cv2.warpAffine(
-            self._im_normalize(img_pad), Mtr, (256,256)
+            self._im_normalize(img_pad), Mtr, (256, 256)
         )
 
         joints = self.predict_joints(img_landmark)
 
         # adding the [0,0,1] row to make the matrix square
         Mtr = self._pad1(Mtr.T).T
-        Mtr[2,:2] = 0
+
+        Mtr[2, :2] = 0
         Minv = np.linalg.inv(Mtr)
 
         # projecting keypoints back into original image coordinate space
-        kp_orig = (self._pad1(joints) @ Minv.T)[:,:2]
-        box_orig = (self._target_box @ Minv.T)[:,:2]
+        kp_orig = (self._pad1(joints) @ Minv.T)[:, :2]
+        box_orig = (self._target_box @ Minv.T)[:, :2]
         kp_orig -= pad[::-1]
         box_orig -= pad[::-1]
 
         return kp_orig, box_orig
+
+    def __call__(self, img):
+        img_pad, img_norm, pad = self.preprocess_img(img)
+        source, keypoints, _ = self.detect_hand(img_norm, 0)  # Hand One
+        if source is None:
+            return None, None, None, None
+        # calculating transformation from img_pad coords
+        # to img_landmark coords (cropped hand image)
+        scale = max(img.shape) / 256
+        kp_orig, box_orig = self.generate_orig(scale, source, img_pad, img_norm, pad)
+
+        source2, keypoints2, _ = self.detect_hand(img_norm, 1)  # Hand two
+        if source2 is None:
+            return kp_orig, None, box_orig, None
+        kp_orig2, box_orig2 = self.generate_orig(scale, source2, img_pad, img_norm, pad)
+
+        return kp_orig, kp_orig2, box_orig, box_orig2
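
Review note (not part of the patch): a minimal sketch of how the new four-value return of HandTracker.__call__ could be consumed outside run.py. The model paths and constructor arguments are the ones already used in run.py; the input image path and output file name are hypothetical, purely for illustration.

    import cv2
    from src.hand_tracker import HandTracker

    detector = HandTracker(
        "models/palm_detection_without_custom_op.tflite",
        "models/hand_landmark.tflite",
        "models/anchors.csv",
        box_shift=0.2,
        box_enlarge=1.3
    )

    frame = cv2.imread("two_hands.jpg")  # hypothetical test image (BGR)
    kp1, kp2, box1, box2 = detector(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # kp2/box2 are None when only one hand is detected,
    # and all four values are None when no hand is found.
    for kp in (kp1, kp2):
        if kp is not None:
            for x, y in kp:
                cv2.circle(frame, (int(x), int(y)), 4, (0, 255, 0), 2)
    cv2.imwrite("two_hands_annotated.jpg", frame)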