This repository has been archived by the owner on Aug 5, 2023. It is now read-only.

Added MultiHand Support and Hull drawing around Hand #17

Merged · 18 commits · Oct 30, 2020
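As the diff reads, HandTracker.__call__ now runs palm detection twice (NMS result index 0 and index 1) and returns four values — landmark keypoints and bounding box for each hand — with None in the slots of a hand that was not found, and run.py draws the second hand only when MULTIHAND is set. A minimal usage sketch of the new return signature (the webcam setup and constants below are assumptions mirroring the repository's run.py, not shown in this diff):

import cv2
from src.hand_tracker import HandTracker

detector = HandTracker(
    "models/palm_detection_without_custom_op.tflite",
    "models/hand_landmark.tflite",
    "models/anchors.csv",
    box_enlarge=1.3
)
capture = cv2.VideoCapture(0)
ok, frame = capture.read()
if ok:
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # new 4-tuple: (keypoints hand 1, keypoints hand 2, box hand 1, box hand 2)
    kp1, kp2, box1, box2 = detector(rgb)
    if kp1 is not None:
        print("hand one landmarks:", kp1.shape)  # 21 (x, y) points
    if kp2 is not None:
        print("hand two landmarks:", kp2.shape)
capture.release()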
51 changes: 37 additions & 14 deletions run.py
@@ -5,10 +5,31 @@
PALM_MODEL_PATH = "models/palm_detection_without_custom_op.tflite"
LANDMARK_MODEL_PATH = "models/hand_landmark.tflite"
ANCHORS_PATH = "models/anchors.csv"

MULTIHAND = True
HULL = True
POINT_COLOR = (0, 255, 0)
CONNECTION_COLOR = (255, 0, 0)
HULL_COLOR = (0, 0, 255)
THICKNESS = 2
HULL_THICKNESS = 2


def drawpointstoframe(points, frame):
    # draw the landmark points, skeleton connections and (optionally) the hull of one hand
    if points is not None:
        for point in points:
            x, y = point
            cv2.circle(frame, (int(x), int(y)), THICKNESS * 2, POINT_COLOR, HULL_THICKNESS)
        for connection in connections:
            x0, y0 = points[connection[0]]
            x1, y1 = points[connection[1]]
            cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), CONNECTION_COLOR, THICKNESS)
        if HULL:
            for hull_connection in hull_connections:
                x0, y0 = points[hull_connection[0]]
                x1, y1 = points[hull_connection[1]]
                cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), HULL_COLOR, HULL_THICKNESS)



cv2.namedWindow(WINDOW)
capture = cv2.VideoCapture(0)
@@ -31,14 +52,18 @@
# \ \ /
# ------0-
connections = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (5, 6), (6, 7), (7, 8),
    (9, 10), (10, 11), (11, 12),
    (13, 14), (14, 15), (15, 16),
    (17, 18), (18, 19), (19, 20),
    (0, 5), (5, 9), (9, 13), (13, 17), (0, 9), (0, 13)
]

pseudo_hull_connections = [(0, 17), (17, 18), (18, 19), (19, 20), (0, 1), (1, 2), (2, 3), (3, 4)]
hull_connections = [(4, 8), (8, 12), (12, 16), (16, 20)]
if HULL:
    hull_connections += pseudo_hull_connections
else:
    connections += pseudo_hull_connections
detector = HandTracker(
    PALM_MODEL_PATH,
    LANDMARK_MODEL_PATH,
@@ -47,17 +72,15 @@
    box_enlarge=1.3
)

while hasFrame:
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    points1, points2, _, _ = detector(image)
    drawpointstoframe(points1, frame)
    if MULTIHAND:
        drawpointstoframe(points2, frame)
    cv2.imshow(WINDOW, frame)
    hasFrame, frame = capture.read()
    key = cv2.waitKey(1)
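Design note on the hull drawing: hull_connections approximates the outline with a fixed set of landmark pairs — fingertip-to-fingertip segments plus the thumb and pinky edges from pseudo_hull_connections — which is cheap and index-stable but not a true convex hull, so it can cut across folded fingers. If an exact outline were preferred, OpenCV's convexHull could be computed per frame; a small sketch (drawhull is a hypothetical helper, not part of this PR):

import cv2
import numpy as np

def drawhull(points, frame, color=(0, 0, 255), thickness=2):
    # hypothetical alternative to the fixed hull_connections list:
    # compute the convex hull of all 21 landmarks and draw it as a closed polyline
    if points is None:
        return
    hull = cv2.convexHull(np.asarray(points, dtype=np.float32))
    cv2.polylines(frame, [hull.astype(np.int32)], isClosed=True, color=color, thickness=thickness)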
113 changes: 60 additions & 53 deletions src/hand_tracker.py
@@ -25,7 +25,7 @@ class HandTracker():
    """

    def __init__(self, palm_model, joint_model, anchors_path,
                 box_enlarge=1.5, box_shift=0.2):
        self.box_shift = box_shift
        self.box_enlarge = box_enlarge

@@ -51,21 +51,21 @@ def __init__(self, palm_model, joint_model, anchors_path,
        self.out_idx_joint = self.interp_joint.get_output_details()[0]['index']

        # 90° rotation matrix used to create the alignment triangle
        self.R90 = np.r_[[[0, 1], [-1, 0]]]

        # triangle target coordinates used to move the detected hand
        # into the right position
        self._target_triangle = np.float32([
            [128, 128],
            [128, 0],
            [0, 128]
        ])
        self._target_box = np.float32([
            [0, 0, 1],
            [256, 0, 1],
            [256, 256, 1],
            [0, 256, 1],
        ])

    def _get_triangle(self, kp0, kp2, dist=1):
        """get a triangle used to calculate Affine transformation matrix"""
@@ -74,7 +74,7 @@ def _get_triangle(self, kp0, kp2, dist=1):
        dir_v /= np.linalg.norm(dir_v)

        dir_v_r = dir_v @ self.R90.T
        return np.float32([kp2, kp2 + dir_v * dist, kp2 + dir_v_r * dist])

    @staticmethod
    def _triangle_to_bbox(source):
@@ -84,37 +84,36 @@ def _triangle_to_bbox(source):
            [source[1] + source[0] - source[2]],
            [3 * source[0] - source[1] - source[2]],
            [source[2] - source[1] + source[0]],
        ].reshape(-1, 2)
        return bbox

    @staticmethod
    def _im_normalize(img):
        return np.ascontiguousarray(
            2 * ((img / 255) - 0.5
                 ).astype('float32'))

    @staticmethod
    def _sigm(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def _pad1(x):
        return np.pad(x, ((0, 0), (0, 1)), constant_values=1, mode='constant')

    def predict_joints(self, img_norm):
        self.interp_joint.set_tensor(
            self.in_idx_joint, img_norm.reshape(1, 256, 256, 3))
        self.interp_joint.invoke()

        joints = self.interp_joint.get_tensor(self.out_idx_joint)
        return joints.reshape(-1, 2)

    def detect_hand(self, img_norm, index=0):
        assert -1 <= img_norm.min() and img_norm.max() <= 1, \
            "img_norm should be in range [-1, 1]"
        assert img_norm.shape == (256, 256, 3), \
            "img_norm shape must be (256, 256, 3)"

        # predict hand location and 7 initial landmarks
        self.interp_palm.set_tensor(self.in_idx, img_norm[None])
@@ -130,7 +129,7 @@ def detect_hand(self, img_norm):
        out_clf shape is [number of anchors]
        it is the classification score if there is a hand for each anchor box
        """
        out_clf = self.interp_palm.get_tensor(self.out_clf_idx)[0, :, 0]

        # finding the best prediction
        probabilities = self._sigm(out_clf)
@@ -150,16 +149,17 @@
        box_ids = non_max_suppression_fast(moved_candidate_detect[:, :4], probabilities)

        # pick the detected hand at the requested index (0 = most confident);
        # bail out if fewer hands than index + 1 were found
        # print(box_ids1)
        if len(box_ids) > index:
            box_ids1 = box_ids[index]
        else:
            return None, None, None

        # bounding box offsets, width and height
        dx, dy, w, h = candidate_detect[box_ids1, :4]
        center_wo_offst = candidate_anchors[box_ids1, :2] * 256

        # 7 initial keypoints
        keypoints = center_wo_offst + candidate_detect[box_ids1, 4:].reshape(-1, 2)
        side = max(w, h) * self.box_enlarge

        # now we need to move and rotate the detected hand for it to occupy a
        # 256x256 square
        # line from wrist keypoint to middle finger keypoint
@@ -171,7 +171,7 @@ def detect_hand(self, img_norm):
        debug_info = {
            "detection_candidates": candidate_detect,
            "anchor_candidates": candidate_anchors,
            "selected_box_id": box_ids1,
        }

        return source, keypoints, debug_info
@@ -182,46 +182,53 @@ def preprocess_img(self, img):
        pad = (shape.max() - shape[:2]).astype('uint32') // 2
        img_pad = np.pad(
            img,
            ((pad[0], pad[0]), (pad[1], pad[1]), (0, 0)),
            mode='constant')
        img_small = cv2.resize(img_pad, (256, 256))
        img_small = np.ascontiguousarray(img_small)

        img_norm = self._im_normalize(img_small)
        return img_pad, img_norm, pad

    def generate_orig(self, scale, source, img_pad, img_norm, pad):
        Mtr = cv2.getAffineTransform(
            source * scale,
            self._target_triangle
        )

        img_landmark = cv2.warpAffine(
            self._im_normalize(img_pad), Mtr, (256, 256)
        )

        joints = self.predict_joints(img_landmark)

        # adding the [0,0,1] row to make the matrix square
        Mtr = self._pad1(Mtr.T).T
        Mtr[2, :2] = 0

        Minv = np.linalg.inv(Mtr)

        # projecting keypoints back into original image coordinate space
        kp_orig = (self._pad1(joints) @ Minv.T)[:, :2]
        box_orig = (self._target_box @ Minv.T)[:, :2]
        kp_orig -= pad[::-1]
        box_orig -= pad[::-1]

        return kp_orig, box_orig

    def __call__(self, img):
        img_pad, img_norm, pad = self.preprocess_img(img)
        source, keypoints, _ = self.detect_hand(img_norm, 0)  # Hand one
        if source is None:
            return None, None, None, None
        # calculating transformation from img_pad coords
        # to img_landmark coords (cropped hand image)
        scale = max(img.shape) / 256
        kp_orig, box_orig = self.generate_orig(scale, source, img_pad, img_norm, pad)

        source2, keypoints2, _ = self.detect_hand(img_norm, 1)  # Hand two
        if source2 is None:
            return kp_orig, None, box_orig, None
        kp_orig2, box_orig2 = self.generate_orig(scale, source2, img_pad, img_norm, pad)

        return kp_orig, kp_orig2, box_orig, box_orig2
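The two-hand limit lives only in __call__, which calls detect_hand with the hard-coded indices 0 and 1; detect_hand itself accepts any index into the non-max-suppression results. A sketch of how the same pieces could be looped for an arbitrary number of hands (detect_hands and max_hands are assumptions, not part of this PR; the method would sit on HandTracker):

def detect_hands(self, img, max_hands=2):
    # hypothetical generalisation: return a list of (keypoints, box) pairs
    img_pad, img_norm, pad = self.preprocess_img(img)
    scale = max(img.shape) / 256
    hands = []
    for index in range(max_hands):
        source, _, _ = self.detect_hand(img_norm, index)
        if source is None:
            break  # detect_hand returns (None, None, None) once the NMS list runs out
        hands.append(self.generate_orig(scale, source, img_pad, img_norm, pad))
    return hands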