This repository has been archived by the owner on Aug 5, 2023. It is now read-only.

Added MultiHand Support and Hull drawing around Hand #17

Merged · 18 commits · Oct 30, 2020
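As the diff reads, HandTracker.__call__ now runs palm detection twice (NMS result index 0 and index 1) and returns four values — landmark keypoints and bounding box for each hand — with None in the slots of a hand that was not found, and run.py draws the second hand only when MULTIHAND is set. A minimal usage sketch of the new return signature (the webcam setup and constants below are assumptions mirroring the repository's run.py, not shown in this diff):

import cv2
from src.hand_tracker import HandTracker

detector = HandTracker(
    "models/palm_detection_without_custom_op.tflite",
    "models/hand_landmark.tflite",
    "models/anchors.csv",
    box_enlarge=1.3
)
capture = cv2.VideoCapture(0)
ok, frame = capture.read()
if ok:
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # new 4-tuple: (keypoints hand 1, keypoints hand 2, box hand 1, box hand 2)
    kp1, kp2, box1, box2 = detector(rgb)
    if kp1 is not None:
        print("hand one landmarks:", kp1.shape)  # 21 (x, y) points
    if kp2 is not None:
        print("hand two landmarks:", kp2.shape)
capture.release()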
51 changes: 37 additions & 14 deletions run.py
@@ -5,10 +5,31 @@
PALM_MODEL_PATH = "models/palm_detection_without_custom_op.tflite"
LANDMARK_MODEL_PATH = "models/hand_landmark.tflite"
ANCHORS_PATH = "models/anchors.csv"

MULTIHAND = True
HULL = True
POINT_COLOR = (0, 255, 0)
CONNECTION_COLOR = (255, 0, 0)
HULL_COLOR = (0, 0, 255)
THICKNESS = 2
HULL_THICKNESS = 2


def drawpointstoframe(points, frame):
    # draw the landmark points, skeleton connections and (optionally) the hull of one hand
    if points is not None:
        for point in points:
            x, y = point
            cv2.circle(frame, (int(x), int(y)), THICKNESS * 2, POINT_COLOR, HULL_THICKNESS)
        for connection in connections:
            x0, y0 = points[connection[0]]
            x1, y1 = points[connection[1]]
            cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), CONNECTION_COLOR, THICKNESS)
        if HULL:
            for hull_connection in hull_connections:
                x0, y0 = points[hull_connection[0]]
                x1, y1 = points[hull_connection[1]]
                cv2.line(frame, (int(x0), int(y0)), (int(x1), int(y1)), HULL_COLOR, HULL_THICKNESS)



cv2.namedWindow(WINDOW)
capture = cv2.VideoCapture(0)
@@ -31,14 +52,18 @@
# \ \ /
# ------0-
connections = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (5, 6), (6, 7), (7, 8),
    (9, 10), (10, 11), (11, 12),
    (13, 14), (14, 15), (15, 16),
    (17, 18), (18, 19), (19, 20),
    (0, 5), (5, 9), (9, 13), (13, 17), (0, 9), (0, 13)
]

pseudo_hull_connections = [(0, 17), (17, 18), (18, 19), (19, 20), (0, 1), (1, 2), (2, 3), (3, 4)]
hull_connections = [(4, 8), (8, 12), (12, 16), (16, 20)]
if HULL:
    hull_connections += pseudo_hull_connections
else:
    connections += pseudo_hull_connections
detector = HandTracker(
    PALM_MODEL_PATH,
    LANDMARK_MODEL_PATH,
@@ -47,17 +72,15 @@
    box_enlarge=1.3
)

while hasFrame:
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    points1, points2, _, _ = detector(image)
    drawpointstoframe(points1, frame)
    if MULTIHAND:
        drawpointstoframe(points2, frame)
    cv2.imshow(WINDOW, frame)
    hasFrame, frame = capture.read()
    key = cv2.waitKey(1)
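Design note on the hull drawing: hull_connections approximates the outline with a fixed set of landmark pairs — fingertip-to-fingertip segments plus the thumb and pinky edges from pseudo_hull_connections — which is cheap and index-stable but not a true convex hull, so it can cut across folded fingers. If an exact outline were preferred, OpenCV's convexHull could be computed per frame; a small sketch (drawhull is a hypothetical helper, not part of this PR):

import cv2
import numpy as np

def drawhull(points, frame, color=(0, 0, 255), thickness=2):
    # hypothetical alternative to the fixed hull_connections list:
    # compute the convex hull of all 21 landmarks and draw it as a closed polyline
    if points is None:
        return
    hull = cv2.convexHull(np.asarray(points, dtype=np.float32))
    cv2.polylines(frame, [hull.astype(np.int32)], isClosed=True, color=color, thickness=thickness)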
113 changes: 60 additions & 53 deletions src/hand_tracker.py
@@ -25,7 +25,7 @@ class HandTracker():
    """

    def __init__(self, palm_model, joint_model, anchors_path,
                 box_enlarge=1.5, box_shift=0.2):
        self.box_shift = box_shift
        self.box_enlarge = box_enlarge

@@ -51,21 +51,21 @@ def __init__(self, palm_model, joint_model, anchors_path,
        self.out_idx_joint = self.interp_joint.get_output_details()[0]['index']

        # 90° rotation matrix used to create the alignment triangle
        self.R90 = np.r_[[[0, 1], [-1, 0]]]

        # triangle target coordinates used to move the detected hand
        # into the right position
        self._target_triangle = np.float32([
            [128, 128],
            [128, 0],
            [0, 128]
        ])
        self._target_box = np.float32([
            [0, 0, 1],
            [256, 0, 1],
            [256, 256, 1],
            [0, 256, 1],
        ])

    def _get_triangle(self, kp0, kp2, dist=1):
        """get a triangle used to calculate Affine transformation matrix"""
@@ -74,7 +74,7 @@ def _get_triangle(self, kp0, kp2, dist=1):
        dir_v /= np.linalg.norm(dir_v)

        dir_v_r = dir_v @ self.R90.T
        return np.float32([kp2, kp2 + dir_v * dist, kp2 + dir_v_r * dist])

    @staticmethod
    def _triangle_to_bbox(source):
@@ -84,37 +84,36 @@ def _triangle_to_bbox(source):
            [source[1] + source[0] - source[2]],
            [3 * source[0] - source[1] - source[2]],
            [source[2] - source[1] + source[0]],
        ].reshape(-1, 2)
        return bbox

    @staticmethod
    def _im_normalize(img):
        return np.ascontiguousarray(
            2 * ((img / 255) - 0.5
                 ).astype('float32'))

    @staticmethod
    def _sigm(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def _pad1(x):
        return np.pad(x, ((0, 0), (0, 1)), constant_values=1, mode='constant')

    def predict_joints(self, img_norm):
        self.interp_joint.set_tensor(
            self.in_idx_joint, img_norm.reshape(1, 256, 256, 3))
        self.interp_joint.invoke()

        joints = self.interp_joint.get_tensor(self.out_idx_joint)
        return joints.reshape(-1, 2)

    def detect_hand(self, img_norm, index=0):
        assert -1 <= img_norm.min() and img_norm.max() <= 1, \
            "img_norm should be in range [-1, 1]"
        assert img_norm.shape == (256, 256, 3), \
            "img_norm shape must be (256, 256, 3)"

        # predict hand location and 7 initial landmarks
        self.interp_palm.set_tensor(self.in_idx, img_norm[None])
@@ -130,7 +129,7 @@ def detect_hand(self, img_norm):
        out_clf shape is [number of anchors]
        it is the classification score if there is a hand for each anchor box
        """
        out_clf = self.interp_palm.get_tensor(self.out_clf_idx)[0, :, 0]

        # finding the best prediction
        probabilities = self._sigm(out_clf)
@@ -150,16 +149,17 @@
        box_ids = non_max_suppression_fast(moved_candidate_detect[:, :4], probabilities)

        # pick the detected hand at the requested index (0 = most confident);
        # bail out if fewer hands than index + 1 were found
        # print(box_ids1)
        if len(box_ids) > index:
            box_ids1 = box_ids[index]
        else:
            return None, None, None

        # bounding box offsets, width and height
        dx, dy, w, h = candidate_detect[box_ids1, :4]
        center_wo_offst = candidate_anchors[box_ids1, :2] * 256

        # 7 initial keypoints
        keypoints = center_wo_offst + candidate_detect[box_ids1, 4:].reshape(-1, 2)
        side = max(w, h) * self.box_enlarge

        # now we need to move and rotate the detected hand for it to occupy a
        # 256x256 square
        # line from wrist keypoint to middle finger keypoint
@@ -171,7 +171,7 @@ def detect_hand(self, img_norm):
        debug_info = {
            "detection_candidates": candidate_detect,
            "anchor_candidates": candidate_anchors,
            "selected_box_id": box_ids1,
        }

        return source, keypoints, debug_info
@@ -182,46 +182,53 @@ def preprocess_img(self, img):
        pad = (shape.max() - shape[:2]).astype('uint32') // 2
        img_pad = np.pad(
            img,
            ((pad[0], pad[0]), (pad[1], pad[1]), (0, 0)),
            mode='constant')
        img_small = cv2.resize(img_pad, (256, 256))
        img_small = np.ascontiguousarray(img_small)

        img_norm = self._im_normalize(img_small)
        return img_pad, img_norm, pad

    def generate_orig(self, scale, source, img_pad, img_norm, pad):
        Mtr = cv2.getAffineTransform(
            source * scale,
            self._target_triangle
        )

        img_landmark = cv2.warpAffine(
            self._im_normalize(img_pad), Mtr, (256, 256)
        )

        joints = self.predict_joints(img_landmark)

        # adding the [0,0,1] row to make the matrix square
        Mtr = self._pad1(Mtr.T).T
        Mtr[2, :2] = 0

        Minv = np.linalg.inv(Mtr)

        # projecting keypoints back into original image coordinate space
        kp_orig = (self._pad1(joints) @ Minv.T)[:, :2]
        box_orig = (self._target_box @ Minv.T)[:, :2]
        kp_orig -= pad[::-1]
        box_orig -= pad[::-1]

        return kp_orig, box_orig

    def __call__(self, img):
        img_pad, img_norm, pad = self.preprocess_img(img)
        source, keypoints, _ = self.detect_hand(img_norm, 0)  # Hand one
        if source is None:
            return None, None, None, None
        # calculating transformation from img_pad coords
        # to img_landmark coords (cropped hand image)
        scale = max(img.shape) / 256
        kp_orig, box_orig = self.generate_orig(scale, source, img_pad, img_norm, pad)

        source2, keypoints2, _ = self.detect_hand(img_norm, 1)  # Hand two
        if source2 is None:
            return kp_orig, None, box_orig, None
        kp_orig2, box_orig2 = self.generate_orig(scale, source2, img_pad, img_norm, pad)

        return kp_orig, kp_orig2, box_orig, box_orig2
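The two-hand limit lives only in __call__, which calls detect_hand with the hard-coded indices 0 and 1; detect_hand itself accepts any index into the non-max-suppression results. A sketch of how the same pieces could be looped for an arbitrary number of hands (detect_hands and max_hands are assumptions, not part of this PR; the method would sit on HandTracker):

def detect_hands(self, img, max_hands=2):
    # hypothetical generalisation: return a list of (keypoints, box) pairs
    img_pad, img_norm, pad = self.preprocess_img(img)
    scale = max(img.shape) / 256
    hands = []
    for index in range(max_hands):
        source, _, _ = self.detect_hand(img_norm, index)
        if source is None:
            break  # detect_hand returns (None, None, None) once the NMS list runs out
        hands.append(self.generate_orig(scale, source, img_pad, img_norm, pad))
    return hands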