From 3fd2bf94bbf34b7f62dc49a3a5ad3cba6afab9f0 Mon Sep 17 00:00:00 2001
From: Carson Swope <carsonmswope@gmail.com>
Date: Fri, 10 Sep 2021 15:14:48 -0600
Subject: [PATCH] allow evaluating forest at lower resolution for faster
 framerate

---
 src/3d_bz.py            | 34 +++++++++++++++++++++-------------
 src/cuda/tree_eval.cu   | 32 ++++++++++++++++++--------------
 src/decision_tree.py    | 40 +++++++++++++++++++++++++---------------
 src/run_live_layered.py | 33 ++++++++++++++++++++++++++-------
 4 files changed, 90 insertions(+), 49 deletions(-)

diff --git a/src/3d_bz.py b/src/3d_bz.py
index 6045698..59846f5 100644
--- a/src/3d_bz.py
+++ b/src/3d_bz.py
@@ -49,6 +49,8 @@ def __init__(self):
 
         self.midi = Midi()
 
+        self.LABELS_REDUCE = 2
+
         self.NO_DEBUG = args.no_debug
 
         self.NUM_RANDOM_GUESSES = args.plane_num_iterations or 25000
@@ -67,7 +69,7 @@ def __init__(self):
 
         print('loading forest')
 
-        self.layered_rdf = LayeredDecisionForest.load(args.cfg, (480, 848))
+        self.layered_rdf = LayeredDecisionForest.load(args.cfg, (480, 848), labels_reduce = self.LABELS_REDUCE)
 
         self.points_ops = PointsOps()
 
@@ -94,11 +96,15 @@ def __init__(self):
         self.coord_croups_cpu = np.zeros((self.depth_mm_dims[0] * self.depth_mm_dims[1], 3), dtype=np.int32)
         self.coord_groups_gpu = GpuBuffer((self.depth_mm_dims[0] * self.depth_mm_dims[1], 3), dtype=np.int32)
 
-        self.labels_image = GpuBuffer((self.DIM_Y, self.DIM_X), dtype=np.uint16)
-        self.labels_image_2 = GpuBuffer((self.DIM_Y, self.DIM_X), dtype=np.uint16)
-        self.labels_image_rgba_cpu = np.zeros((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8)
-        self.labels_image_rgba = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8)
-        self.labels_image_rgba_tex = GpuTexture((self.DIM_X, self.DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE))
+        self.LABELS_DIM_X = self.DIM_X // self.LABELS_REDUCE
+        self.LABELS_DIM_Y = self.DIM_Y // self.LABELS_REDUCE
+        self.LABELS_DIM = (self.LABELS_DIM_Y, self.LABELS_DIM_X)
+
+        self.labels_image = GpuBuffer(self.LABELS_DIM, dtype=np.uint16)
+        self.labels_image_2 = GpuBuffer(self.LABELS_DIM, dtype=np.uint16)
+        self.labels_image_rgba_cpu = np.zeros(self.LABELS_DIM + (4,), dtype=np.uint8)
+        self.labels_image_rgba = GpuBuffer(self.LABELS_DIM + (4,), dtype=np.uint8)
+        self.labels_image_rgba_tex = GpuTexture((self.LABELS_DIM_X, self.LABELS_DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE))
 
         mean_shift_variances = np.array(
             [100., 50., 50., 50., 50., 50., 50.],
@@ -296,7 +302,7 @@ def tick(self, _):
         window_pad = 24 * self.dpi_scale
 
         imgui.push_style_var(imgui.STYLE_WINDOW_PADDING, (window_pad, window_pad))
-        imgui.set_next_window_position(0, 0)#self.DIM_Y * self.dpi_scale)
+        imgui.set_next_window_position(0, 0)
         imgui.set_next_window_size(self.width * self.dpi_scale, 220 * self.dpi_scale)
         imgui.set_next_window_bg_alpha(0.3)
         imgui.begin('Hand state', flags= imgui.WINDOW_NO_MOVE | imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_SCROLLBAR)
@@ -422,20 +428,20 @@ def run_per_hand_pipeline(self, g_id, flip_x):
         if flip_x:
             self.labels_image_2.cu().set(self.labels_image.cu())
             self.points_ops.flip_x(
-                np.array([self.DIM_X, self.DIM_Y], dtype=np.int32),
+                np.array([self.LABELS_DIM_X, self.LABELS_DIM_Y], dtype=np.int32),
                 self.labels_image_2.cu(),
                 self.labels_image.cu(),
-                grid=make_grid((self.DIM_X, self.DIM_Y, 1), (32, 32, 1)),
+                grid=make_grid((self.LABELS_DIM_X, self.LABELS_DIM_Y, 1), (32, 32, 1)),
                 block=(32, 32, 1))
 
         self.points_ops.make_rgba_from_labels(
-            np.uint32(self.DIM_X),
-            np.uint32(self.DIM_Y),
+            np.uint32(self.LABELS_DIM_X),
+            np.uint32(self.LABELS_DIM_Y),
             np.uint32(self.layered_rdf.num_layered_classes),
             self.labels_image.cu(),
             self.layered_rdf.label_colors.cu(),
             self.labels_image_rgba.cu(),
-            grid = ((self.DIM_X // 32) + 1, (self.DIM_Y // 32) + 1, 1),
+            grid = ((self.LABELS_DIM_X // 32) + 1, (self.LABELS_DIM_Y // 32) + 1, 1),
             block = (32,32,1))
 
         # self.cu_ctx.synchronize()
@@ -443,7 +449,7 @@ def run_per_hand_pipeline(self, g_id, flip_x):
 
         label_means = self.mean_shift.run(
             self.mean_shift_rounds,
-            self.labels_image.cu().reshape((1, self.DIM_Y, self.DIM_X)),
+            self.labels_image.cu().reshape((1, self.LABELS_DIM_Y, self.LABELS_DIM_X)),
             self.layered_rdf.num_layered_classes,
             self.mean_shift_variances)
 
@@ -486,6 +492,8 @@ def run_per_hand_pipeline(self, g_id, flip_x):
         for i, f_idx in zip(range(len(self.fingertip_idxes)), self.fingertip_idxes):
 
             px, py = label_means[f_idx-1].astype(np.int32)
+            px *= self.LABELS_REDUCE
+            py *= self.LABELS_REDUCE
             if px < 0 or py < 0 or px >= self.DIM_X or py >= self.DIM_Y:
                 hand_state.fingertips[i].reset_positions()
             else:
diff --git a/src/cuda/tree_eval.cu b/src/cuda/tree_eval.cu
index f6adabd..3dc0bdd 100644
--- a/src/cuda/tree_eval.cu
+++ b/src/cuda/tree_eval.cu
@@ -25,8 +25,8 @@ extern "C" {__global__
     void evaluate_image_using_forest(
         int NUM_TREES,
         int NUM_IMAGES,
-        int IMG_DIM_X,
-        int IMG_DIM_Y,
+        int depth_dim_x,
+        int depth_dim_y,
         int NUM_CLASSES,
         int MAX_TREE_DEPTH,
         int BLOCK_DIM_X,
@@ -34,14 +34,15 @@ extern "C" {__global__
         int filter_class,
         uint16* _filter,
         float* _forest,
-        uint16* _labels_out)
+        uint16* _labels_out,
+        int labels_reduce)
 {
 
     extern __shared__ float _thread_pdf[];
     Array2d<float> thread_pdf(_thread_pdf, {BLOCK_DIM_X, NUM_CLASSES});
 
-    const int2 IMG_DIM{IMG_DIM_X, IMG_DIM_Y};
-    const int TOTAL_NUM_PIXELS = NUM_IMAGES * IMG_DIM.x * IMG_DIM.y;
+    const int2 labels_img_dim{depth_dim_x / labels_reduce, depth_dim_y / labels_reduce};
+    const int TOTAL_NUM_PIXELS = NUM_IMAGES * labels_img_dim.x * labels_img_dim.y;
     const int TREE_NODE_ELS = 7 + NUM_CLASSES + NUM_CLASSES; // (ux,uy,vx,vy,thresh,l_next,r_next,{l_pdf},{r_pdf})
 
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -59,13 +60,16 @@ extern "C" {__global__
 
     __syncthreads();
 
-    const int img_idx = i / (IMG_DIM.x * IMG_DIM.y);
-    const int i_rem = i % (IMG_DIM.x * IMG_DIM.y);
-    const int img_y = i_rem / IMG_DIM.x;
-    const int img_x = i_rem % IMG_DIM.x;
+    const int img_idx = i / (labels_img_dim.x * labels_img_dim.y);
+    const int i_rem = i % (labels_img_dim.x * labels_img_dim.y);
+    const int img_y = i_rem / labels_img_dim.x;
+    const int img_x = i_rem % labels_img_dim.x;
 
-    Array3d<uint16> img_in(_img_in, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X}, MAX_UINT16);
-    Array3d<uint16> labels_out(_labels_out, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X});
+    const int depth_img_y = img_y * labels_reduce;
+    const int depth_img_x = img_x * labels_reduce;
+
+    Array3d<uint16> img_in(_img_in, {NUM_IMAGES,depth_dim_y,depth_dim_x}, MAX_UINT16);
+    Array3d<uint16> labels_out(_labels_out, {NUM_IMAGES,labels_img_dim.y,labels_img_dim.x});
 
     const int TOTAL_TREE_NODES = (1 << MAX_TREE_DEPTH) - 1;
 
@@ -74,13 +78,13 @@ extern "C" {__global__
 
     // Don't try to evaluate if filtering by a filter image!
     if (filter_class != -1) {
-        Array3d<uint16> filter(_filter, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X}, MAX_UINT16);
+        Array3d<uint16> filter(_filter, {NUM_IMAGES,labels_img_dim.y,labels_img_dim.x}, MAX_UINT16);
         const uint16 img_label = filter.get({img_idx, img_y, img_x});
         if ((int)img_label != filter_class) { return; }
     }
 
     // Don't try to evaluate if img in has 0 value!
-    const uint16 img_d = img_in.get({img_idx, img_y, img_x});
+    const uint16 img_d = img_in.get({img_idx, depth_img_y, depth_img_x});
     if (img_d == 0 || img_d == MAX_UINT16) { return; } // max uint16 is also considered 'pixel not present'
 
     // current node ID
@@ -96,7 +100,7 @@ extern "C" {__global__
         const int l_next = __float2int_rd(d_ptr[5]);
         const int r_next = __float2int_rd(d_ptr[6]);
 
-        const float f = compute_feature(img_in, img_idx, int2{img_x, img_y}, u, v);
+        const float f = compute_feature(img_in, img_idx, int2{depth_img_x, depth_img_y}, u, v);
         float* final_pdf = nullptr;
 
         if (f < thresh) {
diff --git a/src/decision_tree.py b/src/decision_tree.py
index b95d971..75036f8 100644
--- a/src/decision_tree.py
+++ b/src/decision_tree.py
@@ -170,16 +170,21 @@ def __init__(self, num_trees, max_depth, num_classes):
 # comes with gpu memory 
 class LayeredDecisionForest():
     @staticmethod
-    def load(config_filename, eval_dims):
+    def load(config_filename, depth_dims, labels_reduce=1):
         cfg = json.loads(open(config_filename).read())
         # models are loaded 1-by-1 from paths with parent directory as a root
         cfg['root'] = os.path.join(*Path(config_filename).parts[0:-1])
-        return LayeredDecisionForest(cfg, eval_dims)
+        return LayeredDecisionForest(cfg, depth_dims, labels_reduce)
 
-    def __init__(self, cfg, eval_dims):
+    def __init__(self, cfg, depth_dims, labels_reduce):
 
         self.eval = DecisionTreeEvaluator()
-        self.eval_dims = eval_dims # y,x !!
+
+        self.depth_dims = depth_dims # y,x !!
+
+        self.labels_reduce = labels_reduce
+        self.labels_dims = (depth_dims[0] // labels_reduce, depth_dims[1] // labels_reduce)
+
         self.m = []
         for l in cfg['layers']:
             # model path is relative to config file itself
@@ -195,7 +200,7 @@ def __init__(self, cfg, eval_dims):
 
         self.num_models = len(self.m)
 
-        self.label_images = [GpuBuffer(eval_dims, dtype=np.uint16) for _ in range(self.num_models)]
+        self.label_images = [GpuBuffer(self.labels_dims, dtype=np.uint16) for _ in range(self.num_models)]
 
         self.labels_images_ptrs_cu = GpuBuffer((self.num_models,), dtype=np.int64)
         label_images_ptrs = np.array([i.cu().__cuda_array_interface__['data'][0] for i in self.label_images], dtype=np.int64)
@@ -235,7 +240,8 @@ def run(self, depth_image, labels_image):
             i.cu().fill(MAX_UINT16)
 
         # first dim: image id. only one image!
-        dims = (1,) + self.eval_dims
+        depth_img_dims = (1,) + self.depth_dims
+        label_img_dims = (1,) + self.labels_dims
 
         for i in range(self.num_models):
             m, filter_model, filter_model_class = self.m[i]
@@ -243,17 +249,18 @@ def run(self, depth_image, labels_image):
 
             self.eval.get_labels_forest(
                 m,
-                depth_image.cu().reshape(dims),
-                single_labels_image.cu().reshape(dims),
-                filter_images=self.label_images[filter_model].cu().reshape(dims) if (filter_model is not None) else None,
+                depth_image.cu().reshape(depth_img_dims),
+                single_labels_image.cu().reshape(label_img_dims),
+                labels_reduce=self.labels_reduce,
+                filter_images=self.label_images[filter_model].cu().reshape(label_img_dims) if (filter_model is not None) else None,
                 filter_images_class=filter_model_class)
 
         self.eval.make_composite_labels_image(
             self.labels_images_ptrs_cu.cu(),
-            self.eval_dims[1],
-            self.eval_dims[0],
+            self.labels_dims[1],
+            self.labels_dims[0],
             self.labels_conditions_cu.cu(),
-            labels_image.cu().reshape(dims))
+            labels_image.cu().reshape(label_img_dims))
 
     # def eval()
 class DecisionTreeEvaluator():
@@ -287,14 +294,16 @@ def get_labels(self, tree, depth_images_in, labels_out):
 
         
     # TODO: support filter image for single tree forest! or not??
-    def get_labels_forest(self, forest, depth_images_in, labels_out, filter_images=None, filter_images_class=None):
+    def get_labels_forest(self, forest, depth_images_in, labels_out, labels_reduce = 1, filter_images=None, filter_images_class=None):
         num_images, dim_y, dim_x = depth_images_in.shape
 
+        assert labels_out.shape == (num_images, dim_y // labels_reduce, dim_x // labels_reduce)
+
         if filter_images is not None:
             assert filter_images_class is not None
-            assert filter_images.shape == depth_images_in.shape
+            assert filter_images.shape == labels_out.shape
 
-        num_test_pixels = num_images * dim_y * dim_x
+        num_test_pixels = num_images * (dim_y // labels_reduce) * (dim_x // labels_reduce)
 
         BLOCK_DIM_X = int(MAX_THREADS_PER_BLOCK // forest.num_trees) 
         grid_dim = (int(num_test_pixels // BLOCK_DIM_X) + 1, 1, 1)
@@ -315,6 +324,7 @@ def get_labels_forest(self, forest, depth_images_in, labels_out, filter_images=N
             f_img,
             forest.forest_cu,
             labels_out,
+            np.int32(labels_reduce),
             grid=grid_dim, block=block_dim, shared=(BLOCK_DIM_X * forest.num_classes * 4)) # sizeof(float), right?
 
 
diff --git a/src/run_live_layered.py b/src/run_live_layered.py
index c0f7507..d82f572 100644
--- a/src/run_live_layered.py
+++ b/src/run_live_layered.py
@@ -16,7 +16,7 @@
 
 class RunLive_Layered(AppBase):
     def __init__(self):
-        super().__init__(title="Layered RDF Demo")
+        super().__init__(title="Layered RDF Demo", width=848, height=800)
 
         parser = argparse.ArgumentParser(description='Train a classifier RDF for depth images')
         parser.add_argument('-cfg', nargs='?', required=True, type=str, help='Path to the layered decision forest config file')
@@ -35,16 +35,20 @@ def __init__(self):
 
         self.pipeline, self.depth_intrin, self.DIM_X, self.DIM_Y, self.FOCAL, self.PP = rs_util.start_stream(args)
 
-        self.layered_rdf = LayeredDecisionForest.load(args.cfg, (self.DIM_Y, self.DIM_X))
+        self.LABELS_REDUCE = 2
+
+        self.layered_rdf = LayeredDecisionForest.load(args.cfg, (self.DIM_Y, self.DIM_X), self.LABELS_REDUCE)
         self.points_ops = PointsOps()
 
         self.pts = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.float32)
 
         self.depth_image = GpuBuffer((1, self.DIM_Y, self.DIM_X), np.uint16)
-        self.labels_image = GpuBuffer((1, self.DIM_Y, self.DIM_X), dtype=np.uint16)
 
-        self.labels_image_rgba = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8)
-        self.labels_image_rgba_tex = GpuTexture((self.DIM_X, self.DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE))
+
+        self.labels_image = GpuBuffer((1, self.DIM_Y // self.LABELS_REDUCE, self.DIM_X // self.LABELS_REDUCE), dtype=np.uint16)
+
+        self.labels_image_rgba = GpuBuffer((self.DIM_Y // self.LABELS_REDUCE, self.DIM_X // self.LABELS_REDUCE, 4), dtype=np.uint8)
+        self.labels_image_rgba_tex = GpuTexture((self.DIM_X // self.LABELS_REDUCE, self.DIM_Y // self.LABELS_REDUCE), (GL_RGBA, GL_UNSIGNED_BYTE))
 
         self.frame_num = 0
 
@@ -123,8 +127,8 @@ def tick(self, t):
         # make RGBA image
         self.labels_image_rgba.cu().fill(0)
         self.points_ops.make_rgba_from_labels(
-            np.uint32(self.DIM_X),
-            np.uint32(self.DIM_Y),
+            np.uint32(self.DIM_X // self.LABELS_REDUCE),
+            np.uint32(self.DIM_Y // self.LABELS_REDUCE),
             np.uint32(self.layered_rdf.num_layered_classes),
             self.labels_image.cu(),
             self.layered_rdf.label_colors.cu(),
@@ -135,7 +139,22 @@ def tick(self, t):
 
         self.frame_num += 1
 
+        self.begin_imgui_main()
         imgui.image(self.labels_image_rgba_tex.gl(), self.DIM_X * self.dpi_scale, self.DIM_Y * self.dpi_scale)
+        imgui.end()
+
+        imgui.set_next_window_size(200 * self.dpi_scale, 124 * self.dpi_scale)  # profile overlay window, DPI-scaled
+        imgui.set_next_window_bg_alpha(0.3)
+        imgui.begin('profile', flags=imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_SCROLLBAR)  # flags by keyword: 2nd positional arg of begin() is `closable`
+        profile_plot_width = 150 * self.dpi_scale
+        profile_plot_height = 60 * self.dpi_scale
+        imgui.text(f'ms/frame: {self.ms_per_frame_log[-1]:.1f}')  # most recent frame time
+        imgui.plot_lines('##ms-frame',
+            np.array(self.ms_per_frame_log, dtype=np.float32),
+            scale_max=100.,
+            scale_min=0.,
+            graph_size=(profile_plot_width, profile_plot_height))
+        imgui.end()
 
 if __name__ == '__main__':
     run_app(RunLive_Layered)