From 3fd2bf94bbf34b7f62dc49a3a5ad3cba6afab9f0 Mon Sep 17 00:00:00 2001 From: Carson Swope Date: Fri, 10 Sep 2021 15:14:48 -0600 Subject: [PATCH] allow evaluating forest at lower resolution for faster framerate --- src/3d_bz.py | 34 +++++++++++++++++++++------------- src/cuda/tree_eval.cu | 32 ++++++++++++++++++-------------- src/decision_tree.py | 40 +++++++++++++++++++++++++--------------- src/run_live_layered.py | 33 ++++++++++++++++++++++++++------- 4 files changed, 90 insertions(+), 49 deletions(-) diff --git a/src/3d_bz.py b/src/3d_bz.py index 6045698..59846f5 100644 --- a/src/3d_bz.py +++ b/src/3d_bz.py @@ -49,6 +49,8 @@ def __init__(self): self.midi = Midi() + self.LABELS_REDUCE = 2 + self.NO_DEBUG = args.no_debug self.NUM_RANDOM_GUESSES = args.plane_num_iterations or 25000 @@ -67,7 +69,7 @@ def __init__(self): print('loading forest') - self.layered_rdf = LayeredDecisionForest.load(args.cfg, (480, 848)) + self.layered_rdf = LayeredDecisionForest.load(args.cfg, (480, 848), labels_reduce = self.LABELS_REDUCE) self.points_ops = PointsOps() @@ -94,11 +96,15 @@ def __init__(self): self.coord_croups_cpu = np.zeros((self.depth_mm_dims[0] * self.depth_mm_dims[1], 3), dtype=np.int32) self.coord_groups_gpu = GpuBuffer((self.depth_mm_dims[0] * self.depth_mm_dims[1], 3), dtype=np.int32) - self.labels_image = GpuBuffer((self.DIM_Y, self.DIM_X), dtype=np.uint16) - self.labels_image_2 = GpuBuffer((self.DIM_Y, self.DIM_X), dtype=np.uint16) - self.labels_image_rgba_cpu = np.zeros((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8) - self.labels_image_rgba = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8) - self.labels_image_rgba_tex = GpuTexture((self.DIM_X, self.DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE)) + self.LABELS_DIM_X = self.DIM_X // self.LABELS_REDUCE + self.LABELS_DIM_Y = self.DIM_Y // self.LABELS_REDUCE + self.LABELS_DIM = (self.LABELS_DIM_Y, self.LABELS_DIM_X) + + self.labels_image = GpuBuffer(self.LABELS_DIM, dtype=np.uint16) + self.labels_image_2 = GpuBuffer(self.LABELS_DIM, dtype=np.uint16) + self.labels_image_rgba_cpu = np.zeros(self.LABELS_DIM + (4,), dtype=np.uint8) + self.labels_image_rgba = GpuBuffer(self.LABELS_DIM + (4,), dtype=np.uint8) + self.labels_image_rgba_tex = GpuTexture((self.LABELS_DIM_X, self.LABELS_DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE)) mean_shift_variances = np.array( [100., 50., 50., 50., 50., 50., 50.], @@ -296,7 +302,7 @@ def tick(self, _): window_pad = 24 * self.dpi_scale imgui.push_style_var(imgui.STYLE_WINDOW_PADDING, (window_pad, window_pad)) - imgui.set_next_window_position(0, 0)#self.DIM_Y * self.dpi_scale) + imgui.set_next_window_position(0, 0) imgui.set_next_window_size(self.width * self.dpi_scale, 220 * self.dpi_scale) imgui.set_next_window_bg_alpha(0.3) imgui.begin('Hand state', flags= imgui.WINDOW_NO_MOVE | imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_SCROLLBAR) @@ -422,20 +428,20 @@ def run_per_hand_pipeline(self, g_id, flip_x): if flip_x: self.labels_image_2.cu().set(self.labels_image.cu()) self.points_ops.flip_x( - np.array([self.DIM_X, self.DIM_Y], dtype=np.int32), + np.array([self.LABELS_DIM_X, self.LABELS_DIM_Y], dtype=np.int32), self.labels_image_2.cu(), self.labels_image.cu(), - grid=make_grid((self.DIM_X, self.DIM_Y, 1), (32, 32, 1)), + grid=make_grid((self.LABELS_DIM_X, self.LABELS_DIM_Y, 1), (32, 32, 1)), block=(32, 32, 1)) self.points_ops.make_rgba_from_labels( - np.uint32(self.DIM_X), - np.uint32(self.DIM_Y), + np.uint32(self.LABELS_DIM_X), + np.uint32(self.LABELS_DIM_Y), np.uint32(self.layered_rdf.num_layered_classes), self.labels_image.cu(), self.layered_rdf.label_colors.cu(), self.labels_image_rgba.cu(), - grid = ((self.DIM_X // 32) + 1, (self.DIM_Y // 32) + 1, 1), + grid = ((self.LABELS_DIM_X // 32) + 1, (self.LABELS_DIM_Y // 32) + 1, 1), block = (32,32,1)) # self.cu_ctx.synchronize() @@ -443,7 +449,7 @@ def run_per_hand_pipeline(self, g_id, flip_x): label_means = self.mean_shift.run( self.mean_shift_rounds, - self.labels_image.cu().reshape((1, self.DIM_Y, self.DIM_X)), + self.labels_image.cu().reshape((1, self.LABELS_DIM_Y, self.LABELS_DIM_X)), self.layered_rdf.num_layered_classes, self.mean_shift_variances) @@ -486,6 +492,8 @@ def run_per_hand_pipeline(self, g_id, flip_x): for i, f_idx in zip(range(len(self.fingertip_idxes)), self.fingertip_idxes): px, py = label_means[f_idx-1].astype(np.int32) + px *= self.LABELS_REDUCE + py *= self.LABELS_REDUCE if px < 0 or py < 0 or px >= self.DIM_X or py >= self.DIM_Y: hand_state.fingertips[i].reset_positions() else: diff --git a/src/cuda/tree_eval.cu b/src/cuda/tree_eval.cu index f6adabd..3dc0bdd 100644 --- a/src/cuda/tree_eval.cu +++ b/src/cuda/tree_eval.cu @@ -25,8 +25,8 @@ extern "C" {__global__ void evaluate_image_using_forest( int NUM_TREES, int NUM_IMAGES, - int IMG_DIM_X, - int IMG_DIM_Y, + int depth_dim_x, + int depth_dim_y, int NUM_CLASSES, int MAX_TREE_DEPTH, int BLOCK_DIM_X, @@ -34,14 +34,15 @@ extern "C" {__global__ int filter_class, uint16* _filter, float* _forest, - uint16* _labels_out) + uint16* _labels_out, + int labels_reduce) { extern __shared__ float _thread_pdf[]; Array2d thread_pdf(_thread_pdf, {BLOCK_DIM_X, NUM_CLASSES}); - const int2 IMG_DIM{IMG_DIM_X, IMG_DIM_Y}; - const int TOTAL_NUM_PIXELS = NUM_IMAGES * IMG_DIM.x * IMG_DIM.y; + const int2 labels_img_dim{depth_dim_x / labels_reduce, depth_dim_y / labels_reduce}; + const int TOTAL_NUM_PIXELS = NUM_IMAGES * labels_img_dim.x * labels_img_dim.y; const int TREE_NODE_ELS = 7 + NUM_CLASSES + NUM_CLASSES; // (ux,uy,vx,vy,thresh,l_next,r_next,{l_pdf},{r_pdf}) const int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -59,13 +60,16 @@ extern "C" {__global__ __syncthreads(); - const int img_idx = i / (IMG_DIM.x * IMG_DIM.y); - const int i_rem = i % (IMG_DIM.x * IMG_DIM.y); - const int img_y = i_rem / IMG_DIM.x; - const int img_x = i_rem % IMG_DIM.x; + const int img_idx = i / (labels_img_dim.x * labels_img_dim.y); + const int i_rem = i % (labels_img_dim.x * labels_img_dim.y); + const int img_y = i_rem / labels_img_dim.x; + const int img_x = i_rem % labels_img_dim.x; - Array3d img_in(_img_in, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X}, MAX_UINT16); - Array3d labels_out(_labels_out, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X}); + const int depth_img_y = img_y * labels_reduce; + const int depth_img_x = img_x * labels_reduce; + + Array3d img_in(_img_in, {NUM_IMAGES,depth_dim_y,depth_dim_x}, MAX_UINT16); + Array3d labels_out(_labels_out, {NUM_IMAGES,labels_img_dim.y,labels_img_dim.x}); const int TOTAL_TREE_NODES = (1 << MAX_TREE_DEPTH) - 1; @@ -74,13 +78,13 @@ extern "C" {__global__ // Don't try to evaluate if filtering by a filter image! if (filter_class != -1) { - Array3d filter(_filter, {NUM_IMAGES,IMG_DIM_Y,IMG_DIM_X}, MAX_UINT16); + Array3d filter(_filter, {NUM_IMAGES,labels_img_dim.y,labels_img_dim.x}, MAX_UINT16); const uint16 img_label = filter.get({img_idx, img_y, img_x}); if ((int)img_label != filter_class) { return; } } // Don't try to evaluate if img in has 0 value! - const uint16 img_d = img_in.get({img_idx, img_y, img_x}); + const uint16 img_d = img_in.get({img_idx, depth_img_y, depth_img_x}); if (img_d == 0 || img_d == MAX_UINT16) { return; } // max uint16 is also considered 'pixel not present' // current node ID @@ -96,7 +100,7 @@ extern "C" {__global__ const int l_next = __float2int_rd(d_ptr[5]); const int r_next = __float2int_rd(d_ptr[6]); - const float f = compute_feature(img_in, img_idx, int2{img_x, img_y}, u, v); + const float f = compute_feature(img_in, img_idx, int2{depth_img_x, depth_img_y}, u, v); float* final_pdf = nullptr; if (f < thresh) { diff --git a/src/decision_tree.py b/src/decision_tree.py index b95d971..75036f8 100644 --- a/src/decision_tree.py +++ b/src/decision_tree.py @@ -170,16 +170,21 @@ def __init__(self, num_trees, max_depth, num_classes): # comes with gpu memory class LayeredDecisionForest(): @staticmethod - def load(config_filename, eval_dims): + def load(config_filename, depth_dims, labels_reduce=1): cfg = json.loads(open(config_filename).read()) # models are loaded 1-by-1 from paths with parent directory as a root cfg['root'] = os.path.join(*Path(config_filename).parts[0:-1]) - return LayeredDecisionForest(cfg, eval_dims) + return LayeredDecisionForest(cfg, depth_dims, labels_reduce) - def __init__(self, cfg, eval_dims): + def __init__(self, cfg, depth_dims, labels_reduce): self.eval = DecisionTreeEvaluator() - self.eval_dims = eval_dims # y,x !! + + self.depth_dims = depth_dims # y,x !! + + self.labels_reduce = labels_reduce + self.labels_dims = (depth_dims[0] // labels_reduce, depth_dims[1] // labels_reduce) + self.m = [] for l in cfg['layers']: # model path is relative to config file itself @@ -195,7 +200,7 @@ def __init__(self, cfg, eval_dims): self.num_models = len(self.m) - self.label_images = [GpuBuffer(eval_dims, dtype=np.uint16) for _ in range(self.num_models)] + self.label_images = [GpuBuffer(self.labels_dims, dtype=np.uint16) for _ in range(self.num_models)] self.labels_images_ptrs_cu = GpuBuffer((self.num_models,), dtype=np.int64) label_images_ptrs = np.array([i.cu().__cuda_array_interface__['data'][0] for i in self.label_images], dtype=np.int64) @@ -235,7 +240,8 @@ def run(self, depth_image, labels_image): i.cu().fill(MAX_UINT16) # first dim: image id. only one image! - dims = (1,) + self.eval_dims + depth_img_dims = (1,) + self.depth_dims + label_img_dims = (1,) + self.labels_dims for i in range(self.num_models): m, filter_model, filter_model_class = self.m[i] @@ -243,17 +249,18 @@ def run(self, depth_image, labels_image): self.eval.get_labels_forest( m, - depth_image.cu().reshape(dims), - single_labels_image.cu().reshape(dims), - filter_images=self.label_images[filter_model].cu().reshape(dims) if (filter_model is not None) else None, + depth_image.cu().reshape(depth_img_dims), + single_labels_image.cu().reshape(label_img_dims), + labels_reduce=self.labels_reduce, + filter_images=self.label_images[filter_model].cu().reshape(label_img_dims) if (filter_model is not None) else None, filter_images_class=filter_model_class) self.eval.make_composite_labels_image( self.labels_images_ptrs_cu.cu(), - self.eval_dims[1], - self.eval_dims[0], + self.labels_dims[1], + self.labels_dims[0], self.labels_conditions_cu.cu(), - labels_image.cu().reshape(dims)) + labels_image.cu().reshape(label_img_dims)) # def eval() class DecisionTreeEvaluator(): @@ -287,14 +294,16 @@ def get_labels(self, tree, depth_images_in, labels_out): # TODO: support filter image for single tree forest! or not?? - def get_labels_forest(self, forest, depth_images_in, labels_out, filter_images=None, filter_images_class=None): + def get_labels_forest(self, forest, depth_images_in, labels_out, labels_reduce = 1, filter_images=None, filter_images_class=None): num_images, dim_y, dim_x = depth_images_in.shape + assert labels_out.shape == (num_images, dim_y // labels_reduce, dim_x // labels_reduce) + if filter_images is not None: assert filter_images_class is not None - assert filter_images.shape == depth_images_in.shape + assert filter_images.shape == labels_out.shape - num_test_pixels = num_images * dim_y * dim_x + num_test_pixels = num_images * (dim_y // labels_reduce) * (dim_x // labels_reduce) BLOCK_DIM_X = int(MAX_THREADS_PER_BLOCK // forest.num_trees) grid_dim = (int(num_test_pixels // BLOCK_DIM_X) + 1, 1, 1) @@ -315,6 +324,7 @@ def get_labels_forest(self, forest, depth_images_in, labels_out, filter_images=N f_img, forest.forest_cu, labels_out, + np.int32(labels_reduce), grid=grid_dim, block=block_dim, shared=(BLOCK_DIM_X * forest.num_classes * 4)) # sizeof(float), right? diff --git a/src/run_live_layered.py b/src/run_live_layered.py index c0f7507..d82f572 100644 --- a/src/run_live_layered.py +++ b/src/run_live_layered.py @@ -16,7 +16,7 @@ class RunLive_Layered(AppBase): def __init__(self): - super().__init__(title="Layered RDF Demo") + super().__init__(title="Layered RDF Demo", width=848, height=800) parser = argparse.ArgumentParser(description='Train a classifier RDF for depth images') parser.add_argument('-cfg', nargs='?', required=True, type=str, help='Path to the layered decision forest config file') @@ -35,16 +35,20 @@ def __init__(self): self.pipeline, self.depth_intrin, self.DIM_X, self.DIM_Y, self.FOCAL, self.PP = rs_util.start_stream(args) - self.layered_rdf = LayeredDecisionForest.load(args.cfg, (self.DIM_Y, self.DIM_X)) + self.LABELS_REDUCE = 2 + + self.layered_rdf = LayeredDecisionForest.load(args.cfg, (self.DIM_Y, self.DIM_X), self.LABELS_REDUCE) self.points_ops = PointsOps() self.pts = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.float32) self.depth_image = GpuBuffer((1, self.DIM_Y, self.DIM_X), np.uint16) - self.labels_image = GpuBuffer((1, self.DIM_Y, self.DIM_X), dtype=np.uint16) - self.labels_image_rgba = GpuBuffer((self.DIM_Y, self.DIM_X, 4), dtype=np.uint8) - self.labels_image_rgba_tex = GpuTexture((self.DIM_X, self.DIM_Y), (GL_RGBA, GL_UNSIGNED_BYTE)) + + self.labels_image = GpuBuffer((1, self.DIM_Y // self.LABELS_REDUCE, self.DIM_X // self.LABELS_REDUCE), dtype=np.uint16) + + self.labels_image_rgba = GpuBuffer((self.DIM_Y // self.LABELS_REDUCE, self.DIM_X // self.LABELS_REDUCE, 4), dtype=np.uint8) + self.labels_image_rgba_tex = GpuTexture((self.DIM_X // self.LABELS_REDUCE, self.DIM_Y // self.LABELS_REDUCE), (GL_RGBA, GL_UNSIGNED_BYTE)) self.frame_num = 0 @@ -123,8 +127,8 @@ def tick(self, t): # make RGBA image self.labels_image_rgba.cu().fill(0) self.points_ops.make_rgba_from_labels( - np.uint32(self.DIM_X), - np.uint32(self.DIM_Y), + np.uint32(self.DIM_X // self.LABELS_REDUCE), + np.uint32(self.DIM_Y // self.LABELS_REDUCE), np.uint32(self.layered_rdf.num_layered_classes), self.labels_image.cu(), self.layered_rdf.label_colors.cu(), @@ -135,7 +139,22 @@ def tick(self, t): self.frame_num += 1 + self.begin_imgui_main() imgui.image(self.labels_image_rgba_tex.gl(), self.DIM_X * self.dpi_scale, self.DIM_Y * self.dpi_scale) + imgui.end() + + imgui.set_next_window_size(200 * self.dpi_scale, 124 * self.dpi_scale) + imgui.set_next_window_bg_alpha(0.3) + imgui.begin('profile', imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_SCROLLBAR) + profile_plot_width = 150 * self.dpi_scale + profile_plot_height = 60 * self.dpi_scale + imgui.text(f'ms/frame: {"{:.1f}".format(self.ms_per_frame_log[-1])}') + imgui.plot_lines('##ms-frame', + np.array(self.ms_per_frame_log, dtype=np.float32), + scale_max=100., + scale_min=0., + graph_size=(profile_plot_width , profile_plot_height)) + imgui.end() if __name__ == '__main__': run_app(RunLive_Layered)