FIL to import categorical models from treelite #4173

Merged
merged 157 commits into branch-21.10 from levsnv:categorical-treelite
Sep 29, 2021
Commits (157)
4df552a
a sketch
levsnv Jun 1, 2021
44277f6
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into HEAD
levsnv Jul 2, 2021
81f8415
added inference code
levsnv Jul 7, 2021
5d3d81e
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into HEAD
levsnv Jul 7, 2021
a70b5ef
missed some errors
levsnv Jul 8, 2021
0a03a3e
now compiles
levsnv Jul 8, 2021
472575d
stash
levsnv Jul 9, 2021
efe875c
Revert "stash"
levsnv Jul 9, 2021
38bb778
enums are more printable now
levsnv Jul 9, 2021
9cc2549
fixed some bugs; added debug prints
levsnv Jul 9, 2021
a14d364
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into categorical
levsnv Jul 9, 2021
f6e89e6
stack trace
levsnv Jul 12, 2021
a9c011d
implemented omp reduction for memory pool size estimation
levsnv Jul 9, 2021
daec42b
added allocation/copy code
levsnv Jul 13, 2021
361afaf
implemented filling up the bit pool with category masks
levsnv Jul 13, 2021
cf46cc0
fixed a type mismatch, added assert
levsnv Jul 13, 2021
c28fba3
drafting categorical node and set generation in test code
levsnv Jul 15, 2021
d89d082
fixed compiler but not runtime issues
levsnv Jul 15, 2021
67157b0
many things
levsnv Jul 16, 2021
64cb540
debugging
levsnv Jul 19, 2021
5608266
fixed some bugs
levsnv Jul 20, 2021
11a4c2e
...
levsnv Jul 20, 2021
d75d620
fixed some bugs
levsnv Jul 23, 2021
63aa434
added FIL categorical inference and test generation, no Treelite inte…
levsnv Jul 24, 2021
a4ca1c0
this was used to measure overhead when template parameter is irrelevant
levsnv Jul 24, 2021
6be93b5
reverted benchmark; started on <can_be_categorical> propagation
levsnv Jul 24, 2021
3559efc
moved <can_be_categorical> to an inference template parameter, as opp…
levsnv Jul 28, 2021
ef95861
fixed <branch_can_be_categorical> dispatching
levsnv Jul 28, 2021
45a642e
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into categori…
levsnv Jul 29, 2021
3b24ea2
addressed review comments
levsnv Aug 5, 2021
74fcbe1
Apply suggestions from code review
levsnv Aug 5, 2021
9604516
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Aug 5, 2021
e41928a
fix extra change, explain branch_can_be_categorical() logic
levsnv Aug 5, 2021
3c767b8
just outline the finalize() definitions
levsnv Aug 5, 2021
16a1d7d
add load_data(); still 1m30s to compile alone
levsnv Aug 5, 2021
bdfb955
noinline finalize methods, 18s to compile
levsnv Aug 5, 2021
1a4b114
style
levsnv Aug 5, 2021
e6c57ec
Merge branch 'uninline-simple' into categorical-no-import
levsnv Aug 6, 2021
14786d1
fixed uninitialized max_matching_cat_d
levsnv Aug 6, 2021
abbdaba
undid __noinline__
levsnv Aug 6, 2021
c3d89c4
undid __noinline__
levsnv Aug 6, 2021
20102f0
resolved merge conflicts, to restore lost code chunks
levsnv Aug 10, 2021
102de03
fixed all but a missing void base_node::print()
levsnv Aug 10, 2021
baccb84
added void base_node::print()
levsnv Aug 10, 2021
0beb302
addressed more review comments
levsnv Aug 11, 2021
b336bfb
Merge remote-tracking branch 'rapidsai/branch-21.10' into categorical…
levsnv Aug 11, 2021
7dcdfc0
...
levsnv Aug 11, 2021
0d36f5d
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Aug 12, 2021
58fdfe5
fixed the bug
levsnv Aug 13, 2021
7eba183
...
levsnv Aug 13, 2021
2b81a80
...
levsnv Aug 13, 2021
e618f38
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Aug 13, 2021
9ca5ef8
Merge branch 'categorical-no-import' of github.com:levsnv/cuml into c…
levsnv Aug 13, 2021
87967ef
Merge branch 'categorical-no-import' into categorical-treelite
levsnv Aug 13, 2021
6268187
fixed some compiler messsages
levsnv Aug 13, 2021
9701d37
...
levsnv Aug 13, 2021
300a32f
accessor()
levsnv Aug 13, 2021
fc3076a
Merge branch 'categorical-no-import' into categorical-treelite
levsnv Aug 13, 2021
265736e
fixed a couple of bugs - operator precedence and swap def_left on exp…
levsnv Aug 14, 2021
32fc638
fixed bugs related to treelite tests, commented extra prints
levsnv Aug 18, 2021
4e2f1aa
Treelite GTIL has different logic for margin vs two-sided probabiliti…
levsnv Aug 18, 2021
decab32
addressed some review comments
levsnv Aug 19, 2021
632fdea
Apply suggestions from code review
levsnv Aug 19, 2021
7d1cd5a
Merge branch 'categorical-no-import' of github.com:levsnv/cuml into c…
levsnv Aug 19, 2021
ad0cd15
addressed more review comments
levsnv Aug 19, 2021
1803761
some review comments
levsnv Aug 23, 2021
e3056ce
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 23, 2021
0f9a82a
some review comments
levsnv Aug 23, 2021
f993094
drafted child_index_test.cpp
levsnv Aug 25, 2021
625a5f3
test suite compiles
levsnv Aug 26, 2021
804870b
...
levsnv Aug 26, 2021
de9f2a1
...2
levsnv Aug 26, 2021
0b1421c
fixed unit tests, added to a proper set and removed prints
levsnv Aug 27, 2021
b5563d9
style
levsnv Aug 27, 2021
3b03967
Merge remote-tracking branch '10.2/categorical-no-import' into catego…
levsnv Aug 27, 2021
a42f107
Merge remote-tracking branch '10.2/categorical-no-import' into catego…
levsnv Aug 27, 2021
76a1a7b
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Aug 30, 2021
5f86db1
Merge
levsnv Aug 31, 2021
4d309dc
Merge remote-tracking branch '10.2/categorical-no-import' into catego…
levsnv Aug 31, 2021
5c782f2
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Aug 30, 2021
4d3a5bc
updated cat_sets_ to cat_sets_device_owner:: and removed allocate() a…
levsnv Aug 31, 2021
1db36c1
style
levsnv Aug 31, 2021
2e1adfc
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 31, 2021
92aa840
updated fil_test to use rmm::device_uvector
levsnv Aug 31, 2021
c37fafb
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 31, 2021
fa33236
fixed default construction of device_uvector
levsnv Aug 31, 2021
05a93e7
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 31, 2021
5c50ef6
style
levsnv Aug 31, 2021
132c134
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 31, 2021
3cc1151
fixed bug
levsnv Aug 31, 2021
a90f3de
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Aug 31, 2021
85772cc
addressed review comments
levsnv Sep 2, 2021
fc3ea09
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Sep 2, 2021
3b9ce00
addressed review comments
levsnv Sep 2, 2021
30056db
Merge branch 'categorical-no-import' of /home/ldolgovs/others-libs/ra…
levsnv Sep 2, 2021
b59bff5
noinline
levsnv Sep 2, 2021
f1fc5e9
re-fixed a bug?
levsnv Sep 3, 2021
031f0aa
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 3, 2021
5bf7755
add lightgbm tests
levsnv Sep 3, 2021
c15cdfe
tried optimizing by working on full matrices at once, ended up pessim…
levsnv Sep 3, 2021
e156851
now supporting intermixed numerical and categorical features
levsnv Sep 4, 2021
3a8d07d
merged into existing test
levsnv Sep 4, 2021
22c2f8b
cleaned up diff
levsnv Sep 4, 2021
d1be6c4
python style
levsnv Sep 4, 2021
48cafda
added cat_sets to model shape string
levsnv Sep 4, 2021
5366b64
removed prints, style
levsnv Sep 4, 2021
3d8a8ff
stray change
levsnv Sep 4, 2021
122f7d9
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 4, 2021
cec90ea
removed *_repr, gtil checks
levsnv Sep 4, 2021
c0d6bc6
removed unused variables and a debug getenv()
levsnv Sep 4, 2021
2cdf362
fix printing of max_matching -1
levsnv Sep 4, 2021
11294e8
Merge branch 'categorical-no-import' into categorical-treelite
levsnv Sep 10, 2021
56d1d32
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 10, 2021
c0474c1
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 10, 2021
b3ab8ad
style
levsnv Sep 10, 2021
57a6ae3
;
levsnv Sep 11, 2021
3ca93d8
Merge branch 'categorical-no-import' of github.com:levsnv/cuml into c…
levsnv Sep 11, 2021
d2795aa
BITS_PER_BYTE; comment
levsnv Sep 11, 2021
277eec9
style, odd code
levsnv Sep 11, 2021
95ad2ad
Apply suggestions from code review
levsnv Sep 14, 2021
57ab87d
addressed some review comments
levsnv Sep 17, 2021
1f95472
added treelite import tests; removed extra vector copy upon feature c…
levsnv Sep 17, 2021
58fb15d
draft change to per-tree bit pool, extra pass instead of an atomic
levsnv Sep 18, 2021
5fa0e46
style
levsnv Sep 18, 2021
87c8000
Merge branch 'categorical-treelite' of github.com:levsnv/cuml into ca…
levsnv Sep 18, 2021
481d189
style
levsnv Sep 18, 2021
bbb6a06
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 18, 2021
dad8386
fixed wrong reduction for max_matching and bad .back() fetch, many pr…
levsnv Sep 22, 2021
34d2c93
removed some prints
levsnv Sep 22, 2021
e651513
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 22, 2021
af213ea
style
levsnv Sep 22, 2021
91355e3
noinline
levsnv Sep 22, 2021
dc3d103
fixed wrong partial_sum application
levsnv Sep 22, 2021
4a334bf
style, restore omp
levsnv Sep 22, 2021
f1b468e
style
levsnv Sep 22, 2021
1e0784b
clean up
levsnv Sep 22, 2021
7482b32
Merge branch 'branch-21.10' of github.com:rapidsai/cuml into categori…
levsnv Sep 23, 2021
3c75fa5
removed WAR on exporting empty categorical nodes to treelite
levsnv Sep 23, 2021
94bf96e
noinline
levsnv Sep 23, 2021
1082213
...
levsnv Sep 23, 2021
f7dbabe
...
levsnv Sep 23, 2021
024ba16
some review comments, no functional changes
levsnv Sep 23, 2021
e394c60
Apply suggestions from code review
levsnv Sep 23, 2021
4fefb1b
"fixed" race condition in n_nodes, python
levsnv Sep 24, 2021
1bc3a82
add the empty categorical node workaround back
levsnv Sep 24, 2021
0868dd4
no warning on empty max_matching
levsnv Sep 24, 2021
e1e25cc
small review comments
levsnv Sep 24, 2021
f9073d8
small review comments
levsnv Sep 24, 2021
8c5800d
small review comments
levsnv Sep 24, 2021
baf7c52
todo
levsnv Sep 24, 2021
29b4f6e
style
levsnv Sep 24, 2021
c7d9110
Merge branch 'categorical-treelite' of github.com:levsnv/cuml into ca…
levsnv Sep 24, 2021
8ccdb27
remove empty category list workaround
levsnv Sep 24, 2021
2ef383f
Revert "remove empty category list workaround"
levsnv Sep 25, 2021
fb67202
addressed review comments
levsnv Sep 25, 2021
8b8ccec
size_t, uint32_t, int32_t
levsnv Sep 28, 2021
36f090e
Merge branch 'branch-21.10' into categorical-treelite
dantegd Sep 29, 2021
113 changes: 52 additions & 61 deletions cpp/src/fil/fil.cu
@@ -604,12 +604,24 @@ int max_depth(const tl::ModelImpl<T, L>& model)
return depth;
}

// constructs a vector of size max_fid (number of features, or columns) from a Treelite tree,
// where each feature has a maximum matching category and number of categorical nodes
cat_feature_counters reduce_two_feature_counters(cat_feature_counters a, cat_feature_counters b)
{
return {.max_matching = std::max(a.max_matching, b.max_matching),
.n_nodes = a.n_nodes + b.n_nodes};
}

void eltwise_reduce_two_feature_counter_vectors(std::vector<cat_feature_counters>& dst,
const std::vector<cat_feature_counters>& extra)
{
std::transform(dst.begin(), dst.end(), extra.begin(), dst.begin(), reduce_two_feature_counters);
}

// constructs a vector of size n_cols (number of features, or columns) from a Treelite tree,
// where each feature has a maximum matching category and node count (from this tree alone).
template <typename T, typename L>
inline std::vector<int> max_matching_cat(const tl::Tree<T, L>& tree, int max_fid)
inline std::vector<cat_feature_counters> cf_vec(const tl::Tree<T, L>& tree, int n_cols)
{
std::vector<int> res(max_fid);
std::vector<cat_feature_counters> res(n_cols);
std::stack<int> stack;
stack.push(tree_root(tree));
while (!stack.empty()) {
@@ -631,8 +643,8 @@ inline std::vector<int> max_matching_cat(const tl::Tree<T, L>& tree, int max_fid
} else {
max_matching_cat = -1;
}
int* max_matching_res = &res[tree.SplitIndex(node_id)];
*max_matching_res = std::max(*max_matching_res, max_matching_cat);
cat_feature_counters& counters = res[tree.SplitIndex(node_id)];
counters = reduce_two_feature_counters(counters, {max_matching_cat, 1});
}
stack.push(tree.LeftChild(node_id));
node_id = tree.RightChild(node_id);
@@ -641,10 +653,10 @@ inline std::vector<int> max_matching_cat(const tl::Tree<T, L>& tree, int max_fid
return res;
}

// constructs a vector of size max_fid (number of features, or columns) from a Treelite tree,
// where each feature has a maximum matching category and number of categorical nodes
// fills cat_sets.n_nodes[] (size number of features, or columns) from a Treelite tree,
// where each feature has a number of categorical nodes
template <typename T, typename L>
inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, cat_sets_owner& cat_sets)
inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, const categorical_sets& cat_sets)
{
std::size_t size = 0;
std::stack<int> stack;
@@ -653,11 +665,9 @@ inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, cat_sets_owner& cat
int node_id = stack.top();
stack.pop();
while (!tree.IsLeaf(node_id)) {
if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical &&
tree.HasMatchingCategories(node_id)) {
if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical) {
int fid = tree.SplitIndex(node_id);
size += cat_sets.accessor().sizeof_mask(fid);
++cat_sets.n_nodes[fid];
size += cat_sets.sizeof_mask(fid);
}
stack.push(tree.LeftChild(node_id));
node_id = tree.RightChild(node_id);
@@ -666,31 +676,26 @@ inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, cat_sets_owner& cat
return size;
}
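
// Reviewer note, not part of the diff: the per-feature mask size that bit_pool_size()
// accumulates is one byte per eight categories of that feature. A minimal standalone
// sketch of the arithmetic, assuming sizeof_mask() rounds (max_matching + 1) bits up to
// whole bytes (the exact definition lives in categorical_sets, outside this hunk):
#include <cstddef>
#include <cstdio>

constexpr std::size_t BITS_PER_BYTE = 8;

// assumed behaviour of categorical_sets::sizeof_mask(fid): bytes needed to hold
// one bit per category 0..max_matching of that feature
std::size_t sizeof_mask_for(int max_matching)
{
  return (max_matching + 1 + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
}

int main()
{
  // a feature whose largest seen category ID is 12 needs 13 bits -> 2 bytes per categorical node
  std::printf("%zu\n", sizeof_mask_for(12));    // 2
  // a feature with max_matching == 1000 needs 126 bytes for every categorical node splitting on it
  std::printf("%zu\n", sizeof_mask_for(1000));  // 126
}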

void vec_max(std::vector<int>& dst, const std::vector<int>& extra)
{
std::transform(dst.begin(), dst.end(), extra.begin(), dst.begin(), [](int a, int b) {
return std::max(a, b);
});
}

template <typename T, typename L>
cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl<T, L>& model)
{
#pragma omp declare reduction(vec_max_red : std::vector<int> \
: vec_max(omp_out, omp_in)) \
#pragma omp declare reduction(cf_vec_red : std::vector<cat_feature_counters> \
: eltwise_reduce_two_feature_counter_vectors(omp_out, omp_in)) \
initializer(omp_priv = omp_orig)
const auto& trees = model.trees;
cat_sets_owner cat_sets(model.num_feature, trees.size());
std::vector<int>& max_matching = cat_sets.max_matching;
#pragma omp parallel for reduction(vec_max_red : max_matching)
cat_sets_owner cat_sets;
std::vector<cat_feature_counters> counters(model.num_feature);
#pragma omp parallel for reduction(cf_vec_red : counters)
for (size_t i = 0; i < trees.size(); ++i) {
vec_max(max_matching, max_matching_cat(trees[i], model.num_feature));
eltwise_reduce_two_feature_counter_vectors(counters, cf_vec(trees[i], model.num_feature));
}
cat_sets.consume_counters(counters);
std::vector<size_t> bit_pool_sizes(trees.size());
#pragma omp parallel for
for (size_t i = 0; i < trees.size(); ++i) {
cat_sets.bit_pool_sizes[i] = bit_pool_size(trees[i], cat_sets);
bit_pool_sizes[i] = bit_pool_size(trees[i], cat_sets.accessor());
}
cat_sets.initialize_from_bit_pool_sizes();
cat_sets.consume_bit_pool_sizes(bit_pool_sizes);
return cat_sets;
}
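
// Reviewer note, not part of the diff: the custom #pragma omp declare reduction above is
// the standard way to reduce a std::vector of structs across threads. A stripped-down,
// self-contained sketch of the same pattern with illustrative names (counters, combine,
// vec_combine), independent of the FIL types:
#include <algorithm>
#include <cstdio>
#include <vector>

struct counters {                 // stand-in for cat_feature_counters
  int max_matching = -1;
  int n_nodes      = 0;
};

void combine(std::vector<counters>& dst, const std::vector<counters>& extra)
{
  std::transform(dst.begin(), dst.end(), extra.begin(), dst.begin(),
                 [](counters a, counters b) {
                   return counters{std::max(a.max_matching, b.max_matching),
                                   a.n_nodes + b.n_nodes};
                 });
}

int main()
{
  const int n_features = 4, n_trees = 100;
  std::vector<counters> total(n_features);
// each thread gets a private copy initialized from omp_orig; copies are merged pairwise via combine()
#pragma omp declare reduction(vec_combine : std::vector<counters> : combine(omp_out, omp_in)) \
  initializer(omp_priv = omp_orig)
#pragma omp parallel for reduction(vec_combine : total)
  for (int tree = 0; tree < n_trees; ++tree) {
    // pretend every tree has one categorical node on feature (tree % n_features)
    // with max category ID equal to the tree index
    std::vector<counters> per_tree(n_features);
    per_tree[tree % n_features] = {tree, 1};
    combine(total, per_tree);
  }
  for (const counters& c : total) std::printf("max %d, nodes %d\n", c.max_matching, c.n_nodes);
}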

@@ -786,7 +791,8 @@ void tl2fil_leaf_payload(fil_node_t* fil_node,
template <typename fil_node_t>
struct conversion_state {
fil_node_t node;
int tl_left, tl_right;
int tl_left;
int tl_right;
};

// modifies cat_sets
@@ -799,42 +805,27 @@ conversion_state<fil_node_t> tl2fil_inner_node(int fil_left_child,
size_t* bit_pool_offset)
{
int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id);
val_t split{};
int feature_id = tree.SplitIndex(tl_node_id);
bool is_categorical, default_left;
val_t split = {.f = NAN}; // yes there's a default initializer already
int feature_id = tree.SplitIndex(tl_node_id);
bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical;
bool default_left = tree.DefaultLeft(tl_node_id);
if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kNumerical) {
is_categorical = false;
default_left = tree.DefaultLeft(tl_node_id);
split.f = static_cast<float>(tree.Threshold(tl_node_id));
split.f = static_cast<float>(tree.Threshold(tl_node_id));
adjust_threshold(&split.f, &tl_left, &tl_right, &default_left, tree.ComparisonOp(tl_node_id));
} else if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical) {
is_categorical = true;
default_left = !tree.DefaultLeft(tl_node_id);
// for FIL, the list of categories is always for the right child
if (tree.CategoriesListRightChild(tl_node_id) == false) std::swap(tl_left, tl_right);
if (tree.CategoriesListRightChild(tl_node_id) == false) {
std::swap(tl_left, tl_right);
default_left = !default_left;
}
int sizeof_mask = cat_sets->accessor().sizeof_mask(feature_id);
split.idx = *bit_pool_offset;
*bit_pool_offset += sizeof_mask;
ASSERT(split.idx >= 0, "split.idx < 0");
std::vector<uint32_t> matching_cats = tree.MatchingCategories(tl_node_id);
auto category_it = matching_cats.begin();
ASSERT(matching_cats.size() == 0 || matching_cats.data() != nullptr,
"internal error: nullptr from treelite");
// treelite guarantees tree.MatchingCategories() are in ascending order
// we have to initialize all pool bytes, so we iterate over those and keep category_it up to
// date
for (uint32_t which_8cats = 0; which_8cats < (uint32_t)sizeof_mask; ++which_8cats) {
uint8_t eight_cats = 0;
for (uint32_t bit = 0; bit < BITS_PER_BYTE; ++bit) {
if (category_it < matching_cats.end() &&
*category_it == which_8cats * BITS_PER_BYTE + bit) {
eight_cats |= 1 << bit;
++category_it;
}
}
cat_sets->bits[split.idx + which_8cats] = eight_cats;
// cat_sets->bits have been zero-initialized
uint8_t* bits = &cat_sets->bits[split.idx];
for (uint32_t category : tree.MatchingCategories(tl_node_id)) {
bits[category / BITS_PER_BYTE] |= 1 << (category % BITS_PER_BYTE);
}
ASSERT(category_it == matching_cats.end(), "internal error: didn't convert all categories");
} else {
ASSERT(false, "only numerical and categorical split nodes are supported");
}
@@ -844,7 +835,7 @@ conversion_state<fil_node_t> tl2fil_inner_node(int fil_left_child,
} else {
node = fil_node_t({}, split, feature_id, default_left, false, is_categorical, fil_left_child);
}
return {node, tl_left, tl_right};
return conversion_state<fil_node_t>{node, tl_left, tl_right};
}
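
// Reviewer note, not part of the diff: the loop above is the write side of the category
// bitmask; at inference time the read side is fetch_bit() in internal.cuh. A self-contained
// round-trip sketch of the encoding (hypothetical category IDs, one bit per category,
// least-significant bit first within each byte):
#include <cassert>
#include <cstdint>
#include <vector>

constexpr int BITS_PER_BYTE = 8;

int fetch_bit(const uint8_t* array, int bit)   // mirrors the device helper in internal.cuh
{
  return (array[bit / BITS_PER_BYTE] >> (bit % BITS_PER_BYTE)) & 1;
}

int main()
{
  // a node whose right child matches categories {2, 9, 30}; the largest category ID seen
  // for this feature is 30, so the mask is 4 bytes (31 bits rounded up to whole bytes)
  std::vector<uint8_t> bits(4, 0);               // zero-initialized, like cat_sets_owner::bits
  for (uint32_t category : {2u, 9u, 30u})
    bits[category / BITS_PER_BYTE] |= 1 << (category % BITS_PER_BYTE);

  assert(fetch_bit(bits.data(), 9) == 1);        // matching category -> go right
  assert(fetch_bit(bits.data(), 10) == 0);       // non-matching category -> go left
  assert(fetch_bit(bits.data(), 30) == 1);
}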

template <typename T, typename L>
@@ -929,7 +920,7 @@ __noinline__ int tree2fil_sparse(std::vector<fil_node_t>& nodes,
std::stack<pair_t> stack;
int built_index = root + 1;
stack.push(pair_t(tree_root(tree), 0));
size_t bit_pool_offset = cat_sets->bit_pool_offsets[tree_idx];
std::size_t bit_pool_offset = cat_sets->bit_pool_offsets[tree_idx];
while (!stack.empty()) {
const pair_t& top = stack.top();
int node_id = top.first;
@@ -1314,7 +1305,7 @@ void from_treelite(const raft::handle_t& handle,
if (storage_type == storage_type_t::AUTO) {
if (tl_params->algo == algo_t::ALGO_AUTO || tl_params->algo == algo_t::NAIVE) {
int depth = max_depth(model);
// max 2**25 dense nodes, 256 MiB dense model size. Categorical mask size unlimited.
// max 2**25 dense nodes, 256 MiB dense model size. Categorical mask size is unlimited.
const int LOG2_MAX_DENSE_NODES = 25;
int log2_num_dense_nodes = depth + 1 + int(ceil(std::log2(model.trees.size())));
storage_type = log2_num_dense_nodes > LOG2_MAX_DENSE_NODES ? storage_type_t::SPARSE
@@ -1398,7 +1389,7 @@ char* sprintf_shape(const tl::ModelImpl<threshold_t, leaf_t>& model,
forest_shape << storage_type_repr[storage] << " model size " << std::setprecision(2) << size_mb
<< " MB" << std::endl;
if (cat_sets.bits.size() > 0) {
forest_shape << "categorical nodes for each feature id: {";
forest_shape << "number of categorical nodes for each feature id: {";
std::size_t total_cat_nodes = 0;
for (std::size_t n : cat_sets.n_nodes) {
forest_shape << n << " ";
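
// Reviewer note, not part of the diff: a quick worked example of the storage_type_t::AUTO
// heuristic in from_treelite() above. A complete dense forest needs roughly 2**(depth+1)
// nodes per tree, so log2 of the dense node count is about depth + 1 + ceil(log2(num_trees)),
// and anything above 2**25 nodes (256 MiB of dense nodes) falls back to SPARSE. A small
// sketch of that arithmetic only (would_choose_sparse is an illustrative name, not an API):
#include <cmath>
#include <cstddef>
#include <cstdio>

bool would_choose_sparse(int depth, std::size_t num_trees)
{
  const int LOG2_MAX_DENSE_NODES = 25;  // max 2**25 dense nodes
  int log2_num_dense_nodes = depth + 1 + int(std::ceil(std::log2(num_trees)));
  return log2_num_dense_nodes > LOG2_MAX_DENSE_NODES;
}

int main()
{
  std::printf("%d\n", would_choose_sparse(10, 1000));   // 10 + 1 + 10 = 21 <= 25 -> dense  (0)
  std::printf("%d\n", would_choose_sparse(20, 10000));  // 20 + 1 + 14 = 35 >  25 -> sparse (1)
}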
46 changes: 24 additions & 22 deletions cpp/src/fil/internal.cuh
@@ -306,18 +306,13 @@ struct forest_params_t {
/// FIL_TPB is the number of threads per block to use with FIL kernels
const int FIL_TPB = 256;

const int32_t MAX_PRECISE_INT_FLOAT = 1 << 24; // 16'777'216
constexpr int32_t MAX_PRECISE_INT_FLOAT = 1 << 24; // 16'777'216
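
// Reviewer note, not part of the diff: a 32-bit float has a 24-bit significand, so 2**24 is
// the largest bound below which every non-negative integer round-trips through float exactly;
// the identifier strongly suggests (assumption on my part) that this is why the limit exists
// when integer-valued data such as category IDs is carried in floats. A two-line illustration:
#include <cstdio>

int main()
{
  // every integer up to 2**24 = 16'777'216 survives a round trip through float...
  std::printf("%d\n", static_cast<int>(static_cast<float>(16'777'216)) == 16'777'216);  // 1
  // ...but 2**24 + 1 does not: it rounds to the nearest representable float, 16'777'216
  std::printf("%d\n", static_cast<int>(static_cast<float>(16'777'217)));                // 16777216
}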

__host__ __device__ __forceinline__ int fetch_bit(const uint8_t* array, int bit)
{
return (array[bit / BITS_PER_BYTE] >> (bit % BITS_PER_BYTE)) & 1;
}

struct cat_feature_counters {
int max_matching = -1;
int n_nodes = 0;
};

struct categorical_sets {
// arrays are const to use fast GPU read instructions by default
// arrays from each node ID are concatenated first, then from all categories
@@ -376,14 +371,21 @@ struct tree_base {
if (isnan(val)) {
cond = !node.def_left();
} else if (CATS_SUPPORTED && node.is_categorical()) {
cond = cat_sets.category_matches(node, (int)val);
cond = cat_sets.category_matches(node, static_cast<int>(val));
} else {
cond = val >= node.thresh();
}
return node.left(node_idx) + cond;
}
};
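
// Reviewer note, not part of the diff: the branch above decides between three cases per node:
// a missing value (NaN) follows the node's default direction, a categorical split treats the
// float as an integer category ID and looks it up in the bit mask, and a numerical split
// compares against the threshold; the two children are stored adjacently, so the chosen child
// is left + cond. A simplified CPU-side sketch of the same decision, with a plain toy_node
// standing in for the FIL node (names are illustrative, not the real API):
#include <cmath>
#include <cstdint>
#include <vector>

constexpr int BITS_PER_BYTE = 8;

struct toy_node {               // illustrative only; not FIL's node layout
  float thresh      = 0.0f;
  int   left        = 0;        // index of left child; right child is left + 1
  int   mask_offset = 0;        // offset into the category bit pool (categorical nodes only)
  bool  categorical = false;
  bool  def_left    = true;     // where missing values go
};

int next_node(const toy_node& node, float val, const std::vector<uint8_t>& bit_pool)
{
  bool cond;                    // false -> left child, true -> right child
  if (std::isnan(val)) {
    cond = !node.def_left;
  } else if (node.categorical) {
    // note: the real category_matches() presumably also range-checks against max_matching
    int category = static_cast<int>(val);
    cond = (bit_pool[node.mask_offset + category / BITS_PER_BYTE] >> (category % BITS_PER_BYTE)) & 1;
  } else {
    cond = val >= node.thresh;
  }
  return node.left + cond;
}

int main()
{
  std::vector<uint8_t> pool = {0b00000100};                 // only category 2 matches
  toy_node cat_node{0.0f, /*left=*/5, /*mask_offset=*/0, /*categorical=*/true, /*def_left=*/true};
  int child_for_cat2 = next_node(cat_node, 2.0f, pool);     // 6 (right child)
  int child_for_nan  = next_node(cat_node, NAN, pool);      // 5 (default direction: left)
  (void)child_for_cat2; (void)child_for_nan;
}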

// -1 means no matching categories
struct cat_feature_counters {
int max_matching = -1;
int n_nodes = 0;
};

// used only during model import. For inference, trimmed down using cat_sets_owner::accessor()
// in internal.cuh, as opposed to fil_test.cu, because importing from treelite will require it
struct cat_sets_owner {
// arrays from each node ID are concatenated first, then from all categories
@@ -394,10 +396,12 @@ struct cat_sets_owner {
// how many categorical nodes use a given feature id. Used for model shape string.
std::vector<std::size_t> n_nodes;
// per tree, size and offset of bit pool within the overall bit pool
std::vector<std::size_t> bit_pool_sizes, bit_pool_offsets;
std::vector<std::size_t> bit_pool_offsets;

categorical_sets accessor() const
{
ASSERT(bits.size() < INT_MAX,
"too many categories/categorical nodes: cannot store bits offset in node");
return {
.bits = bits.data(),
.max_matching = max_matching.data(),
@@ -406,11 +410,19 @@ struct cat_sets_owner {
};
}

void initialize_from_bit_pool_sizes()
void consume_counters(const std::vector<cat_feature_counters>& counters)
{
for (cat_feature_counters cf : counters) {
max_matching.push_back(cf.max_matching);
n_nodes.push_back(cf.n_nodes);
}
}

void consume_bit_pool_sizes(const std::vector<std::size_t>& bit_pool_sizes)
{
bit_pool_offsets[0] = 0;
for (std::size_t i = 1; i < bit_pool_sizes.size(); ++i) {
bit_pool_offsets[i] = bit_pool_offsets[i - 1] + bit_pool_sizes[i - 1];
bit_pool_offsets.push_back(0);
for (std::size_t i = 0; i < bit_pool_sizes.size() - 1; ++i) {
bit_pool_offsets.push_back(bit_pool_offsets.back() + bit_pool_sizes[i]);
}
bits.resize(bit_pool_offsets.back() + bit_pool_sizes.back());
}
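
// Reviewer note, not part of the diff: consume_bit_pool_sizes() above is an exclusive prefix
// sum over the per-tree pool sizes plus a final resize to the grand total. An equivalent
// standalone sketch with std::exclusive_scan, just to make the intent explicit (not a drop-in
// replacement):
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

int main()
{
  std::vector<std::size_t> bit_pool_sizes = {4, 0, 2, 3};   // bytes of category masks per tree

  // offsets[i] = sum of the sizes of all earlier trees (exclusive prefix sum)
  std::vector<std::size_t> bit_pool_offsets(bit_pool_sizes.size());
  std::exclusive_scan(bit_pool_sizes.begin(), bit_pool_sizes.end(), bit_pool_offsets.begin(),
                      std::size_t{0});
  // bit_pool_offsets == {0, 4, 4, 6}

  // total bit pool size = last offset + last size
  std::vector<uint8_t> bits(bit_pool_offsets.back() + bit_pool_sizes.back());  // 9 bytes
}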
@@ -420,16 +432,6 @@ struct cat_sets_owner {
: bits(bits_), max_matching(max_matching_)
{
}

// accepting int because GPU code only allows max<int> features
cat_sets_owner(int num_features, std::size_t num_trees)
: bits(0),
max_matching(num_features, -1),
n_nodes(num_features, 0),
bit_pool_offsets(num_trees),
bit_pool_sizes(num_trees)
{
}
};

std::ostream& operator<<(std::ostream& os, const cat_sets_owner& cso);
24 changes: 19 additions & 5 deletions cpp/test/sg/fil_test.cu
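
// Reviewer note, not part of the diff: the test changes below generate random categorical
// splits whose maximum category ID is drawn log-uniformly (10 raised to a uniform exponent),
// so both tiny and very large bit masks get exercised. A minimal sketch of that sampling,
// using hypothetical parameter values:
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
  float max_magnitude_of_matching_cat = 5.0f;  // hypothetical test parameter
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> mmc(-1.0f, max_magnitude_of_matching_cat);
  for (int fid = 0; fid < 5; ++fid) {
    // exponent uniform in [-1, 5) -> max category spread across orders of magnitude,
    // from 0 (10**-1 truncates to 0) up to roughly 100'000
    int max_matching = static_cast<int>(std::pow(10.0f, mmc(gen)));
    std::printf("feature %d: max matching category %d\n", fid, max_matching);
  }
}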
@@ -293,10 +293,10 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
// uniformily distributed in orders of magnitude: smaller models which
// still stress large bitfields.
// up to 10**ps.max_magnitude_of_matching_cat (only if feature is categorical, else -1)
cat_sets_h = cat_sets_owner(ps.num_cols, ps.num_trees);
std::mt19937 gen(ps.seed);
std::uniform_real_distribution mmc(-1.0f, ps.max_magnitude_of_matching_cat);
std::bernoulli_distribution fc(ps.feature_categorical_prob);
cat_sets_h.max_matching.resize(ps.num_cols);
for (int fid = 0; fid < ps.num_cols; ++fid) {
feature_categorical[fid] = fc(gen);
if (feature_categorical[fid]) {
@@ -305,7 +305,7 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
ASSERT(mm < INT_MAX,
"internal error: max_magnitude_of_matching_cat %f is too large",
ps.max_magnitude_of_matching_cat);
cat_sets_h.max_matching[fid] = (int)mm;
cat_sets_h.max_matching[fid] = mm;
} else {
cat_sets_h.max_matching[fid] = -1;
}
@@ -330,14 +330,15 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {

// count nodes for each feature id, while splitting the sets between nodes
std::size_t bit_pool_size = 0;
cat_sets_h.n_nodes = std::vector<size_t>(ps.num_cols, 0);
for (std::size_t node_id = 0; node_id < num_nodes; ++node_id) {
int fid = fids_h[node_id];

if (!feature_categorical[fid] || is_leafs_h[node_id]) is_categoricals_h[node_id] = 0.0f;

if (is_categoricals_h[node_id] == 1.0) {
// might allocate a categorical set for an unreachable inner node. That's OK.
cat_sets_h.n_nodes[fid]++;
++cat_sets_h.n_nodes[fid];
node_cat_set[node_id] = bit_pool_size;
bit_pool_size += cat_sets_h.accessor().sizeof_mask(fid);
}
@@ -774,8 +775,12 @@ class TreeliteFilTest : public BaseFilTest {
if (dense_node.is_categorical()) {
uint8_t byte = 0;
for (int category = 0; category <= cat_sets_h.max_matching[dense_node.fid()]; ++category) {
if (category % 8 == 0) byte = cat_sets_h.bits[dense_node.set() + category / 8];
if ((byte & 1 << category % 8) != 0) left_categories.push_back(category);
if (category % BITS_PER_BYTE == 0) {
byte = cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE];
}
if ((byte & (1 << (category % BITS_PER_BYTE))) != 0) {
left_categories.push_back(category);
}
}
} else {
threshold = dense_node.thresh();
@@ -1235,6 +1240,15 @@ std::vector<FilTestParams> import_dense_inputs = {
FIL_TEST_PARAMS(print_forest_shape = true),
FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_classes = 2),
FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_trees = 19, num_classes = 20),
FIL_TEST_PARAMS(node_categorical_prob = 0.5, feature_categorical_prob = 0.5),
FIL_TEST_PARAMS(
node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 1.0),
FIL_TEST_PARAMS(
node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 0.0),
FIL_TEST_PARAMS(depth = 3,
node_categorical_prob = 0.5,
feature_categorical_prob = 0.5,
max_magnitude_of_matching_cat = 5),
};

TEST_P(TreeliteDenseFilTest, Import) { compare(); }
3 changes: 2 additions & 1 deletion python/cuml/test/test_fil.py
@@ -499,8 +499,9 @@ def to_categorical(features, n_categorical):
cat_cols = features[:, :n_categorical]
cat_cols = cat_cols - cat_cols.min(axis=1, keepdims=True) # range [0, ?]
cat_cols /= cat_cols.max(axis=1, keepdims=True) # range [0, 1]
rough_n_categories = 100
# round into rough_n_categories bins
cat_cols = (cat_cols * 100).astype(int)
cat_cols = (cat_cols * rough_n_categories).astype(int)
for icol in range(n_categorical):
col = cat_cols[:, icol]
df_cols[icol] = pd.Series(pd.Categorical(col,