Skip to content

Commit

Permalink
skip missing lookup if nothing is missing in CPU hist partition kerne…
Browse files Browse the repository at this point in the history
…l. (#5644)

* [xgboost] skip missing lookup if nothing is missing
  • Loading branch information
okuvshynov authored May 12, 2020
1 parent 9ad4090 commit 4e64e2e
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 9 deletions.
9 changes: 9 additions & 0 deletions src/common/column_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ class ColumnMatrix {
index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());

const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
any_missing_ = !noMissingValues;

if (noMissingValues) {
missing_flags_.resize(feature_offsets_[nfeature], false);
Expand Down Expand Up @@ -311,11 +312,18 @@ class ColumnMatrix {
const BinTypeSize GetTypeSize() const {
return bins_type_size_;
}

// This is just an utility function
const bool NoMissingValues(const size_t n_elements,
const size_t n_row, const size_t n_features) {
return n_elements == n_features * n_row;
}

// And this returns part of state
const bool AnyMissing() const {
return any_missing_;
}

private:
std::vector<uint8_t> index_;

Expand All @@ -329,6 +337,7 @@ class ColumnMatrix {
uint32_t* index_base_;
std::vector<bool> missing_flags_;
BinTypeSize bins_type_size_;
bool any_missing_;
};

} // namespace common
Expand Down
41 changes: 32 additions & 9 deletions src/tree/updater_quantile_hist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,7 @@ void QuantileHistMaker::Builder::EvaluateSplits(const std::vector<ExpandEntry>&
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, typename BinIdxType>
template <bool default_left, bool any_missing, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<BinIdxType>& column,
common::Span<const size_t> rid_span, const int32_t split_cond,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
Expand All @@ -837,14 +837,24 @@ inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<
size_t nleft_elems = 0;
size_t nright_elems = 0;

for (auto rid : rid_span) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
if (any_missing) {
for (auto rid : rid_span) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
p_right_part[nright_elems++] = rid;
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
} else {
}
} else {
for (auto rid : rid_span) {
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
Expand Down Expand Up @@ -919,6 +929,7 @@ void QuantileHistMaker::Builder::PartitionKernel(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree) {
const size_t* rid = row_set_collection_[nid].begin;

common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = partition_builder_.GetLeftBuffer(node_in_set,
range.begin(), range.end());
Expand All @@ -934,9 +945,21 @@ void QuantileHistMaker::Builder::PartitionKernel(
const common::DenseColumn<BinIdxType>& column =
static_cast<const common::DenseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
child_nodes_sizes = PartitionDenseKernel<true>(column, rid_span, split_cond, left, right);
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<true, true>(column, rid_span, split_cond,
left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<true, false>(column, rid_span, split_cond,
left, right);
}
} else {
child_nodes_sizes = PartitionDenseKernel<false>(column, rid_span, split_cond, left, right);
if (column_matrix.AnyMissing()) {
child_nodes_sizes = PartitionDenseKernel<false, true>(column, rid_span, split_cond,
left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<false, false>(column, rid_span, split_cond,
left, right);
}
}
} else {
const common::SparseColumn<BinIdxType>& column
Expand Down
82 changes: 82 additions & 0 deletions tests/cpp/tree/test_quantile_hist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,71 @@ class QuantileHistMock : public QuantileHistMaker {
omp_set_num_threads(1);
}

void TestApplySplit(const GHistIndexBlockMatrix& quantile_index_block,
const RegTree& tree) {
std::vector<GradientPair> row_gpairs =
{ {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
{0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
size_t constexpr kMaxBins = 4;

// try out different sparsity to get different number of missing values
for (double sparsity : {0.0, 0.1, 0.2}) {
// kNRows samples with kNCols features
auto dmat = RandomDataGenerator(kNRows, kNCols, sparsity).Seed(3).GenerateDMatrix();

common::GHistIndexMatrix gmat;
gmat.Init(dmat.get(), kMaxBins);
ColumnMatrix cm;

// treat everything as dense, as this is what we intend to test here
cm.Init(gmat, 0.0);
RealImpl::InitData(gmat, row_gpairs, *dmat, tree);
hist_.AddHistRow(0);

RealImpl::InitNewNode(0, gmat, row_gpairs, *dmat, tree);

const size_t num_row = dmat->Info().num_row_;
// split by feature 0
const size_t bin_id_min = gmat.cut.Ptrs()[0];
const size_t bin_id_max = gmat.cut.Ptrs()[1];

// attempt to split at different bins
for (size_t split = 0; split < 4; split++) {
size_t left_cnt = 0, right_cnt = 0;

// manually compute how many samples go left or right
for (size_t rid = 0; rid < num_row; ++rid) {
for (size_t offset = gmat.row_ptr[rid]; offset < gmat.row_ptr[rid + 1]; ++offset) {
const size_t bin_id = gmat.index[offset];
if (bin_id >= bin_id_min && bin_id < bin_id_max) {
if (bin_id <= split) {
left_cnt ++;
} else {
right_cnt ++;
}
}
}
}

// if any were missing due to sparsity, we add them to the left or to the right
size_t missing = kNRows - left_cnt - right_cnt;
if (tree[0].DefaultLeft()) {
left_cnt += missing;
} else {
right_cnt += missing;
}

// have one node with kNRows (=8 at the moment) rows, just one task
RealImpl::partition_builder_.Init(1, 1, [&](size_t node_in_set) {
return 1;
});
RealImpl::PartitionKernel<uint8_t>(0, 0, common::Range1d(0, kNRows), split, cm, tree);
RealImpl::partition_builder_.CalculateRowOffsets();
ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), left_cnt);
ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), right_cnt);
}
}
}
};

int static constexpr kNRows = 8, kNCols = 16;
Expand Down Expand Up @@ -322,6 +387,13 @@ class QuantileHistMock : public QuantileHistMaker {

builder_->TestEvaluateSplit(gmatb_, tree);
}

void TestApplySplit() {
RegTree tree = RegTree();
tree.param.UpdateAllowUnknown(cfg_);

builder_->TestApplySplit(gmatb_, tree);
}
};

TEST(QuantileHist, InitData) {
Expand Down Expand Up @@ -359,5 +431,15 @@ TEST(QuantileHist, EvalSplits) {
maker.TestEvaluateSplit();
}

TEST(QuantileHist, ApplySplit) {
std::vector<std::pair<std::string, std::string>> cfg
{{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
{"split_evaluator", "elastic_net"},
{"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"},
{"min_child_weight", "0"}};
QuantileHistMock maker(cfg);
maker.TestApplySplit();
}

} // namespace tree
} // namespace xgboost

0 comments on commit 4e64e2e

Please sign in to comment.