diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index e4747ea749b4..6b33ce6550dc 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -659,13 +659,17 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat, } } +/*! + * \brief fill a histogram by zeroes + */ +void InitilizeHistByZeroes(GHistRow hist) { + memset(hist.data(), '\0', hist.size()*sizeof(tree::GradStats)); +} + void GHistBuilder::BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, GHistRow hist) { - const size_t nthread = static_cast(this->nthread_); - data_.resize(nbins_ * nthread_); - const size_t* rid = row_indices.begin; const size_t nrows = row_indices.Size(); const uint32_t* index = gmat.index.data(); @@ -673,79 +677,27 @@ void GHistBuilder::BuildHist(const std::vector& gpair, const float* pgh = reinterpret_cast(gpair.data()); double* hist_data = reinterpret_cast(hist.data()); - double* data = reinterpret_cast(data_.data()); - - const size_t block_size = 512; - size_t n_blocks = nrows/block_size; - n_blocks += !!(nrows - n_blocks*block_size); - - const size_t nthread_to_process = std::min(nthread, n_blocks); - memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t)); const size_t cache_line_size = 64; const size_t prefetch_offset = 10; size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid); no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size; -#pragma omp parallel for num_threads(nthread_to_process) schedule(guided) - for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { - dmlc::omp_uint tid = omp_get_thread_num(); - double* data_local_hist = ((nthread_to_process == 1) ? hist_data : - reinterpret_cast(data_.data() + tid * nbins_)); - - if (!thread_init_[tid]) { - memset(data_local_hist, '\0', 2*nbins_*sizeof(double)); - thread_init_[tid] = true; - } - - const size_t istart = iblock*block_size; - const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size); - for (size_t i = istart; i < iend; ++i) { - const size_t icol_start = row_ptr[rid[i]]; - const size_t icol_end = row_ptr[rid[i]+1]; - - if (i < nrows - no_prefetch_size) { - PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]); - PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); - } - - for (size_t j = icol_start; j < icol_end; ++j) { - const uint32_t idx_bin = 2*index[j]; - const size_t idx_gh = 2*rid[i]; + for (size_t i = 0; i < nrows; ++i) { + const size_t icol_start = row_ptr[rid[i]]; + const size_t icol_end = row_ptr[rid[i]+1]; - data_local_hist[idx_bin] += pgh[idx_gh]; - data_local_hist[idx_bin+1] += pgh[idx_gh+1]; - } + if (i < nrows - no_prefetch_size) { + PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]); + PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]); } - } - if (nthread_to_process > 1) { - const size_t size = (2*nbins_); - const size_t block_size = 1024; - size_t n_blocks = size/block_size; - n_blocks += !!(size - n_blocks*block_size); + for (size_t j = icol_start; j < icol_end; ++j) { + const uint32_t idx_bin = 2*index[j]; + const size_t idx_gh = 2*rid[i]; - size_t n_worked_bins = 0; - for (size_t i = 0; i < nthread_to_process; ++i) { - if (thread_init_[i]) { - thread_init_[n_worked_bins++] = i; - } - } - -#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided) - for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) { - const size_t istart = iblock * block_size; - const size_t iend = (((iblock + 1) * block_size > size) ? size : istart + block_size); - - const size_t bin = 2 * thread_init_[0] * nbins_; - memcpy(hist_data + istart, (data + bin + istart), sizeof(double) * (iend - istart)); - - for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) { - const size_t bin = 2 * thread_init_[i_bin_part] * nbins_; - for (size_t i = istart; i < iend; i++) { - hist_data[i] += data[bin + i]; - } - } + hist_data[idx_bin] += pgh[idx_gh]; + hist_data[idx_bin+1] += pgh[idx_gh+1]; } } } diff --git a/src/common/hist_util.h b/src/common/hist_util.h index b7ccd54adf8a..a131d5f59532 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -251,7 +251,7 @@ class DenseCuts : public CutsBuilder { // FIXME(trivialfis): Merge this into generic cut builder. /*! \brief Builds the cut matrix on the GPU. - * + * * \return The row stride across the entire dataset. */ size_t DeviceSketch(int device, @@ -347,6 +347,11 @@ class GHistIndexBlockMatrix { */ using GHistRow = Span; +/*! + * \brief fill a histogram by zeroes + */ +void InitilizeHistByZeroes(GHistRow hist); + /*! * \brief histogram of gradient statistics for multiple nodes */ @@ -369,9 +374,13 @@ class HistCollection { // initialize histogram collection void Init(uint32_t nbins) { - nbins_ = nbins; + if (nbins_ != nbins) { + nbins_ = nbins; + // quite expensive operation, so let's do this only once + data_.clear(); + } row_ptr_.clear(); - data_.clear(); + n_nodes_added_ = 0; } // create an empty histogram for i-th node @@ -382,13 +391,19 @@ class HistCollection { } CHECK_EQ(row_ptr_[nid], kMax); - row_ptr_[nid] = data_.size(); - data_.resize(data_.size() + nbins_); + if (data_.size() <= nbins_ * (nid + 1)) { + data_.resize(nbins_ * (nid + 1)); + } + + row_ptr_[nid] = nbins_ * n_nodes_added_; + n_nodes_added_++; } private: /*! \brief number of all bins over all features */ - uint32_t nbins_; + uint32_t nbins_ = 0; + /*! \brief amount of active nodes in hist collection */ + uint32_t n_nodes_added_ = 0; std::vector data_; @@ -396,6 +411,85 @@ class HistCollection { std::vector row_ptr_; }; +/*! + * \brief Stores temporary histograms to compute them in parallel + * Supports processing multiple tree-nodes for nested parallelism + * Able to reduce histograms across threads in efficient way + */ +class ParallelGHistBuilder { + public: + void Init(size_t nbins) { + if (nbins != nbins_) { + hist_.Init(nbins); + max_size_ = 0; + nbins_ = nbins; + } + } + + // Add new elements if needed, mark all hists as unused + void Reset(size_t nthreads, size_t nodes) { + const size_t new_size = nthreads * nodes; + for (size_t i = max_size_; i < new_size; ++i) { + hist_.AddHistRow(i); + } + max_size_ = std::max(max_size_, new_size); + + nodes_ = nodes; + nthreads_ = nthreads; + + hist_was_used_.resize(nthreads * nodes); + std::fill(hist_was_used_.begin(), hist_was_used_.end(), static_cast(false)); + } + + // Get specified hist, initilize hist by zeroes if it wasn't used before + GHistRow GetInitializedHist(size_t tid, size_t nid) { + CHECK_LT(nid, nodes_); + CHECK_LT(tid, nthreads_); + GHistRow hist = hist_[tid * nodes_ + nid]; + + if (!hist_was_used_[tid * nodes_ + nid]) { + InitilizeHistByZeroes(hist); + hist_was_used_[tid * nodes_ + nid] = static_cast(true); + } + + return hist; + } + + // Reduce following bins (begin, end] for nid-node in dst across threads + void ReduceHist(GHistRow dst, size_t nid, size_t begin, size_t end) { + CHECK_GT(end, begin); + CHECK_LT(nid, nodes_); + + for (size_t tid = 0; tid < nthreads_; ++tid) { + if (hist_was_used_[tid * nodes_ + nid]) { + GHistRow src = hist_[tid * nodes_ + nid]; + + for (size_t i = begin; i < end; ++i) { + dst[i].Add(src[i]); + } + } + } + } + + protected: + /*! \brief number of bins in each histogram */ + size_t nbins_ = 0; + /*! \brief number of threads for parallel computation */ + size_t nthreads_ = 0; + /*! \brief number of nodes which will be processed in parallel */ + size_t nodes_ = 0; + /*! \brief Real size of hist_ */ + size_t max_size_ = 0; + /*! \brief number of nodes which will be processed in parallel */ + HistCollection hist_; + /*! + * \brief Marks which hists were used, it means that they should be merged. + * Contains only {true or false} values + * but 'int' is used instead of 'bool', because std::vector isn't thread safe + */ + std::vector hist_was_used_; +}; + /*! * \brief builder for histograms of gradient statistics */ diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 34e363c00244..193a503746a6 100755 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -108,10 +108,10 @@ class BlockedSpace2d { // Wrapper to implement nested parallelism with simple omp parallel for template -void ParallelFor2d(const BlockedSpace2d& space, Func func) { +void ParallelFor2d(const BlockedSpace2d& space, const int nthreads, Func func) { const int num_blocks_in_space = static_cast(space.Size()); - #pragma omp parallel for + #pragma omp parallel for num_threads(nthreads) for (auto i = 0; i < num_blocks_in_space; i++) { func(space.GetFirstDimension(i), space.GetRange(i)); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index b28c2f9707da..9d7a7385f89b 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -44,7 +44,7 @@ void QuantileHistMaker::Configure(const Args& args) { pruner_->Configure(args); param_.UpdateAllowUnknown(args); - // initialise the split evaluator + // initialize the split evaluator if (!spliteval_) { spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator)); } @@ -100,16 +100,71 @@ void QuantileHistMaker::Builder::SyncHistograms( int sync_count, RegTree *p_tree) { builder_monitor_.Start("SyncHistograms"); - this->histred_.Allreduce(hist_[starting_index].data(), hist_builder_.GetNumBins() * sync_count); - // use Subtraction Trick - for (auto const& node_pair : nodes_for_subtraction_trick_) { - hist_.AddHistRow(node_pair.first); - SubtractionTrick(hist_[node_pair.first], hist_[node_pair.second], - hist_[(*p_tree)[node_pair.first].Parent()]); + + const bool isDistributed = rabit::IsDistributed(); + builder_monitor_.Start("AddHistRow"); + for (auto const& node : nodes_for_subtraction_trick_) { + hist_.AddHistRow(node.nid); + } + builder_monitor_.Stop("AddHistRow"); + + const size_t nbins = hist_builder_.GetNumBins(); + common::BlockedSpace2d space(nodes_for_explicit_hist_build_.size(), [&](size_t node) { + return nbins; + }, 1024); + + common::ParallelFor2d(space, this->nthread_, [&](size_t node, common::Range1d r) { + const auto entry = nodes_for_explicit_hist_build_[node]; + auto this_hist = hist_[entry.nid]; + // Merging histograms from each thread into once + common::InitilizeHistByZeroes({this_hist.data() + r.begin(), this_hist.data() + r.end()}); + hist_buffer_.ReduceHist(this_hist, node, r.begin(), r.end()); + + if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1 && !isDistributed) { + auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()]; + auto sibling_hist = hist_[entry.sibling_nid]; + + for (bst_omp_uint bin_id = r.begin(); bin_id < r.end(); bin_id++) { + sibling_hist[bin_id].SetSubstract(parent_hist[bin_id], this_hist[bin_id]); + } + } + }); + + if (isDistributed) { + this->histred_.Allreduce(hist_[starting_index].data(), hist_builder_.GetNumBins() * sync_count); + // use Subtraction Trick + for (auto const& node : nodes_for_subtraction_trick_) { + SubtractionTrick(hist_[node.nid], hist_[node.sibling_nid], + hist_[(*p_tree)[node.nid].Parent()]); + } } + builder_monitor_.Stop("SyncHistograms"); } +void QuantileHistMaker::Builder::BuildHistogramsLossGuide( + ExpandEntry entry, + const GHistIndexMatrix &gmat, + const GHistIndexBlockMatrix &gmatb, + RegTree *p_tree, + const std::vector &gpair_h) { + nodes_for_explicit_hist_build_.clear(); + nodes_for_subtraction_trick_.clear(); + nodes_for_explicit_hist_build_.push_back(entry); + + int starting_index = std::numeric_limits::max(); + int sync_count = 0; + + BuildLocalHistograms(&starting_index, &sync_count, gmat, gmatb, p_tree, gpair_h); + + // Subtraction Trick + if (entry.sibling_nid > -1) { + nodes_for_subtraction_trick_.emplace_back(entry.sibling_nid, entry.nid, + p_tree->GetDepth(entry.sibling_nid), 0.0f, 0); + } + SyncHistograms(starting_index, sync_count, p_tree); +} + void QuantileHistMaker::Builder::BuildLocalHistograms( int *starting_index, int *sync_count, @@ -118,48 +173,41 @@ void QuantileHistMaker::Builder::BuildLocalHistograms( RegTree *p_tree, const std::vector &gpair_h) { builder_monitor_.Start("BuildLocalHistograms"); - for (auto const& entry : qexpand_depth_wise_) { + + hist_buffer_.Reset(this->nthread_, nodes_for_explicit_hist_build_.size()); + + builder_monitor_.Start("AddHistRow"); + for (auto const& entry : nodes_for_explicit_hist_build_) { int nid = entry.nid; - RegTree::Node &node = (*p_tree)[nid]; - if (rabit::IsDistributed()) { - if (node.IsRoot() || node.IsLeftChild()) { - hist_.AddHistRow(nid); - // in distributed setting, we always calculate from left child or root node - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - if (!node.IsRoot()) { - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid; - } - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } - } else { - if (!node.IsRoot() && node.IsLeftChild() && - (row_set_collection_[nid].Size() < - row_set_collection_[(*p_tree)[node.Parent()].RightChild()].Size())) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid; - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } else if (!node.IsRoot() && !node.IsLeftChild() && - (row_set_collection_[nid].Size() <= - row_set_collection_[(*p_tree)[node.Parent()].LeftChild()].Size())) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].LeftChild()] = nid; - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } else if (node.IsRoot()) { - hist_.AddHistRow(nid); - BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false); - (*sync_count)++; - (*starting_index) = std::min((*starting_index), nid); - } - } + hist_.AddHistRow(nid); + (*starting_index) = std::min(nid, (*starting_index)); } + (*sync_count) = nodes_for_explicit_hist_build_.size(); + + builder_monitor_.Stop("AddHistRow"); + + // create space of size (# rows in each node) + common::BlockedSpace2d space(nodes_for_explicit_hist_build_.size(), [&](size_t node) { + const int32_t nid = nodes_for_explicit_hist_build_[node].nid; + return row_set_collection_[nid].Size(); + }, 256); + + // Parallel processing by nodes and data in each node + common::ParallelFor2d(space, this->nthread_, [&](size_t nid_in_set, common::Range1d r) { + const auto tid = static_cast(omp_get_thread_num()); + const int32_t nid = nodes_for_explicit_hist_build_[nid_in_set].nid; + + auto start_of_row_set = row_set_collection_[nid].begin; + auto rid_set = RowSetCollection::Elem(start_of_row_set + r.begin(), + start_of_row_set + r.end(), + nid); + BuildHist(gpair_h, rid_set, gmat, gmatb, hist_buffer_.GetInitializedHist(tid, nid_in_set)); + }); + builder_monitor_.Stop("BuildLocalHistograms"); } + void QuantileHistMaker::Builder::BuildNodeStats( const GHistIndexMatrix &gmat, DMatrix *p_fmat, @@ -206,9 +254,9 @@ void QuantileHistMaker::Builder::EvaluateSplits( this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree); int left_id = (*p_tree)[nid].LeftChild(); int right_id = (*p_tree)[nid].RightChild(); - temp_qexpand_depth->push_back(ExpandEntry(left_id, + temp_qexpand_depth->push_back(ExpandEntry(left_id, right_id, p_tree->GetDepth(left_id), 0.0, (*timestamp)++)); - temp_qexpand_depth->push_back(ExpandEntry(right_id, + temp_qexpand_depth->push_back(ExpandEntry(right_id, left_id, p_tree->GetDepth(right_id), 0.0, (*timestamp)++)); // - 1 parent + 2 new children (*num_leaves)++; @@ -216,6 +264,43 @@ void QuantileHistMaker::Builder::EvaluateSplits( } } +// Split nodes to 2 sets depending on amount of rows in each node +// Histograms for small nodes will be built explicitly +// Histograms for big nodes will be built by 'Subtraction Trick' +// Exception: in distributed setting, we always build the histogram for the left child node +// and use 'Subtraction Trick' to built the histogram for the right child node. +// This ensures that the workers operate on the same set of tree nodes. +void QuantileHistMaker::Builder::SplitSiblings(const std::vector& nodes, + std::vector* small_siblings, + std::vector* big_siblings, + RegTree *p_tree) { + for (auto const& entry : nodes) { + int nid = entry.nid; + RegTree::Node &node = (*p_tree)[nid]; + if (rabit::IsDistributed()) { + if (node.IsRoot() || node.IsLeftChild()) { + small_siblings->push_back(entry); + } else { + big_siblings->push_back(entry); + } + } else { + if (!node.IsRoot() && node.IsLeftChild() && + (row_set_collection_[nid].Size() < + row_set_collection_[(*p_tree)[node.Parent()].RightChild()].Size())) { + small_siblings->push_back(entry); + } else if (!node.IsRoot() && !node.IsLeftChild() && + (row_set_collection_[nid].Size() <= + row_set_collection_[(*p_tree)[node.Parent()].LeftChild()].Size())) { + small_siblings->push_back(entry); + } else if (node.IsRoot()) { + small_siblings->push_back(entry); + } else { + big_siblings->push_back(entry); + } + } + } +} + void QuantileHistMaker::Builder::ExpandWithDepthWise( const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb, @@ -227,14 +312,19 @@ void QuantileHistMaker::Builder::ExpandWithDepthWise( int num_leaves = 0; // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway - qexpand_depth_wise_.emplace_back(ExpandEntry(ExpandEntry::kRootNid, + qexpand_depth_wise_.emplace_back(ExpandEntry(ExpandEntry::kRootNid, ExpandEntry::kEmptyNid, p_tree->GetDepth(ExpandEntry::kRootNid), 0.0, timestamp++)); ++num_leaves; for (int depth = 0; depth < param_.max_depth + 1; depth++) { int starting_index = std::numeric_limits::max(); int sync_count = 0; std::vector temp_qexpand_depth; + + SplitSiblings(qexpand_depth_wise_, &nodes_for_explicit_hist_build_, + &nodes_for_subtraction_trick_, p_tree); + BuildLocalHistograms(&starting_index, &sync_count, gmat, gmatb, p_tree, gpair_h); + SyncHistograms(starting_index, sync_count, p_tree); BuildNodeStats(gmat, p_fmat, p_tree, gpair_h); EvaluateSplits(gmat, column_matrix, p_fmat, p_tree, &num_leaves, depth, ×tamp, @@ -242,6 +332,7 @@ void QuantileHistMaker::Builder::ExpandWithDepthWise( // clean up qexpand_depth_wise_.clear(); nodes_for_subtraction_trick_.clear(); + nodes_for_explicit_hist_build_.clear(); if (temp_qexpand_depth.empty()) { break; } else { @@ -262,14 +353,12 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( unsigned timestamp = 0; int num_leaves = 0; - hist_.AddHistRow(ExpandEntry::kRootNid); - BuildHist(gpair_h, row_set_collection_[ExpandEntry::kRootNid], gmat, gmatb, - hist_[ExpandEntry::kRootNid], true); + ExpandEntry node(ExpandEntry::kRootNid, ExpandEntry::kEmptyNid, + p_tree->GetDepth(0), 0.0f, timestamp++); + BuildHistogramsLossGuide(node, gmat, gmatb, p_tree, gpair_h); this->InitNewNode(ExpandEntry::kRootNid, gmat, gpair_h, *p_fmat, *p_tree); - ExpandEntry node(ExpandEntry::kRootNid, p_tree->GetDepth(ExpandEntry::kRootNid), - snode_[ExpandEntry::kRootNid].best.loss_chg, timestamp++); this->EvaluateSplit({node}, gmat, hist_, *p_fmat, *p_tree); node.loss_chg = snode_[ExpandEntry::kRootNid].best.loss_chg; @@ -289,20 +378,20 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( const int cleft = (*p_tree)[nid].LeftChild(); const int cright = (*p_tree)[nid].RightChild(); - hist_.AddHistRow(cleft); - hist_.AddHistRow(cright); + + ExpandEntry left_node(cleft, cright, p_tree->GetDepth(cleft), + 0.0f, timestamp++); + ExpandEntry right_node(cright, cleft, p_tree->GetDepth(cright), + 0.0f, timestamp++); if (rabit::IsDistributed()) { // in distributed mode, we need to keep consistent across workers - BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true); - SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]); + BuildHistogramsLossGuide(left_node, gmat, gmatb, p_tree, gpair_h); } else { if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) { - BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true); - SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]); + BuildHistogramsLossGuide(left_node, gmat, gmatb, p_tree, gpair_h); } else { - BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright], true); - SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]); + BuildHistogramsLossGuide(right_node, gmat, gmatb, p_tree, gpair_h); } } @@ -313,11 +402,6 @@ void QuantileHistMaker::Builder::ExpandWithLossGuide( snode_[cleft].weight, snode_[cright].weight); interaction_constraints_.Split(nid, featureid, cleft, cright); - ExpandEntry left_node(cleft, p_tree->GetDepth(cleft), - snode_[cleft].best.loss_chg, timestamp++); - ExpandEntry right_node(cright, p_tree->GetDepth(cright), - snode_[cright].best.loss_chg, timestamp++); - this->EvaluateSplit({left_node, right_node}, gmat, hist_, *p_fmat, *p_tree); left_node.loss_chg = snode_[cleft].best.loss_chg; right_node.loss_chg = snode_[cright].best.loss_chg; @@ -427,6 +511,7 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, // initialize histogram collection uint32_t nbins = gmat.cut.Ptrs().back(); hist_.Init(nbins); + hist_buffer_.Init(nbins); // initialize histogram builder #pragma omp parallel @@ -609,7 +694,7 @@ void QuantileHistMaker::Builder::EvaluateSplit(const std::vector& n }, 1); // Start parallel enumeration for all tree nodes in the set and all features - common::ParallelFor2d(space, [&](size_t nid_in_set, common::Range1d r) { + common::ParallelFor2d(space, this->nthread_, [&](size_t nid_in_set, common::Range1d r) { const int32_t nid = nodes_set[nid_in_set].nid; const auto tid = static_cast(omp_get_thread_num()); GHistRow node_hist = hist[nid]; diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 3e3ab9a745f8..420158dd4965 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -157,17 +157,13 @@ class QuantileHistMaker: public TreeUpdater { const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, const GHistIndexBlockMatrix& gmatb, - GHistRow hist, - bool sync_hist) { + GHistRow hist) { builder_monitor_.Start("BuildHist"); if (param_.enable_feature_grouping > 0) { hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist); } else { hist_builder_.BuildHist(gpair, row_indices, gmat, hist); } - if (sync_hist) { - this->histred_.Allreduce(hist.data(), hist_builder_.GetNumBins()); - } builder_monitor_.Stop("BuildHist"); } @@ -183,13 +179,15 @@ class QuantileHistMaker: public TreeUpdater { protected: /* tree growing policies */ struct ExpandEntry { - static const int kRootNid = 0; + static const int kRootNid = 0; + static const int kEmptyNid = -1; int nid; + int sibling_nid; int depth; bst_float loss_chg; unsigned timestamp; - ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp) - : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {} + ExpandEntry(int nid, int sibling_nid, int depth, bst_float loss_chg, unsigned tstmp): + nid(nid), sibling_nid(sibling_nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {} }; // initialize temp data structure @@ -265,6 +263,20 @@ class QuantileHistMaker: public TreeUpdater { const GHistIndexBlockMatrix &gmatb, RegTree *p_tree, const std::vector &gpair_h); + void BuildHistogramsLossGuide( + ExpandEntry entry, + const GHistIndexMatrix &gmat, + const GHistIndexBlockMatrix &gmatb, + RegTree *p_tree, + const std::vector &gpair_h); + + // Split nodes to 2 sets depending on amount of rows in each node + // Histograms for small nodes will be built explicitly + // Histograms for big nodes will be built by 'Subtraction Trick' + void SplitSiblings(const std::vector& nodes, + std::vector* small_siblings, + std::vector* big_siblings, + RegTree *p_tree); void SyncHistograms(int starting_index, int sync_count, @@ -336,12 +348,15 @@ class QuantileHistMaker: public TreeUpdater { std::vector qexpand_depth_wise_; // key is the node id which should be calculated by Subtraction Trick, value is the node which // provides the evidence for substracts - std::unordered_map nodes_for_subtraction_trick_; + std::vector nodes_for_subtraction_trick_; + // list of nodes whose histograms would be built explicitly. + std::vector nodes_for_explicit_hist_build_; enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData }; DataLayout data_layout_; common::Monitor builder_monitor_; + common::ParallelGHistBuilder hist_buffer_; rabit::Reducer histred_; }; diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 4d502370751e..3e64add6360d 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -9,6 +9,98 @@ namespace xgboost { namespace common { +size_t GetNThreads() { + size_t nthreads; + #pragma omp parallel + { + #pragma omp master + nthreads = omp_get_num_threads(); + } + return nthreads; +} + +TEST(ParallelGHistBuilder, Reset) { + constexpr size_t kBins = 10; + constexpr size_t kNodes = 5; + constexpr size_t kNodesExtended = 10; + constexpr size_t kTasksPerNode = 10; + constexpr double kValue = 1.0; + const size_t nthreads = GetNThreads(); + + ParallelGHistBuilder hist_builder; + hist_builder.Init(kBins); + hist_builder.Reset(nthreads, kNodes); + + #pragma omp parallel for + for (int i = 0; i < kNodes * kTasksPerNode; i++) { + const size_t inode = i / kTasksPerNode; + const size_t itask = i % kTasksPerNode; + const size_t tid = omp_get_thread_num(); + + GHistRow hist = hist_builder.GetInitializedHist(tid, inode); + // fill hist by some non-null values + for(size_t j = 0; j < kBins; ++j) { + hist[j].Add(kValue, kValue); + } + } + + // reset and extend buffer + hist_builder.Reset(nthreads, kNodesExtended); + + for(size_t inode = 0; inode < kNodesExtended; inode++) { + for(size_t tid = 0; tid < nthreads; tid++) { + GHistRow hist = hist_builder.GetInitializedHist(tid, inode); + // now each hist should be filled by zeros again + for(size_t i = 0; i < kBins; ++i) { + ASSERT_EQ(0.0, hist[i].GetGrad()); + ASSERT_EQ(0.0, hist[i].GetHess()); + } + } + } +} + +TEST(ParallelGHistBuilder, ReduceHist) { + constexpr size_t kBins = 10; + constexpr size_t kNodes = 5; + constexpr size_t kNodesExtended = 10; + constexpr size_t kTasksPerNode = 10; + constexpr double kValue = 1.0; + const size_t nthreads = GetNThreads(); + + ParallelGHistBuilder hist_builder; + hist_builder.Init(kBins); + hist_builder.Reset(nthreads, kNodes); + + // Simple analog of BuildHist function, works in parallel for both tree-nodes and data in node + #pragma omp parallel for + for(int i = 0; i < kNodes * kTasksPerNode; i++) { + const size_t inode = i / kTasksPerNode; + const size_t itask = i % kTasksPerNode; + const size_t tid = omp_get_thread_num(); + + GHistRow hist = hist_builder.GetInitializedHist(tid, inode); + for(size_t i = 0; i < kBins; ++i) { + hist[i].Add(kValue, kValue); + } + } + + HistCollection collection; + collection.Init(kBins); + + for(size_t inode = 0; inode < kNodes; inode++) { + collection.AddHistRow(inode); + hist_builder.ReduceHist(collection[inode], inode, 0, kBins); + + // We had kTasksPerNode tasks to add kValue to each bin for each node + // So, after reducing we expect to have (kValue * kTasksPerNode) in each node + for(size_t i = 0; i < kBins; ++i) { + ASSERT_EQ(kValue * kTasksPerNode, collection[inode][i].GetGrad()); + ASSERT_EQ(kValue * kTasksPerNode, collection[inode][i].GetHess()); + } + } +} + + TEST(CutsBuilder, SearchGroupInd) { size_t constexpr kNumGroups = 4; size_t constexpr kRows = 17; diff --git a/tests/cpp/common/test_threading_utils.cc b/tests/cpp/common/test_threading_utils.cc index f89d6bdb910e..0d49c490a050 100755 --- a/tests/cpp/common/test_threading_utils.cc +++ b/tests/cpp/common/test_threading_utils.cc @@ -37,7 +37,7 @@ TEST(ParallelFor2d, Test) { return kDim2; }, kGrainSize); - ParallelFor2d(space, [&](size_t i, Range1d r) { + ParallelFor2d(space, 4, [&](size_t i, Range1d r) { for (auto j = r.begin(); j < r.end(); ++j) { matrix[i*kDim2 + j] += 1; } @@ -65,7 +65,7 @@ TEST(ParallelFor2dNonUniform, Test) { working_space[i].resize(dim2[i], 0); } - ParallelFor2d(space, [&](size_t i, Range1d r) { + ParallelFor2d(space, 4, [&](size_t i, Range1d r) { for (auto j = r.begin(); j < r.end(); ++j) { working_space[i][j] += 1; } diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index db89d64869be..657f067997a2 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -107,7 +107,7 @@ class QuantileHistMock : public QuantileHistMaker { GHistIndexBlockMatrix dummy; hist_.AddHistRow(nid); BuildHist(gpair, row_set_collection_[nid], - gmat, dummy, hist_[nid], false); + gmat, dummy, hist_[nid]); // Check if number of histogram bins is correct ASSERT_EQ(hist_[nid].size(), gmat.cut.Ptrs().back()); @@ -149,7 +149,7 @@ class QuantileHistMock : public QuantileHistMaker { hist_.AddHistRow(0); BuildHist(row_gpairs, row_set_collection_[0], - gmat, quantile_index_block, hist_[0], false); + gmat, quantile_index_block, hist_[0]); RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree); @@ -211,7 +211,8 @@ class QuantileHistMock : public QuantileHistMaker { } /* Now compare against result given by EvaluateSplit() */ - ExpandEntry node(0, tree.GetDepth(0), snode_[0].best.loss_chg, 0); + ExpandEntry node(ExpandEntry::kRootNid, ExpandEntry::kEmptyNid, + tree.GetDepth(0), snode_[0].best.loss_chg, 0); RealImpl::EvaluateSplit({node}, gmat, hist_, *(*dmat), tree); ASSERT_EQ(snode_[0].best.SplitIndex(), best_split_feature); ASSERT_EQ(snode_[0].best.split_value, gmat.cut.Values()[best_split_threshold]);