diff --git a/doc/parameter.md b/doc/parameter.md
index f6802fb891d7..fa95aac525da 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -107,6 +107,23 @@ Parameters for Tree Booster
   - This is only used if 'hist' is specified as `tree_method`.
   - Maximum number of discrete bins to bucket continuous features.
   - Increasing this number improves the optimality of splits at the cost of higher computation time.
+* use_columnar_access, [default=1]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - If greater than zero, store a transposed copy of the input matrix for fast columnar access. May increase memory usage and initial setup time.
+* sparse_threshold, [default=0.2]
+  - range: [0.0, 1.0]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - Percentage threshold for treating a feature as sparse. For instance, 0.2 indicates that any feature with fewer than 20% nonzero rows will be considered sparse. May impact computation time slightly.
+* enable_feature_grouping, [default=0]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - If greater than zero, group complementary features together to improve work balance for parallel histogram aggregation. May increase memory usage and initial setup time.
+* max_conflict_rate, [default=0]
+  - range: [0.0, 1.0]
+  - Only relevant when `enable_feature_grouping=1` is specified.
+  - Specifies the criterion for "complementary" features. By default, only features with no common nonzero rows are considered complementary. Increase this number to encourage larger feature groups.
+* max_search_group, [default=100]
+  - Only relevant when `enable_feature_grouping=1` is specified.
+  - Increasing this number will result in better feature grouping, at the cost of greater initial setup time.
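For illustration only (this block is not part of the patch): a minimal sketch of driving the new fast-hist options through the XGBoost C API. The file name `train.libsvm` and the chosen parameter values are hypothetical, and return codes are left unchecked for brevity.

```c++
#include <xgboost/c_api.h>

int main() {
  DMatrixHandle dtrain;
  // hypothetical LIBSVM file; any DMatrix source works
  XGDMatrixCreateFromFile("train.libsvm", 0, &dtrain);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);

  // fast histogram updater plus the new fast-hist options
  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "max_bin", "256");
  XGBoosterSetParam(booster, "use_columnar_access", "1");
  XGBoosterSetParam(booster, "enable_feature_grouping", "1");
  XGBoosterSetParam(booster, "max_conflict_rate", "0.05");
  XGBoosterSetParam(booster, "max_search_group", "200");

  for (int iter = 0; iter < 10; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```

`max_conflict_rate` and `max_search_group` are only consulted when `enable_feature_grouping` is nonzero, matching the documentation above.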
 
 Additional parameters for Dart Booster
 --------------------------------------
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index cbf3a368bc34..6f429f105f9d 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -153,31 +153,36 @@ class ColumnMatrix {
     std::vector<size_t> num_nonzeros;
     num_nonzeros.resize(nfeature);
     std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
-    for (size_t rid = 0; rid < nrow; ++rid) {
-      const size_t ibegin = gmat.row_ptr[rid];
-      const size_t iend = gmat.row_ptr[rid + 1];
-      size_t fid = 0;
-      for (size_t i = ibegin; i < iend; ++i) {
-        const uint32_t bin_id = gmat.index[i];
-        while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
-          ++fid;
-        }
-        if (type_[fid] == kDenseColumn) {
-          XGBOOST_TYPE_SWITCH(this->dtype, {
-            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
-            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
-            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
-          });
-        } else {
-          XGBOOST_TYPE_SWITCH(this->dtype, {
-            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
-            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
-            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
-          });
-          row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
-          ++num_nonzeros[fid];
+
+    const int nthread = omp_get_max_threads();
+    #pragma omp parallel num_threads(nthread)
+    {
+      for (size_t rid = 0; rid < nrow; ++rid) {
+        const size_t ibegin = gmat.row_ptr[rid];
+        const size_t iend = gmat.row_ptr[rid + 1];
+        #pragma omp for schedule(static)
+        for (size_t i = ibegin; i < iend; ++i) {
+          const uint32_t bin_id = gmat.index[i];
+          const auto& vec = gmat.cut->row_ptr;
+          auto it = std::upper_bound(vec.begin(), vec.end(), bin_id);
+          const size_t fid = it - vec.begin() - 1;
+          if (type_[fid] == kDenseColumn) {
+            XGBOOST_TYPE_SWITCH(this->dtype, {
+              const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+              const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+              DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
+              begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
+            });
+          } else {
+            XGBOOST_TYPE_SWITCH(this->dtype, {
+              const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+              const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+              DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
+              begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
+            });
+            row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
+            ++num_nonzeros[fid];
+          }
         }
       }
     }
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index ee64d9778a99..c96380bbc7e8 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -12,11 +12,12 @@
 #include "./column_matrix.h"
 #include "./hist_util.h"
 #include "./quantile.h"
+#include "./memory.h"
 
 namespace xgboost {
 namespace common {
 
-void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
+void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose) {
   typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   const MetaInfo& info = p_fmat->info();
 
@@ -33,6 +34,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
     s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
   }
 
+  LOG(INFO) << "Generating sketches...";
   dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
   iter->BeforeFirst();
   while (iter->Next()) {
@@ -55,51 +57,64 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
     }
   }
 
-  // gather the histogram data
-  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
-  std::vector<WXQSketch::SummaryContainer> summary_array;
-  summary_array.resize(sketchs.size());
-  for (size_t i = 0; i < sketchs.size(); ++i) {
-    WXQSketch::SummaryContainer out;
-    sketchs[i].GetSummary(&out);
-    summary_array[i].Reserve(max_num_bins * kFactor);
-    summary_array[i].SetPrune(out, max_num_bins * kFactor);
-  }
-  size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
-  sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
-
   this->min_val.resize(info.num_col);
   row_ptr.push_back(0);
-  for (size_t fid = 0; fid < summary_array.size(); ++fid) {
-    WXQSketch::SummaryContainer a;
-    a.Reserve(max_num_bins);
-    a.SetPrune(summary_array[fid], max_num_bins);
-    const bst_float mval = a.data[0].value;
-    this->min_val[fid] = mval - fabs(mval);
-    if (a.size > 1 && a.size <= 16) {
-      /* specialized code categorial / ordinal data -- use midpoints */
-      for (size_t i = 1; i < a.size; ++i) {
-        bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
-        if (i == 1 || cpt > cut.back()) {
-          cut.push_back(cpt);
+  // gather the histogram data
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  const size_t bundle_size  // limit this task to 1GB
+    = std::min(GetSystemMemory() / 2,
+               static_cast<size_t>(1) * 1024 * 1024 * 1024)
+      / (max_num_bins * kFactor * 16);
+  for (size_t ibegin = 0; ibegin < sketchs.size(); ibegin += bundle_size) {
+    const size_t iend = std::min(ibegin + bundle_size, sketchs.size());
+    const size_t batch_size = iend - ibegin;
+
+    std::vector<WXQSketch::SummaryContainer> summary_array;
+    summary_array.resize(batch_size);
+    if (verbose) {
+      LOG(INFO) << "Computing quantiles for features ["
+                << ibegin << ", " << iend << ")...";
+    }
+    for (size_t i = ibegin; i < iend; ++i) {
+      WXQSketch::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array[i - ibegin].Reserve(max_num_bins * kFactor);
+      summary_array[i - ibegin].SetPrune(out, max_num_bins * kFactor);
+    }
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+    sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
+
+    for (size_t fid = ibegin; fid < iend; ++fid) {
+      WXQSketch::SummaryContainer a;
+      a.Reserve(max_num_bins);
+      a.SetPrune(summary_array[fid - ibegin], max_num_bins);
+      const bst_float mval = a.data[0].value;
+      this->min_val[fid] = mval - fabs(mval);
+      if (a.size > 1 && a.size <= 16) {
+        /* specialized code categorial / ordinal data -- use midpoints */
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
+          if (i == 1 || cpt > cut.back()) {
+            cut.push_back(cpt);
+          }
         }
-      }
-    } else {
-      for (size_t i = 2; i < a.size; ++i) {
-        bst_float cpt = a.data[i - 1].value;
-        if (i == 2 || cpt > cut.back()) {
-          cut.push_back(cpt);
+      } else {
+        for (size_t i = 2; i < a.size; ++i) {
+          bst_float cpt = a.data[i - 1].value;
+          if (i == 2 || cpt > cut.back()) {
+            cut.push_back(cpt);
+          }
         }
       }
+      // push a value that is greater than anything
+      if (a.size != 0) {
+        bst_float cpt = a.data[a.size - 1].value;
+        // this must be bigger than last value in a scale
+        bst_float last = cpt + fabs(cpt);
+        cut.push_back(last);
+      }
+      row_ptr.push_back(cut.size());
     }
-    // push a value that is greater than anything
-    if (a.size != 0) {
-      bst_float cpt = a.data[a.size - 1].value;
-      // this must be bigger than last value in a scale
-      bst_float last = cpt + fabs(cpt);
-      cut.push_back(last);
-    }
-    row_ptr.push_back(cut.size());
   }
 }
 
@@ -296,8 +311,15 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
                       return feature_nnz[a] > feature_nnz[b];
                     });
 
-  auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
-  auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  std::vector<std::vector<unsigned>> groups_alt1, groups_alt2;
+
+  #pragma omp parallel sections
+  {
+    #pragma omp section
+    groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
+    #pragma omp section
+    groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  }
   auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
 
   // take apart small, sparse groups, as it won't help speed
@@ -338,6 +360,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
   cut = gmat.cut;
 
   const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nfeature = gmat.cut->row_ptr.size() - 1;
   const uint32_t nbins = gmat.cut->row_ptr.back();
 
   /* step 1: form feature groups */
@@ -355,10 +378,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
       }
     }
   }
+
+  std::vector<size_t> block_nnz(nblock, 0);
+  {
+    std::vector<size_t> feature_nnz(nfeature);
+    gmat.GetFeatureCounts(&feature_nnz[0]);
+    for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
+      for (auto& fid : groups[group_id]) {
+        block_nnz[group_id] += feature_nnz[fid];
+      }
+    }
+  }
+
   std::vector<std::vector<uint32_t>> index_temp(nblock);
   std::vector<std::vector<size_t>> row_ptr_temp(nblock);
   for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
+    row_ptr_temp[block_id].reserve(nrow + 1);
     row_ptr_temp[block_id].push_back(0);
+    index_temp[block_id].reserve(block_nnz[block_id]);
   }
   for (size_t rid = 0; rid < nrow; ++rid) {
     const size_t ibegin = gmat.row_ptr[rid];
@@ -378,6 +415,16 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
   std::vector<size_t> row_ptr_blk_ptr;
   index_blk_ptr.push_back(0);
   row_ptr_blk_ptr.push_back(0);
+
+  {
+    size_t tot = 0, tot2 = 0;
+    for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
+      tot += index_temp[block_id].size();
+      tot2 += row_ptr_temp[block_id].size();
+    }
+    index.reserve(tot);
+    row_ptr.reserve(tot2);
+  }
   for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
     index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
     row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 4d5456e8523c..e22e986e2b79 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -79,7 +79,7 @@ struct HistCutMatrix {
   }
 
   // create histogram cut matrix given statistics from data
   // using approximate quantile sketch approach
-  void Init(DMatrix* p_fmat, uint32_t max_num_bins);
+  void Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose = false);
 };
 
diff --git a/src/common/memory.h b/src/common/memory.h
new file mode 100644
index 000000000000..5fb9e96a3681
--- /dev/null
+++ b/src/common/memory.h
@@ -0,0 +1,38 @@
+/*!
+ * Copyright 2017 by Contributors
+ * \file memory.h
+ * \brief Utility for memory
+ * \author Philip Cho
+ */
+#ifndef XGBOOST_COMMON_MEMORY_H_
+#define XGBOOST_COMMON_MEMORY_H_
+
+#ifndef _WIN32
+#include <unistd.h>
+#else
+#include <windows.h>
+#endif
+
+namespace xgboost {
+namespace common {
+
+#ifndef _WIN32
+inline unsigned long long GetSystemMemory()
+{
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+}
+#else
+inline unsigned long long GetSystemMemory()
+{
+  MEMORYSTATUSEX status;
+  status.dwLength = sizeof(status);
+  GlobalMemoryStatusEx(&status);
+  return status.ullTotalPhys;
+}
+#endif
+
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_MEMORY_H_
diff --git a/src/tree/fast_hist_param.h b/src/tree/fast_hist_param.h
index 5ca9e0b5e76a..633a21552162 100644
--- a/src/tree/fast_hist_param.h
+++ b/src/tree/fast_hist_param.h
@@ -18,8 +18,10 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
   // percentage threshold for treating a feature as sparse
   // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
   double sparse_threshold;
-  // use feature grouping? (default yes)
+  // use feature grouping? (default no)
   int enable_feature_grouping;
+  // use columnar access structure? (default yes)
+  int use_columnar_access;
   // when grouping features, how many "conflicts" to allow.
   // conflict is when an instance has nonzero values for two or more features
   // default is 0, meaning features should be strictly complementary
@@ -45,7 +47,9 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
     DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
       .describe("if >0, enable feature grouping to ameliorate work imbalance "
                 "among worker threads");
-    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+    DMLC_DECLARE_FIELD(use_columnar_access).set_lower_bound(0).set_default(1)
+      .describe("if >0, store a transposed copy of input matrix for fast columnar access");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
       .describe("when grouping features, how many \"conflicts\" to allow."
                 "conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary."); diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc index 3f1c6c5ee98c..fff2806e7c92 100644 --- a/src/tree/updater_fast_hist.cc +++ b/src/tree/updater_fast_hist.cc @@ -60,17 +60,37 @@ class FastHistMaker: public TreeUpdater { const std::vector& trees) override { TStats::CheckInfo(dmat->info()); if (is_gmat_initialized_ == false) { + if (fhparam.use_columnar_access == 0) { + CHECK_EQ(fhparam.enable_feature_grouping, 0) + << "Feature grouping requires columnar access structure"; + } double tstart = dmlc::GetTime(); - hmat_.Init(dmat, static_cast(param.max_bin)); + hmat_.Init(dmat, static_cast(param.max_bin), param.debug_verbose); + if (param.debug_verbose > 0) { + LOG(INFO) << "Quantizing data matrix entries into quantile indices..."; + } gmat_.cut = &hmat_; gmat_.Init(dmat); - column_matrix_.Init(gmat_, fhparam); + if (param.debug_verbose > 0) { + LOG(INFO) << "Generating columnar access structure..."; + } + if (fhparam.use_columnar_access > 0) { + column_matrix_.Init(gmat_, fhparam); + } if (fhparam.enable_feature_grouping > 0) { + if (param.debug_verbose > 0) { + LOG(INFO) << "Grouping features together..."; + } gmatb_.Init(gmat_, column_matrix_, fhparam); + // free up memory by deleting gmat; only gmatb will be used + gmat_.row_ptr.clear(); + gmat_.index.clear(); + gmat_.hit_count.clear(); } is_gmat_initialized_ = true; if (param.debug_verbose > 0) { - LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec"; + LOG(INFO) << "Done initializing training: " + << dmlc::GetTime() - tstart << " sec"; } } // rescale learning rate according to size of trees @@ -191,7 +211,8 @@ class FastHistMaker: public TreeUpdater { (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate); } else { tstart = dmlc::GetTime(); - this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree); + this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree, + fhparam.use_columnar_access); time_apply_split += dmlc::GetTime() - tstart; tstart = dmlc::GetTime(); @@ -494,19 +515,8 @@ class FastHistMaker: public TreeUpdater { const ColumnMatrix& column_matrix, const HistCollection& hist, const DMatrix& fmat, - RegTree* p_tree) { - XGBOOST_TYPE_SWITCH(column_matrix.dtype, { - ApplySplit_(nid, gmat, column_matrix, hist, fmat, p_tree); - }); - } - - template - inline void ApplySplit_(int nid, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - const HistCollection& hist, - const DMatrix& fmat, - RegTree* p_tree) { + RegTree* p_tree, + bool use_columnar_access) { // TODO(hcho3): support feature sampling by levels /* 1. 
Create child nodes */ @@ -544,18 +554,38 @@ class FastHistMaker: public TreeUpdater { } const auto& rowset = row_set_collection_[nid]; + if (use_columnar_access) { + XGBOOST_TYPE_SWITCH(column_matrix.dtype, { + ApplySplit_(rowset, gmat, &row_split_tloc_, column_matrix, fid, lower_bound, + upper_bound, split_cond, default_left); + }); + } else { + ApplySplitSparseDataOld(rowset, gmat, &row_split_tloc_, lower_bound, upper_bound, + split_cond, default_left); + } + + row_set_collection_.AddSplit( + nid, row_split_tloc_, (*p_tree)[nid].cleft(), (*p_tree)[nid].cright()); + } + template + inline void ApplySplit_(const RowSetCollection::Elem rowset, + const GHistIndexMatrix& gmat, + std::vector* p_row_split_tloc, + const ColumnMatrix& column_matrix, + bst_uint fid, + bst_uint lower_bound, + bst_uint upper_bound, + bst_int split_cond, + bool default_left) { Column column = column_matrix.GetColumn(fid); if (column.type == xgboost::common::kDenseColumn) { - ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond, + ApplySplitDenseData(rowset, gmat, p_row_split_tloc, column, split_cond, default_left); } else { - ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound, + ApplySplitSparseData(rowset, gmat, p_row_split_tloc, column, lower_bound, upper_bound, split_cond, default_left); } - - row_set_collection_.AddSplit( - nid, row_split_tloc_, (*p_tree)[nid].cleft(), (*p_tree)[nid].cright()); } template diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index 80e8b5495d38..19924cc14a7a 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -405,6 +405,7 @@ class CQHistMaker: public HistMaker { const size_t work_set_size = work_set.size(); sketchs.resize(this->qexpand.size() * work_set_size); + LOG(INFO) << "sketchs.resize(" << this->qexpand.size() * work_set_size << ")"; for (size_t i = 0; i < sketchs.size(); ++i) { sketchs[i].Init(info.num_row, this->param.sketch_eps); } @@ -415,6 +416,9 @@ class CQHistMaker: public HistMaker { for (size_t i = 0; i < sketchs.size(); ++i) { summary_array[i].Reserve(max_size); } + LOG(INFO) << "sketchs.size() = " << sketchs.size(); + LOG(INFO) << "Will be Reserving " << (size_t)max_size * sketchs.size() * 16 << " bytes total"; + LOG(INFO) << "Each sketch costs " << (size_t)max_size * 16 << " bytes"; { // get smmary thread_sketch.resize(omp_get_max_threads());
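To make the sizing logic above concrete: the batched summary reduction in `HistCutMatrix::Init`, together with the new `GetSystemMemory()` helper, caps each Allreduce batch at min(half of system RAM, 1 GB), assuming roughly 16 bytes per pruned summary entry. The standalone sketch below (not from the patch) reproduces that arithmetic; the 16 GB machine size and `kFactor = 8` are illustrative assumptions.

```c++
#include <algorithm>
#include <cstddef>
#include <iostream>

// Stand-in for xgboost::common::GetSystemMemory(); assume a 16 GB machine here.
static std::size_t GetSystemMemoryStub() {
  return static_cast<std::size_t>(16) * 1024 * 1024 * 1024;
}

int main() {
  const std::size_t max_num_bins = 256;   // i.e. max_bin
  const std::size_t kFactor = 8;          // assumed value of kFactor in hist_util.cc
  const std::size_t bytes_per_entry = 16; // approximate size of one summary entry

  // Cap each Allreduce batch at min(half of system memory, 1 GB), then convert
  // the byte budget into a number of features per batch.
  const std::size_t budget =
      std::min(GetSystemMemoryStub() / 2,
               static_cast<std::size_t>(1) * 1024 * 1024 * 1024);
  const std::size_t bundle_size = budget / (max_num_bins * kFactor * bytes_per_entry);

  std::cout << "features per sketch batch: " << bundle_size << std::endl;
  return 0;
}
```

Under these assumptions a batch covers 32768 features, so datasets with fewer features are still reduced in a single pass.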