diff --git a/doc/parameter.md b/doc/parameter.md
index f6802fb891d7..fa95aac525da 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -107,6 +107,23 @@ Parameters for Tree Booster
   - This is only used if 'hist' is specified as `tree_method`.
   - Maximum number of discrete bins to bucket continuous features.
   - Increasing this number improves the optimality of splits at the cost of higher computation time.
+* use_columnar_access, [default=1]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - If greater than zero, store a transposed copy of the input matrix for fast columnar access. May increase memory usage and initial setup time.
+* sparse_threshold, [default=0.2]
+  - range: [0.0, 1.0]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - Percentage threshold for treating a feature as sparse. For instance, 0.2 indicates that any feature with fewer than 20% nonzero rows will be considered sparse. May impact computation time slightly.
+* enable_feature_grouping, [default=0]
+  - This is only used if 'hist' is specified as `tree_method`.
+  - If greater than zero, group complementary features together to improve work balance for parallel histogram aggregation. May increase memory usage and initial setup time.
+* max_conflict_rate, [default=0]
+  - range: [0.0, 1.0]
+  - Only relevant when `enable_feature_grouping=1` is specified.
+  - Specifies the criterion for "complementary" features. By default, only features with no common nonzero rows are considered complementary. Increase this number to encourage larger feature groups.
+* max_search_group, [default=100]
+  - Only relevant when `enable_feature_grouping=1` is specified.
+  - Increasing this number will result in better feature grouping, at the cost of greater initial setup time.
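For illustration only (this block is not part of the patch): a minimal sketch of driving the new fast-hist options through the XGBoost C API. The file name `train.libsvm` and the chosen parameter values are hypothetical, and return codes are left unchecked for brevity.

```c++
#include <xgboost/c_api.h>

int main() {
  DMatrixHandle dtrain;
  // hypothetical LIBSVM file; any DMatrix source works
  XGDMatrixCreateFromFile("train.libsvm", 0, &dtrain);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);

  // fast histogram updater plus the new fast-hist options
  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "max_bin", "256");
  XGBoosterSetParam(booster, "use_columnar_access", "1");
  XGBoosterSetParam(booster, "enable_feature_grouping", "1");
  XGBoosterSetParam(booster, "max_conflict_rate", "0.05");
  XGBoosterSetParam(booster, "max_search_group", "200");

  for (int iter = 0; iter < 10; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```

`max_conflict_rate` and `max_search_group` are only consulted when `enable_feature_grouping` is nonzero, matching the documentation above.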
 
 Additional parameters for Dart Booster
 --------------------------------------
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index cbf3a368bc34..6f429f105f9d 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -153,31 +153,36 @@ class ColumnMatrix {
     std::vector<size_t> num_nonzeros;
     num_nonzeros.resize(nfeature);
     std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
-    for (size_t rid = 0; rid < nrow; ++rid) {
-      const size_t ibegin = gmat.row_ptr[rid];
-      const size_t iend = gmat.row_ptr[rid + 1];
-      size_t fid = 0;
-      for (size_t i = ibegin; i < iend; ++i) {
-        const uint32_t bin_id = gmat.index[i];
-        while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
-          ++fid;
-        }
-        if (type_[fid] == kDenseColumn) {
-          XGBOOST_TYPE_SWITCH(this->dtype, {
-            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
-            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
-            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
-          });
-        } else {
-          XGBOOST_TYPE_SWITCH(this->dtype, {
-            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
-            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
-            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
-          });
-          row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
-          ++num_nonzeros[fid];
+
+    const int nthread = omp_get_max_threads();
+    #pragma omp parallel num_threads(nthread)
+    {
+      for (size_t rid = 0; rid < nrow; ++rid) {
+        const size_t ibegin = gmat.row_ptr[rid];
+        const size_t iend = gmat.row_ptr[rid + 1];
+        #pragma omp for schedule(static)
+        for (size_t i = ibegin; i < iend; ++i) {
+          const uint32_t bin_id = gmat.index[i];
+          const auto& vec = gmat.cut->row_ptr;
+          auto it = std::upper_bound(vec.begin(), vec.end(), bin_id);
+          const size_t fid = it - vec.begin() - 1;
+          if (type_[fid] == kDenseColumn) {
+            XGBOOST_TYPE_SWITCH(this->dtype, {
+              const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+              const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+              DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
+              begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
+            });
+          } else {
+            XGBOOST_TYPE_SWITCH(this->dtype, {
+              const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
+              const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
+              DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
+              begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
+            });
+            row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
+            ++num_nonzeros[fid];
+          }
         }
       }
     }
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index ee64d9778a99..c96380bbc7e8 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -12,11 +12,12 @@
 #include "./column_matrix.h"
 #include "./hist_util.h"
 #include "./quantile.h"
+#include "./memory.h"
 
 namespace xgboost {
 namespace common {
 
-void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
+void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose) {
   typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   const MetaInfo& info = p_fmat->info();
 
@@ -33,6 +34,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
     s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
   }
 
+  LOG(INFO) << "Generating sketches...";
   dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
   iter->BeforeFirst();
   while (iter->Next()) {
@@ -55,51 +57,64 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
     }
   }
 
-  // gather the histogram data
-  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
-  std::vector<WXQSketch::SummaryContainer> summary_array;
-  summary_array.resize(sketchs.size());
-  for (size_t i = 0; i < sketchs.size(); ++i) {
-    WXQSketch::SummaryContainer out;
-    sketchs[i].GetSummary(&out);
-    summary_array[i].Reserve(max_num_bins * kFactor);
-    summary_array[i].SetPrune(out, max_num_bins * kFactor);
-  }
-  size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
-  sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
-
   this->min_val.resize(info.num_col);
   row_ptr.push_back(0);
-  for (size_t fid = 0; fid < summary_array.size(); ++fid) {
-    WXQSketch::SummaryContainer a;
-    a.Reserve(max_num_bins);
-    a.SetPrune(summary_array[fid], max_num_bins);
-    const bst_float mval = a.data[0].value;
-    this->min_val[fid] = mval - fabs(mval);
-    if (a.size > 1 && a.size <= 16) {
-      /* specialized code categorial / ordinal data -- use midpoints */
-      for (size_t i = 1; i < a.size; ++i) {
-        bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
-        if (i == 1 || cpt > cut.back()) {
-          cut.push_back(cpt);
+  // gather the histogram data
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  const size_t bundle_size  // limit this task to 1GB
+    = std::min(GetSystemMemory() / 2,
+               static_cast<size_t>(1) * 1024 * 1024 * 1024)
+      / (max_num_bins * kFactor * 16);
+  for (size_t ibegin = 0; ibegin < sketchs.size(); ibegin += bundle_size) {
+    const size_t iend = std::min(ibegin + bundle_size, sketchs.size());
+    const size_t batch_size = iend - ibegin;
+
+    std::vector<WXQSketch::SummaryContainer> summary_array;
+    summary_array.resize(batch_size);
+    if (verbose) {
+      LOG(INFO) << "Computing quantiles for features ["
+                << ibegin << ", " << iend << ")...";
+    }
+    for (size_t i = ibegin; i < iend; ++i) {
+      WXQSketch::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array[i - ibegin].Reserve(max_num_bins * kFactor);
+      summary_array[i - ibegin].SetPrune(out, max_num_bins * kFactor);
+    }
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+    sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
+
+    for (size_t fid = ibegin; fid < iend; ++fid) {
+      WXQSketch::SummaryContainer a;
+      a.Reserve(max_num_bins);
+      a.SetPrune(summary_array[fid - ibegin], max_num_bins);
+      const bst_float mval = a.data[0].value;
+      this->min_val[fid] = mval - fabs(mval);
+      if (a.size > 1 && a.size <= 16) {
+        /* specialized code categorial / ordinal data -- use midpoints */
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
+          if (i == 1 || cpt > cut.back()) {
+            cut.push_back(cpt);
+          }
         }
-      }
-    } else {
-      for (size_t i = 2; i < a.size; ++i) {
-        bst_float cpt = a.data[i - 1].value;
-        if (i == 2 || cpt > cut.back()) {
-          cut.push_back(cpt);
+      } else {
+        for (size_t i = 2; i < a.size; ++i) {
+          bst_float cpt = a.data[i - 1].value;
+          if (i == 2 || cpt > cut.back()) {
+            cut.push_back(cpt);
+          }
         }
       }
+      // push a value that is greater than anything
+      if (a.size != 0) {
+        bst_float cpt = a.data[a.size - 1].value;
+        // this must be bigger than last value in a scale
+        bst_float last = cpt + fabs(cpt);
+        cut.push_back(last);
+      }
+      row_ptr.push_back(cut.size());
     }
-    // push a value that is greater than anything
-    if (a.size != 0) {
-      bst_float cpt = a.data[a.size - 1].value;
-      // this must be bigger than last value in a scale
-      bst_float last = cpt + fabs(cpt);
-      cut.push_back(last);
-    }
-    row_ptr.push_back(cut.size());
   }
 }
 
@@ -296,8 +311,15 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
                       return feature_nnz[a] > feature_nnz[b];
                     });
 
-  auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
-  auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  std::vector<std::vector<unsigned>> groups_alt1, groups_alt2;
+
+  #pragma omp parallel sections
+  {
+    #pragma omp section
+    groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
+    #pragma omp section
+    groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  }
   auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
 
   // take apart small, sparse groups, as it won't help speed
@@ -338,6 +360,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
   cut = gmat.cut;
 
   const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nfeature = gmat.cut->row_ptr.size() - 1;
   const uint32_t nbins = gmat.cut->row_ptr.back();
 
   /* step 1: form feature groups */
@@ -355,10 +378,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
       }
     }
   }
+
+  std::vector<size_t> block_nnz(nblock, 0);
+  {
+    std::vector<size_t> feature_nnz(nfeature);
+    gmat.GetFeatureCounts(&feature_nnz[0]);
+    for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
+      for (auto& fid : groups[group_id]) {
+        block_nnz[group_id] += feature_nnz[fid];
+      }
+    }
+  }
+
   std::vector<std::vector<uint32_t>> index_temp(nblock);
   std::vector<std::vector<size_t>> row_ptr_temp(nblock);
   for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
+    row_ptr_temp[block_id].reserve(nrow + 1);
     row_ptr_temp[block_id].push_back(0);
+    index_temp[block_id].reserve(block_nnz[block_id]);
   }
   for (size_t rid = 0; rid < nrow; ++rid) {
     const size_t ibegin = gmat.row_ptr[rid];
@@ -378,6 +415,16 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
   std::vector<size_t> row_ptr_blk_ptr;
   index_blk_ptr.push_back(0);
   row_ptr_blk_ptr.push_back(0);
+
+  {
+    size_t tot = 0, tot2 = 0;
+    for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
+      tot += index_temp[block_id].size();
+      tot2 += row_ptr_temp[block_id].size();
+    }
+    index.reserve(tot);
+    row_ptr.reserve(tot2);
+  }
   for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
     index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
     row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 4d5456e8523c..e22e986e2b79 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -79,7 +79,7 @@ struct HistCutMatrix {
   }
 
   // create histogram cut matrix given statistics from data
   // using approximate quantile sketch approach
-  void Init(DMatrix* p_fmat, uint32_t max_num_bins);
+  void Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose = false);
 };
 
diff --git a/src/common/memory.h b/src/common/memory.h
new file mode 100644
index 000000000000..5fb9e96a3681
--- /dev/null
+++ b/src/common/memory.h
@@ -0,0 +1,38 @@
+/*!
+ * Copyright 2017 by Contributors
+ * \file memory.h
+ * \brief Utility for memory
+ * \author Philip Cho
+ */
+#ifndef XGBOOST_COMMON_MEMORY_H_
+#define XGBOOST_COMMON_MEMORY_H_
+
+#ifndef _WIN32
+#include <unistd.h>
+#else
+#include <windows.h>
+#endif
+
+namespace xgboost {
+namespace common {
+
+#ifndef _WIN32
+inline unsigned long long GetSystemMemory()
+{
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+}
+#else
+inline unsigned long long GetSystemMemory()
+{
+  MEMORYSTATUSEX status;
+  status.dwLength = sizeof(status);
+  GlobalMemoryStatusEx(&status);
+  return status.ullTotalPhys;
+}
+#endif
+
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_MEMORY_H_
diff --git a/src/tree/fast_hist_param.h b/src/tree/fast_hist_param.h
index 5ca9e0b5e76a..633a21552162 100644
--- a/src/tree/fast_hist_param.h
+++ b/src/tree/fast_hist_param.h
@@ -18,8 +18,10 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
   // percentage threshold for treating a feature as sparse
   // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
   double sparse_threshold;
-  // use feature grouping? (default yes)
+  // use feature grouping? (default no)
   int enable_feature_grouping;
+  // use columnar access structure? (default yes)
+  int use_columnar_access;
   // when grouping features, how many "conflicts" to allow.
   // conflict is when an instance has nonzero values for two or more features
   // default is 0, meaning features should be strictly complementary
@@ -45,7 +47,9 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
     DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
       .describe("if >0, enable feature grouping to ameliorate work imbalance "
                 "among worker threads");
-    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+    DMLC_DECLARE_FIELD(use_columnar_access).set_lower_bound(0).set_default(1)
+      .describe("if >0, store a transposed copy of input matrix for fast columnar access");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
       .describe("when grouping features, how many \"conflicts\" to allow."
                 "conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary."); diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc index 3f1c6c5ee98c..fff2806e7c92 100644 --- a/src/tree/updater_fast_hist.cc +++ b/src/tree/updater_fast_hist.cc @@ -60,17 +60,37 @@ class FastHistMaker: public TreeUpdater { const std::vector& trees) override { TStats::CheckInfo(dmat->info()); if (is_gmat_initialized_ == false) { + if (fhparam.use_columnar_access == 0) { + CHECK_EQ(fhparam.enable_feature_grouping, 0) + << "Feature grouping requires columnar access structure"; + } double tstart = dmlc::GetTime(); - hmat_.Init(dmat, static_cast(param.max_bin)); + hmat_.Init(dmat, static_cast(param.max_bin), param.debug_verbose); + if (param.debug_verbose > 0) { + LOG(INFO) << "Quantizing data matrix entries into quantile indices..."; + } gmat_.cut = &hmat_; gmat_.Init(dmat); - column_matrix_.Init(gmat_, fhparam); + if (param.debug_verbose > 0) { + LOG(INFO) << "Generating columnar access structure..."; + } + if (fhparam.use_columnar_access > 0) { + column_matrix_.Init(gmat_, fhparam); + } if (fhparam.enable_feature_grouping > 0) { + if (param.debug_verbose > 0) { + LOG(INFO) << "Grouping features together..."; + } gmatb_.Init(gmat_, column_matrix_, fhparam); + // free up memory by deleting gmat; only gmatb will be used + gmat_.row_ptr.clear(); + gmat_.index.clear(); + gmat_.hit_count.clear(); } is_gmat_initialized_ = true; if (param.debug_verbose > 0) { - LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec"; + LOG(INFO) << "Done initializing training: " + << dmlc::GetTime() - tstart << " sec"; } } // rescale learning rate according to size of trees @@ -191,7 +211,8 @@ class FastHistMaker: public TreeUpdater { (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate); } else { tstart = dmlc::GetTime(); - this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree); + this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree, + fhparam.use_columnar_access); time_apply_split += dmlc::GetTime() - tstart; tstart = dmlc::GetTime(); @@ -494,19 +515,8 @@ class FastHistMaker: public TreeUpdater { const ColumnMatrix& column_matrix, const HistCollection& hist, const DMatrix& fmat, - RegTree* p_tree) { - XGBOOST_TYPE_SWITCH(column_matrix.dtype, { - ApplySplit_(nid, gmat, column_matrix, hist, fmat, p_tree); - }); - } - - template - inline void ApplySplit_(int nid, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - const HistCollection& hist, - const DMatrix& fmat, - RegTree* p_tree) { + RegTree* p_tree, + bool use_columnar_access) { // TODO(hcho3): support feature sampling by levels /* 1. 
Create child nodes */ @@ -544,18 +554,38 @@ class FastHistMaker: public TreeUpdater { } const auto& rowset = row_set_collection_[nid]; + if (use_columnar_access) { + XGBOOST_TYPE_SWITCH(column_matrix.dtype, { + ApplySplit_(rowset, gmat, &row_split_tloc_, column_matrix, fid, lower_bound, + upper_bound, split_cond, default_left); + }); + } else { + ApplySplitSparseDataOld(rowset, gmat, &row_split_tloc_, lower_bound, upper_bound, + split_cond, default_left); + } + + row_set_collection_.AddSplit( + nid, row_split_tloc_, (*p_tree)[nid].cleft(), (*p_tree)[nid].cright()); + } + template + inline void ApplySplit_(const RowSetCollection::Elem rowset, + const GHistIndexMatrix& gmat, + std::vector* p_row_split_tloc, + const ColumnMatrix& column_matrix, + bst_uint fid, + bst_uint lower_bound, + bst_uint upper_bound, + bst_int split_cond, + bool default_left) { Column column = column_matrix.GetColumn(fid); if (column.type == xgboost::common::kDenseColumn) { - ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond, + ApplySplitDenseData(rowset, gmat, p_row_split_tloc, column, split_cond, default_left); } else { - ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound, + ApplySplitSparseData(rowset, gmat, p_row_split_tloc, column, lower_bound, upper_bound, split_cond, default_left); } - - row_set_collection_.AddSplit( - nid, row_split_tloc_, (*p_tree)[nid].cleft(), (*p_tree)[nid].cright()); } template diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index 80e8b5495d38..19924cc14a7a 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -405,6 +405,7 @@ class CQHistMaker: public HistMaker { const size_t work_set_size = work_set.size(); sketchs.resize(this->qexpand.size() * work_set_size); + LOG(INFO) << "sketchs.resize(" << this->qexpand.size() * work_set_size << ")"; for (size_t i = 0; i < sketchs.size(); ++i) { sketchs[i].Init(info.num_row, this->param.sketch_eps); } @@ -415,6 +416,9 @@ class CQHistMaker: public HistMaker { for (size_t i = 0; i < sketchs.size(); ++i) { summary_array[i].Reserve(max_size); } + LOG(INFO) << "sketchs.size() = " << sketchs.size(); + LOG(INFO) << "Will be Reserving " << (size_t)max_size * sketchs.size() * 16 << " bytes total"; + LOG(INFO) << "Each sketch costs " << (size_t)max_size * 16 << " bytes"; { // get smmary thread_sketch.resize(omp_get_max_threads());
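To make the sizing logic above concrete: the batched summary reduction in `HistCutMatrix::Init`, together with the new `GetSystemMemory()` helper, caps each Allreduce batch at min(half of system RAM, 1 GB), assuming roughly 16 bytes per pruned summary entry. The standalone sketch below (not from the patch) reproduces that arithmetic; the 16 GB machine size and `kFactor = 8` are illustrative assumptions.

```c++
#include <algorithm>
#include <cstddef>
#include <iostream>

// Stand-in for xgboost::common::GetSystemMemory(); assume a 16 GB machine here.
static std::size_t GetSystemMemoryStub() {
  return static_cast<std::size_t>(16) * 1024 * 1024 * 1024;
}

int main() {
  const std::size_t max_num_bins = 256;   // i.e. max_bin
  const std::size_t kFactor = 8;          // assumed value of kFactor in hist_util.cc
  const std::size_t bytes_per_entry = 16; // approximate size of one summary entry

  // Cap each Allreduce batch at min(half of system memory, 1 GB), then convert
  // the byte budget into a number of features per batch.
  const std::size_t budget =
      std::min(GetSystemMemoryStub() / 2,
               static_cast<std::size_t>(1) * 1024 * 1024 * 1024);
  const std::size_t bundle_size = budget / (max_num_bins * kFactor * bytes_per_entry);

  std::cout << "features per sketch batch: " << bundle_size << std::endl;
  return 0;
}
```

Under these assumptions a batch covers 32768 features, so datasets with fewer features are still reduced in a single pass.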