diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 776752476054..2bf6070d5e89 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -634,12 +634,25 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { }); CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1); max_values.resize(d_in_columns_ptr.size() - 1); + + // In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep + // track of the unique keys (feature indices) after the `thrust::reduce_by_key` call. + dh::caching_device_vector d_max_keys(d_in_columns_ptr.size() - 1); dh::caching_device_vector d_max_values(d_in_columns_ptr.size() - 1); - thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, - thrust::make_discard_iterator(), d_max_values.begin(), - thrust::equal_to{}, - [] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); - dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values)); + auto new_end = thrust::reduce_by_key( + thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(), + d_max_values.begin(), thrust::equal_to{}, + [] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); + d_max_keys.erase(new_end.first, d_max_keys.end()); + d_max_values.erase(new_end.second, d_max_values.end()); + + // The device vector needs to be initialized explicitly since we may have some missing columns. 
+ SketchEntry default_entry{}; + dh::caching_device_vector d_max_results(d_in_columns_ptr.size() - 1, + default_entry); + thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(), + d_max_keys.begin(), d_max_results.begin()); + dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results)); auto max_it = MakeIndexTransformIter([&](auto i) { if (IsCat(h_feature_types, i)) { return max_values[i].value; diff --git a/src/common/quantile.h b/src/common/quantile.h index 48758b8dce1e..47db5f875af5 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -35,13 +35,13 @@ struct WQSummary { /*! \brief an entry in the sketch summary */ struct Entry { /*! \brief minimum rank */ - RType rmin; + RType rmin{}; /*! \brief maximum rank */ - RType rmax; + RType rmax{}; /*! \brief maximum weight */ - RType wmin; + RType wmin{}; /*! \brief the value of data */ - DType value; + DType value{}; // constructor XGBOOST_DEVICE Entry() {} // NOLINT // constructor diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 57c9da7038c6..49353439f21a 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -339,6 +339,31 @@ TEST(GPUQuantile, MultiMerge) { }); } +TEST(GPUQuantile, MissingColumns) { + auto dmat = std::unique_ptr{[=]() { + std::size_t constexpr kRows = 1000, kCols = 100; + auto sparsity = 0.5f; + std::vector ft(kCols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? 
FeatureType::kNumerical : FeatureType::kCategorical; + } + auto dmat = RandomDataGenerator{kRows, kCols, sparsity} + .Seed(0) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + return dmat->SliceCol(2, 1); + }()}; + dmat->Info().data_split_mode = DataSplitMode::kRow; + + auto ctx = MakeCUDACtx(0); + std::size_t constexpr kBins = 64; + HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins); + ASSERT_TRUE(cuts.HasCategorical()); +} + namespace { void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); @@ -422,18 +447,14 @@ TEST_F(MGPUQuantileTest, AllReduceBasic) { } namespace { -void TestColumnSplitBasic() { +void TestColumnSplit(DMatrix* dmat) { auto const world = collective::GetWorldSize(); auto const rank = collective::GetRank(); - std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64; - - auto m = std::unique_ptr{[=]() { - auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - return dmat->SliceCol(world, rank); - }()}; + auto m = std::unique_ptr{dmat->SliceCol(world, rank)}; // Generate cuts for distributed environment. auto ctx = MakeCUDACtx(GPUIDX); + std::size_t constexpr kBins = 64; HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); // Generate cuts for single node environment @@ -466,7 +487,26 @@ void TestColumnSplitBasic() { } // anonymous namespace TEST_F(MGPUQuantileTest, ColumnSplitBasic) { - DoTest(TestColumnSplitBasic); + std::size_t constexpr kRows = 1000, kCols = 100; + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + DoTest(TestColumnSplit, dmat.get()); +} + +TEST_F(MGPUQuantileTest, ColumnSplitCategorical) { + std::size_t constexpr kRows = 1000, kCols = 100; + auto sparsity = 0.5f; + std::vector ft(kCols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? 
FeatureType::kNumerical : FeatureType::kCategorical; + } + auto dmat = RandomDataGenerator{kRows, kCols, sparsity} + .Seed(0) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + DoTest(TestColumnSplit, dmat.get()); } namespace {