Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup to prepare for using mmap pointer in external memory. #9317

Merged
merged 3 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/collective/nccl_device_communicator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {

namespace {
ncclDataType_t GetNcclDataType(DataType const &data_type) {
ncclDataType_t result;
ncclDataType_t result{ncclInt8};
switch (data_type) {
case DataType::kInt8:
result = ncclInt8;
Expand Down Expand Up @@ -108,7 +108,7 @@ bool IsBitwiseOp(Operation const &op) {
}

ncclRedOp_t GetNcclRedOp(Operation const &op) {
ncclRedOp_t result;
ncclRedOp_t result{ncclMax};
switch (op) {
case Operation::kMax:
result = ncclMax;
Expand Down
94 changes: 55 additions & 39 deletions src/common/bitfield.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file bitfield.h
*/
#ifndef XGBOOST_COMMON_BITFIELD_H_
Expand Down Expand Up @@ -50,14 +50,17 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
}
#endif // defined(__CUDACC__)

/*!
* \brief A non-owning type with auxiliary methods defined for manipulating bits.
/**
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
*
* \tparam Direction Whether the bits start from left or from right.
* @tparam VT Underlying value type, must be an unsigned integer.
* @tparam Direction Whether the bits start from left or from right.
* @tparam IsConst Whether the view is const.
*/
template <typename VT, typename Direction, bool IsConst = false>
struct BitFieldContainer {
using value_type = std::conditional_t<IsConst, VT const, VT>; // NOLINT
using size_type = size_t; // NOLINT
using index_type = size_t; // NOLINT
using pointer = value_type*; // NOLINT

Expand All @@ -70,8 +73,9 @@ struct BitFieldContainer {
};

private:
common::Span<value_type> bits_;
static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");
value_type* bits_{nullptr};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why we should use a raw pointer here? You mentioned the use of std::vector<bool> in ColumnMatrix. Does the span behave badly when value_type == bool ?

Copy link
Member Author

@trivialfis trivialfis Jun 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used in hot loops, the check used in span can be expensive.

We won't be using std::vector<bool> after #9315 . In that PR, I have defined a reference counted view that's a replacement for std::vector, with the ability that we can swap out the underlying storage when needed. It doesn't have the bool specialization, therefore, an additional bitfield is used.

size_type n_values_{0};
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");

public:
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
Expand All @@ -86,13 +90,15 @@ struct BitFieldContainer {

public:
BitFieldContainer() = default;
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
: bits_{bits.data()}, n_values_{bits.size()} {}
BitFieldContainer(BitFieldContainer const& other) = default;
BitFieldContainer(BitFieldContainer&& other) = default;
BitFieldContainer &operator=(BitFieldContainer const &that) = default;
BitFieldContainer &operator=(BitFieldContainer &&that) = default;

XGBOOST_DEVICE common::Span<value_type> Bits() { return bits_; }
XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }
XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }

/*\brief Compute the size of needed memory allocation. The returned value is in terms
* of number of elements with `BitFieldContainer::value_type'.
Expand All @@ -103,93 +109,103 @@ struct BitFieldContainer {
#if defined(__CUDA_ARCH__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
if (tid < min_size) {
bits_[tid] |= rhs.bits_[tid];
Data()[tid] |= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] |= rhs.bits_[i];
Data()[i] |= rhs.Data()[i];
}
return *this;
}
#endif // #if defined(__CUDA_ARCH__)

#if defined(__CUDA_ARCH__)
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < min_size) {
bits_[tid] &= rhs.bits_[tid];
Data()[tid] &= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] &= rhs.bits_[i];
Data()[i] &= rhs.Data()[i];
}
return *this;
}
#endif // defined(__CUDA_ARCH__)

#if defined(__CUDA_ARCH__)
__device__ auto Set(index_type pos) {
__device__ auto Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicOr(reinterpret_cast<Type *>(&value), set_bit);
}
__device__ void Clear(index_type pos) {
__device__ void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
}
#else
void Set(index_type pos) {
void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
value |= set_bit;
}
void Clear(index_type pos) {
void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit;
}
#endif // defined(__CUDA_ARCH__)

XGBOOST_DEVICE bool Check(Pos pos_v) const {
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
pos_v = Direction::Shift(pos_v);
SPAN_LT(pos_v.int_pos, bits_.size());
value_type const value = bits_[pos_v.int_pos];
assert(pos_v.int_pos < NumValues());
value_type const value = Data()[pos_v.int_pos];
value_type const test_bit = kOne << pos_v.bit_pos;
value_type result = test_bit & value;
return static_cast<bool>(result);
}
XGBOOST_DEVICE bool Check(index_type pos) const {
[[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
Pos pos_v = ToBitPos(pos);
return Check(pos_v);
}
/**
* @brief Returns the total number of bits that can be viewed. This is equal to or
* larger than the acutal number of valid bits.
*/
[[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
return kValueSize * NumValues();
}
/**
* @brief Number of storage unit used in this bit field.
*/
[[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }

XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }

XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }

inline friend std::ostream &
operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits " << "storage size: " << field.bits_.size() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
inline friend std::ostream& operator<<(std::ostream& os,
BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits "
<< "storage size: " << field.NumValues() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
os << bset << "\n";
}
return os;
Expand Down
5 changes: 2 additions & 3 deletions src/common/categorical.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023, XGBoost Contributors
* \file categorical.h
*/
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
Expand All @@ -10,7 +10,6 @@
#include "bitfield.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"

namespace xgboost {
Expand Down
29 changes: 16 additions & 13 deletions src/common/hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,16 @@ class HistogramCuts {
return *this;
}

uint32_t FeatureBins(bst_feature_t feature) const {
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}

std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }

bool HasCategorical() const { return has_categorical_; }
float MaxCategory() const { return max_cat_; }
[[nodiscard]] bool HasCategorical() const { return has_categorical_; }
[[nodiscard]] float MaxCategory() const { return max_cat_; }
/**
* \brief Set meta info about categorical features.
*
Expand All @@ -105,12 +105,13 @@ class HistogramCuts {
max_cat_ = max_cat;
}

size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }

// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
Expand All @@ -119,20 +120,20 @@ class HistogramCuts {
return idx;
}

bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
return this->SearchBin(value, column_id, Ptrs(), Values());
}

/**
* \brief Search the bin index for numerical feature.
*/
bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }

/**
* \brief Search the bin index for categorical feature.
*/
bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
auto end = ptrs.at(fidx + 1) + vals.cbegin();
auto beg = ptrs[fidx] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
Expand All @@ -143,12 +144,14 @@ class HistogramCuts {
}
return bin_idx;
}
bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
auto const& ptrs = this->Ptrs();
auto const& vals = this->Values();
return this->SearchCatBin(value, fidx, ptrs, vals);
}
bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
return SearchCatBin(e.fvalue, e.index);
}

/**
* \brief Return numerical bin value given bin index.
Expand Down
2 changes: 1 addition & 1 deletion src/data/array_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ class ArrayInterface {
template <std::int32_t D, typename Fn>
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
// Only used for cuDF at the moment.
CHECK_EQ(array.valid.Size(), 0);
CHECK_EQ(array.valid.Capacity(), 0);
auto dispatch = [&](auto t) {
using T = std::remove_const_t<decltype(t)> const;
// Set the data size to max as we don't know the original size of a sliced array:
Expand Down
3 changes: 2 additions & 1 deletion src/data/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
if (array.is_contiguous && array.type == ToDType<T>::kType) {
// Handle contigious
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
Expand Down
3 changes: 2 additions & 1 deletion src/data/data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
p_out->SetDevice(ptr_device);

Expand Down
8 changes: 6 additions & 2 deletions src/data/ellpack_page.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <thrust/iterator/transform_output_iterator.h>

#include "../common/categorical.h"
#include "../common/cuda_context.cuh"
#include "../common/hist_util.cuh"
#include "../common/random.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
Expand Down Expand Up @@ -313,7 +314,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
auto d_csc_indptr = dh::ToSpan(csc_indptr);

auto bin_type = page.index.GetBinTypeSize();
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
common::CompressedBufferWriter writer{page.cut.TotalBins() +
static_cast<std::size_t>(1)}; // +1 for null value

dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
auto ridx = idx / row_stride;
Expand Down Expand Up @@ -357,8 +359,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag

// copy gidx
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr);
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));

auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();
Expand Down
Loading