Skip to content

Commit

Permalink
Remove default UNKNOWN_NULL_COUNT from cudf::column member functions (#…
Browse files Browse the repository at this point in the history
…13341)

Remove the default parameters for null-mask and null-count from `cudf::column` constructors and `set_null_mask` member functions.

Reference #13311

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - MithunR (https://github.com/mythrocks)

URL: #13341
  • Loading branch information
davidwendt authored May 18, 2023
1 parent ce307b8 commit c59c2fa
Show file tree
Hide file tree
Showing 22 changed files with 205 additions and 153 deletions.
10 changes: 7 additions & 3 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,8 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
dtype,
num_rows,
data.release(),
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
null_count);
}

struct valid_or_zero {
Expand Down Expand Up @@ -721,8 +722,11 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
thrust::device_pointer_cast(offsets.end())[-1] =
current_child_column->size(); // Always include all elements

auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release());
auto offsets_column = std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::INT32},
num_rows + 1,
offsets.release(),
rmm::device_buffer{},
0);

auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(),
valids.end(),
Expand Down
82 changes: 25 additions & 57 deletions cpp/include/cudf/column/column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,11 @@ class column {
* @brief Construct a new column by taking ownership of the contents of a device_uvector.
*
* @param other The device_uvector whose contents will be moved into the new column.
* @param null_mask Optional, column's null value indicator bitmask. May
* be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`.
* @param null_count Optional, the count of null elements. If unknown, specify
* `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on
* the first invocation of `null_count()`.
* @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0.
* @param null_count The count of null elements.
*/
template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() or cudf::is_chrono<T>())>
column(rmm::device_uvector<T>&& other,
rmm::device_buffer&& null_mask = {},
size_type null_count = UNKNOWN_NULL_COUNT)
column(rmm::device_uvector<T>&& other, rmm::device_buffer&& null_mask, size_type null_count)
: _type{cudf::data_type{cudf::type_to_id<T>()}},
_size{[&]() {
CUDF_EXPECTS(
Expand All @@ -111,22 +106,19 @@ class column {
*
* @throws cudf::logic_error if `size < 0`
*
* @param[in] dtype The element type
* @param[in] size The number of elements in the column
* @param[in] data The column's data
* @param[in] null_mask Optional, column's null value indicator bitmask. May
* be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`.
* @param null_count Optional, the count of null elements. If unknown, specify
* `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on
* the first invocation of `null_count()`.
* @param dtype The element type
* @param size The number of elements in the column
* @param data The column's data
* @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0.
* @param null_count Optional, the count of null elements.
* @param children Optional, vector of child columns
*/
template <typename B1, typename B2 = rmm::device_buffer>
column(data_type dtype,
size_type size,
B1&& data,
B2&& null_mask = {},
size_type null_count = UNKNOWN_NULL_COUNT,
B2&& null_mask,
size_type null_count,
std::vector<std::unique_ptr<column>>&& children = {})
: _type{dtype},
_size{size},
Expand Down Expand Up @@ -169,11 +161,6 @@ class column {
/**
* @brief Returns the count of null elements.
*
* @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any
* point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the
* first invocation of `null_count()` will compute and store the count of null
* elements indicated by the `null_mask` (if it exists).
*
* @return The number of null elements
*/
[[nodiscard]] size_type null_count() const;
Expand All @@ -186,13 +173,10 @@ class column {
*
* @param new_null_mask New null value indicator bitmask (rvalue overload &
* moved) to set the column's null value indicator mask. May be empty if
* `new_null_count` is 0 or `UNKOWN_NULL_COUNT`.
* @param new_null_count Optional, the count of null elements. If unknown,
* specify `UNKNOWN_NULL_COUNT` to indicate that the null count should be
* computed on the first invocation of `null_count()`.
* `new_null_count` is 0.
* @param new_null_count The count of null elements.
*/
void set_null_mask(rmm::device_buffer&& new_null_mask,
size_type new_null_count = UNKNOWN_NULL_COUNT);
void set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count);

/**
* @brief Sets the column's null value indicator bitmask to `new_null_mask`.
Expand All @@ -201,25 +185,18 @@ class column {
* does not match the size of this column.
*
* @param new_null_mask New null value indicator bitmask (lvalue overload & copied) to set the
* column's null value indicator mask. May be empty if `new_null_count` is 0 or
* `UNKOWN_NULL_COUNT`.
* @param new_null_count Optional, the count of null elements. If unknown, specify
* `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on the first invocation
* of `null_count()`.
* column's null value indicator mask. May be empty if `new_null_count` is 0.
* @param new_null_count The count of null elements
* @param stream The stream on which to perform the allocation and copy. Uses the default CUDF
* stream if none is specified.
*/
void set_null_mask(rmm::device_buffer const& new_null_mask,
size_type new_null_count = UNKNOWN_NULL_COUNT,
size_type new_null_count,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* @brief Updates the count of null elements.
*
* @note `UNKNOWN_NULL_COUNT` can be specified as `new_null_count` to force
* the next invocation of `null_count()` to recompute the null count from the
* null mask.
*
* @throws cudf::logic_error if `new_null_count > 0 and nullable() == false`
*
* @param new_null_count The new null count.
Expand Down Expand Up @@ -321,14 +298,8 @@ class column {
operator column_view() const { return this->view(); };

/**
* @brief Creates a mutable, non-owning view of the column's data and
* children.
*
* @note Creating a mutable view of a `column` invalidates the `column`'s
* `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. The user can
* either explicitly update the null count with `set_null_count()`, or
* if not, the null count will be recomputed on the next invocation of
*`null_count()`.
* @brief Creates a mutable, non-owning view of the column's data, null mask,
* and children
*
* @return The mutable, non-owning view
*/
Expand All @@ -338,13 +309,10 @@ class column {
* @brief Implicit conversion operator to a `mutable_column_view`.
*
* This allows passing a `column` object into a function that accepts a
*`mutable_column_view`. The conversion is automatic.
* @note Creating a mutable view of a `column` invalidates the `column`'s
* `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. For best performance,
* the user should explicitly update the null count with `set_null_count()`.
* Otherwise, the null count will be recomputed on the next invocation of
* `null_count()`.
* `mutable_column_view`. The conversion is automatic.
*
* The caller is expected to update the null count appropriately if the null mask
* is modified.
*
* @return Mutable, non-owning `mutable_column_view`
*/
Expand All @@ -357,9 +325,9 @@ class column {
///< buffer containing the column elements
rmm::device_buffer _null_mask{}; ///< Bitmask used to represent null values.
///< May be empty if `null_count() == 0`
mutable cudf::size_type _null_count{UNKNOWN_NULL_COUNT}; ///< The number of null elements
std::vector<std::unique_ptr<column>> _children{}; ///< Depending on element type, child
///< columns may contain additional data
mutable cudf::size_type _null_count{}; ///< The number of null elements
std::vector<std::unique_ptr<column>> _children{}; ///< Depending on element type, child
///< columns may contain additional data
};

/** @} */ // end of group
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/column/column_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ std::unique_ptr<column> make_empty_column(data_type type)
{
CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type),
"make_empty_column is invalid to call on nested types");
return std::make_unique<column>(type, 0, rmm::device_buffer{});
return std::make_unique<column>(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
}

// Empty column of specified type id
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/datetime/timezone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,8 +500,10 @@ std::unique_ptr<table> make_timezone_transition_table(std::optional<std::string_
auto d_offsets = cudf::detail::make_device_uvector_async(offsets_typed, stream, mr);

std::vector<std::unique_ptr<column>> tz_table_columns;
tz_table_columns.emplace_back(std::make_unique<cudf::column>(std::move(d_ttimes)));
tz_table_columns.emplace_back(std::make_unique<cudf::column>(std::move(d_offsets)));
tz_table_columns.emplace_back(
std::make_unique<cudf::column>(std::move(d_ttimes), rmm::device_buffer{}, 0));
tz_table_columns.emplace_back(
std::make_unique<cudf::column>(std::move(d_offsets), rmm::device_buffer{}, 0));

// Need to finish copies before transition_times and offsets go out of scope
stream.synchronize();
Expand Down
17 changes: 13 additions & 4 deletions cpp/src/interop/from_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <thrust/gather.h>

Expand Down Expand Up @@ -169,7 +170,11 @@ struct dispatch_to_cudf_column {

std::unique_ptr<column> get_empty_type_column(size_type size)
{
return std::make_unique<column>(data_type(type_id::EMPTY), size, rmm::device_buffer{});
// this abomination is required by cuDF Python, which needs to handle
// [PyArrow null arrays](https://arrow.apache.org/docs/python/generated/pyarrow.NullArray.html)
// of finite length
return std::make_unique<column>(
data_type(type_id::EMPTY), size, rmm::device_buffer{}, rmm::device_buffer{}, size);
}

/**
Expand Down Expand Up @@ -319,8 +324,11 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::dictionary32>(

// Child columns shouldn't have masks and we need the mask in main column
auto column_contents = indices_column->release();
indices_column = std::make_unique<column>(
dict_indices_type, static_cast<size_type>(array.length()), std::move(*(column_contents.data)));
indices_column = std::make_unique<column>(dict_indices_type,
static_cast<size_type>(array.length()),
std::move(*(column_contents.data)),
rmm::device_buffer{},
0);

return make_dictionary_column(std::move(keys_column),
std::move(indices_column),
Expand Down Expand Up @@ -435,7 +443,8 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
return get_column(*array_chunk, cudf_type, false, stream, mr);
});
if (concat_columns.empty()) {
return std::make_unique<column>(cudf_type, 0, rmm::device_buffer{});
return std::make_unique<column>(
cudf_type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
} else if (concat_columns.size() == 1) {
return std::move(concat_columns[0]);
}
Expand Down
11 changes: 8 additions & 3 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -849,8 +849,11 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first);

// Note: json_col modified here, reuse the memory
auto offsets_column = std::make_unique<column>(
data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release());
auto offsets_column = std::make_unique<column>(data_type{type_id::INT32},
num_rows + 1,
json_col.child_offsets.release(),
rmm::device_buffer{},
0);
// Create children column
auto [child_column, names] =
json_col.child_columns.empty()
Expand All @@ -859,7 +862,9 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
std::vector<column_name_info>>{std::make_unique<column>(
data_type{type_id::INT8},
0,
rmm::device_buffer{0, stream, mr}),
rmm::device_buffer{},
rmm::device_buffer{},
0),
std::vector<column_name_info>{}}
: device_json_column_to_cudf_column(
json_col.child_columns.begin()->second,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1718,8 +1718,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to

rmm::device_uvector<json_column::row_offset_t> d_offsets =
cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr);
auto offsets_column =
std::make_unique<column>(data_type{type_id::INT32}, num_rows, d_offsets.release());
auto offsets_column = std::make_unique<column>(
data_type{type_id::INT32}, num_rows, d_offsets.release(), rmm::device_buffer{}, 0);
// Create children column
auto [child_column, names] =
json_col.child_columns.empty()
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
row_string_offsets.begin());
return make_strings_column(
strings_columns.num_rows(),
std::make_unique<cudf::column>(std::move(row_string_offsets)),
std::make_unique<cudf::column>(std::move(row_string_offsets), rmm::device_buffer{}, 0),
std::move(joined_col->release().children[strings_column_view::chars_column_index]),
0,
{});
Expand Down Expand Up @@ -381,7 +381,7 @@ std::unique_ptr<column> join_list_of_strings(lists_column_view const& lists_stri
row_string_offsets.begin());
return make_strings_column(
num_lists,
std::make_unique<cudf::column>(std::move(row_string_offsets)),
std::make_unique<cudf::column>(std::move(row_string_offsets), rmm::device_buffer{}, 0),
std::move(joined_col->release().children[strings_column_view::chars_column_index]),
lists_strings.null_count(),
cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr));
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/utilities/column_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ std::unique_ptr<column> make_column(column_buffer& buffer,

case type_id::LIST: {
// make offsets column
auto offsets =
std::make_unique<column>(data_type{type_id::INT32}, buffer.size, std::move(buffer._data));
auto offsets = std::make_unique<column>(
data_type{type_id::INT32}, buffer.size, std::move(buffer._data), rmm::device_buffer{}, 0);

column_name_info* child_info = nullptr;
if (schema_info != nullptr) {
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/lists/copying/copying.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,11 @@ std::unique_ptr<cudf::column> copy_slice(lists_column_view const& lists,
offsets_data + end + 1, // size of offsets column is 1 greater than slice length
out_offsets.data(),
[start_offset] __device__(cudf::size_type i) { return i - start_offset; });
auto offsets = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32}, offsets_count, out_offsets.release());
auto offsets = std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::INT32},
offsets_count,
out_offsets.release(),
rmm::device_buffer{},
0);

// Compute the child column of the result.
// If the child of this lists column is itself a lists column, we call copy_slice() on it.
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/partitioning/partitioning.cu
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,8 @@ struct copy_block_partitions_dispatcher {
grid_size,
stream);

return std::make_unique<column>(input.type(), input.size(), std::move(output));
return std::make_unique<column>(
input.type(), input.size(), std::move(output), rmm::device_buffer{}, 0);
}

template <typename DataType, CUDF_ENABLE_IF(not is_copy_block_supported<DataType>())>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/round/round.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -331,7 +331,7 @@ std::unique_ptr<column> round(column_view const& input,
if (input.is_empty()) {
if (is_fixed_point(input.type())) {
auto const type = data_type{input.type().id(), numeric::scale_type{-decimal_places}};
return std::make_unique<cudf::column>(type, 0, rmm::device_buffer{});
return make_empty_column(type);
}
return empty_like(input);
}
Expand Down
14 changes: 10 additions & 4 deletions cpp/src/text/subword/load_hash_file.cu
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,19 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(

auto cp_metadata = detail::get_codepoint_metadata(stream);
auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
result.cp_metadata = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release());
result.cp_metadata = std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT32},
cp_metadata_size,
cp_metadata.release(),
rmm::device_buffer{},
0);

auto aux_cp_table = detail::get_aux_codepoint_data(stream);
auto const aux_cp_table_size = static_cast<cudf::size_type>(aux_cp_table.size());
result.aux_cp_table = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release());
result.aux_cp_table = std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT64},
aux_cp_table_size,
aux_cp_table.release(),
rmm::device_buffer{},
0);

return std::make_unique<hashed_vocabulary>(std::move(result));
}
Expand Down
Loading

0 comments on commit c59c2fa

Please sign in to comment.