From c59c2fa2fb35e69f7b27445d850d5bf86cda3363 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 May 2023 17:00:29 -0400 Subject: [PATCH] Remove default UNKNOWN_NULL_COUNT from cudf::column member functions (#13341) Remove the default parameters for null-mask and null-count from `cudf::column` constructors and `set_null_mask` member functions. Reference #13311 Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jason Lowe (https://github.com/jlowe) - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/13341 --- cpp/benchmarks/common/generate_input.cu | 10 ++- cpp/include/cudf/column/column.hpp | 82 +++++++-------------- cpp/src/column/column_factories.cpp | 2 +- cpp/src/datetime/timezone.cpp | 6 +- cpp/src/interop/from_arrow.cu | 17 ++++- cpp/src/io/json/json_column.cu | 11 ++- cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/src/io/json/write_json.cu | 4 +- cpp/src/io/utilities/column_buffer.cpp | 4 +- cpp/src/lists/copying/copying.cu | 7 +- cpp/src/partitioning/partitioning.cu | 3 +- cpp/src/round/round.cu | 4 +- cpp/src/text/subword/load_hash_file.cu | 14 +++- cpp/tests/bitmask/bitmask_tests.cpp | 7 +- cpp/tests/column/column_test.cpp | 86 ++++++++++++++-------- cpp/tests/copying/concatenate_tests.cpp | 17 +++-- cpp/tests/datetime/datetime_ops_test.cpp | 9 +-- cpp/tests/structs/structs_column_tests.cpp | 4 +- cpp/tests/unary/cast_tests.cpp | 41 ++++++++--- java/src/main/native/src/TableJni.cpp | 4 +- java/src/main/native/src/row_conversion.cu | 13 ++-- python/cudf/cudf/_lib/join.pyx | 9 ++- 22 files changed, 205 insertions(+), 153 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index fd7b469cffd..ba8c58574b9 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -441,7 +441,8 @@ std::unique_ptr create_random_column(data_profile const& profile, dtype, num_rows, data.release(), - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); + profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, + null_count); } struct valid_or_zero { @@ -721,8 +722,11 @@ std::unique_ptr create_random_column(data_profile thrust::device_pointer_cast(offsets.end())[-1] = current_child_column->size(); // Always include all elements - auto offsets_column = std::make_unique( - cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); + auto offsets_column = std::make_unique(cudf::data_type{cudf::type_id::INT32}, + num_rows + 1, + offsets.release(), + rmm::device_buffer{}, + 0); auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(), valids.end(), diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c160cecbf84..8356d8144f2 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -80,16 +80,11 @@ class column { * @brief Construct a new column by taking ownership of the contents of a device_uvector. * * @param other The device_uvector whose contents will be moved into the new column. - * @param null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count The count of null elements. */ template () or cudf::is_chrono())> - column(rmm::device_uvector&& other, - rmm::device_buffer&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT) + column(rmm::device_uvector&& other, rmm::device_buffer&& null_mask, size_type null_count) : _type{cudf::data_type{cudf::type_to_id()}}, _size{[&]() { CUDF_EXPECTS( @@ -111,22 +106,19 @@ class column { * * @throws cudf::logic_error if `size < 0` * - * @param[in] dtype The element type - * @param[in] size The number of elements in the column - * @param[in] data The column's data - * @param[in] null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param dtype The element type + * @param size The number of elements in the column + * @param data The column's data + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count Optional, the count of null elements. * @param children Optional, vector of child columns */ template column(data_type dtype, size_type size, B1&& data, - B2&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT, + B2&& null_mask, + size_type null_count, std::vector>&& children = {}) : _type{dtype}, _size{size}, @@ -169,11 +161,6 @@ class column { /** * @brief Returns the count of null elements. * - * @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any - * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the - * first invocation of `null_count()` will compute and store the count of null - * elements indicated by the `null_mask` (if it exists). - * * @return The number of null elements */ [[nodiscard]] size_type null_count() const; @@ -186,13 +173,10 @@ class column { * * @param new_null_mask New null value indicator bitmask (rvalue overload & * moved) to set the column's null value indicator mask. May be empty if - * `new_null_count` is 0 or `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. If unknown, - * specify `UNKNOWN_NULL_COUNT` to indicate that the null count should be - * computed on the first invocation of `null_count()`. + * `new_null_count` is 0. + * @param new_null_count The count of null elements. */ - void set_null_mask(rmm::device_buffer&& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT); + void set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count); /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -201,25 +185,18 @@ class column { * does not match the size of this column. * * @param new_null_mask New null value indicator bitmask (lvalue overload & copied) to set the - * column's null value indicator mask. May be empty if `new_null_count` is 0 or - * `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on the first invocation - * of `null_count()`. + * column's null value indicator mask. May be empty if `new_null_count` is 0. + * @param new_null_count The count of null elements * @param stream The stream on which to perform the allocation and copy. Uses the default CUDF * stream if none is specified. */ void set_null_mask(rmm::device_buffer const& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT, + size_type new_null_count, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Updates the count of null elements. * - * @note `UNKNOWN_NULL_COUNT` can be specified as `new_null_count` to force - * the next invocation of `null_count()` to recompute the null count from the - * null mask. - * * @throws cudf::logic_error if `new_null_count > 0 and nullable() == false` * * @param new_null_count The new null count. @@ -321,14 +298,8 @@ class column { operator column_view() const { return this->view(); }; /** - * @brief Creates a mutable, non-owning view of the column's data and - * children. - * - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. The user can - * either explicitly update the null count with `set_null_count()`, or - * if not, the null count will be recomputed on the next invocation of - *`null_count()`. + * @brief Creates a mutable, non-owning view of the column's data, null mask, + * and children * * @return The mutable, non-owning view */ @@ -338,13 +309,10 @@ class column { * @brief Implicit conversion operator to a `mutable_column_view`. * * This allows passing a `column` object into a function that accepts a - *`mutable_column_view`. The conversion is automatic. - - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. For best performance, - * the user should explicitly update the null count with `set_null_count()`. - * Otherwise, the null count will be recomputed on the next invocation of - * `null_count()`. + * `mutable_column_view`. The conversion is automatic. + * + * The caller is expected to update the null count appropriately if the null mask + * is modified. * * @return Mutable, non-owning `mutable_column_view` */ @@ -357,9 +325,9 @@ class column { ///< buffer containing the column elements rmm::device_buffer _null_mask{}; ///< Bitmask used to represent null values. ///< May be empty if `null_count() == 0` - mutable cudf::size_type _null_count{UNKNOWN_NULL_COUNT}; ///< The number of null elements - std::vector> _children{}; ///< Depending on element type, child - ///< columns may contain additional data + mutable cudf::size_type _null_count{}; ///< The number of null elements + std::vector> _children{}; ///< Depending on element type, child + ///< columns may contain additional data }; /** @} */ // end of group diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index e147b12ad99..3fcc67a67d3 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -64,7 +64,7 @@ std::unique_ptr make_empty_column(data_type type) { CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type), "make_empty_column is invalid to call on nested types"); - return std::make_unique(type, 0, rmm::device_buffer{}); + return std::make_unique(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } // Empty column of specified type id diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 55d68fe4a1a..68b87ceaf62 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -500,8 +500,10 @@ std::unique_ptr make_timezone_transition_table(std::optional> tz_table_columns; - tz_table_columns.emplace_back(std::make_unique(std::move(d_ttimes))); - tz_table_columns.emplace_back(std::make_unique(std::move(d_offsets))); + tz_table_columns.emplace_back( + std::make_unique(std::move(d_ttimes), rmm::device_buffer{}, 0)); + tz_table_columns.emplace_back( + std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0)); // Need to finish copies before transition_times and offsets go out of scope stream.synchronize(); diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index f9d67a43b7a..40f08df78ac 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -33,6 +33,7 @@ #include #include +#include #include @@ -169,7 +170,11 @@ struct dispatch_to_cudf_column { std::unique_ptr get_empty_type_column(size_type size) { - return std::make_unique(data_type(type_id::EMPTY), size, rmm::device_buffer{}); + // this abomination is required by cuDF Python, which needs to handle + // [PyArrow null arrays](https://arrow.apache.org/docs/python/generated/pyarrow.NullArray.html) + // of finite length + return std::make_unique( + data_type(type_id::EMPTY), size, rmm::device_buffer{}, rmm::device_buffer{}, size); } /** @@ -319,8 +324,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( // Child columns shouldn't have masks and we need the mask in main column auto column_contents = indices_column->release(); - indices_column = std::make_unique( - dict_indices_type, static_cast(array.length()), std::move(*(column_contents.data))); + indices_column = std::make_unique(dict_indices_type, + static_cast(array.length()), + std::move(*(column_contents.data)), + rmm::device_buffer{}, + 0); return make_dictionary_column(std::move(keys_column), std::move(indices_column), @@ -435,7 +443,8 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, return get_column(*array_chunk, cudf_type, false, stream, mr); }); if (concat_columns.empty()) { - return std::make_unique(cudf_type, 0, rmm::device_buffer{}); + return std::make_unique( + cudf_type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } else if (concat_columns.size() == 1) { return std::move(concat_columns[0]); } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 0fdcd33ada4..e94179face7 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -849,8 +849,11 @@ std::pair, std::vector> device_json_co json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory - auto offsets_column = std::make_unique( - data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); + auto offsets_column = std::make_unique(data_type{type_id::INT32}, + num_rows + 1, + json_col.child_offsets.release(), + rmm::device_buffer{}, + 0); // Create children column auto [child_column, names] = json_col.child_columns.empty() @@ -859,7 +862,9 @@ std::pair, std::vector> device_json_co std::vector>{std::make_unique( data_type{type_id::INT8}, 0, - rmm::device_buffer{0, stream, mr}), + rmm::device_buffer{}, + rmm::device_buffer{}, + 0), std::vector{}} : device_json_column_to_cudf_column( json_col.child_columns.begin()->second, diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index d8ca0411910..d437aac8d38 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1718,8 +1718,8 @@ std::pair, std::vector> json_column_to rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); - auto offsets_column = - std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); + auto offsets_column = std::make_unique( + data_type{type_id::INT32}, num_rows, d_offsets.release(), rmm::device_buffer{}, 0); // Create children column auto [child_column, names] = json_col.child_columns.empty() diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 7f774c1bb04..0217e85ea77 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -261,7 +261,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, row_string_offsets.begin()); return make_strings_column( strings_columns.num_rows(), - std::make_unique(std::move(row_string_offsets)), + std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), std::move(joined_col->release().children[strings_column_view::chars_column_index]), 0, {}); @@ -381,7 +381,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri row_string_offsets.begin()); return make_strings_column( num_lists, - std::make_unique(std::move(row_string_offsets)), + std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), std::move(joined_col->release().children[strings_column_view::chars_column_index]), lists_strings.null_count(), cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index d0783fe8a01..18b17bc8611 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -142,8 +142,8 @@ std::unique_ptr make_column(column_buffer& buffer, case type_id::LIST: { // make offsets column - auto offsets = - std::make_unique(data_type{type_id::INT32}, buffer.size, std::move(buffer._data)); + auto offsets = std::make_unique( + data_type{type_id::INT32}, buffer.size, std::move(buffer._data), rmm::device_buffer{}, 0); column_name_info* child_info = nullptr; if (schema_info != nullptr) { diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index 0e4b631d56b..2d3826c8004 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -64,8 +64,11 @@ std::unique_ptr copy_slice(lists_column_view const& lists, offsets_data + end + 1, // size of offsets column is 1 greater than slice length out_offsets.data(), [start_offset] __device__(cudf::size_type i) { return i - start_offset; }); - auto offsets = std::make_unique( - cudf::data_type{cudf::type_id::INT32}, offsets_count, out_offsets.release()); + auto offsets = std::make_unique(cudf::data_type{cudf::type_id::INT32}, + offsets_count, + out_offsets.release(), + rmm::device_buffer{}, + 0); // Compute the child column of the result. // If the child of this lists column is itself a lists column, we call copy_slice() on it. diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index b0174d3bd83..9b6134435a7 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -424,7 +424,8 @@ struct copy_block_partitions_dispatcher { grid_size, stream); - return std::make_unique(input.type(), input.size(), std::move(output)); + return std::make_unique( + input.type(), input.size(), std::move(output), rmm::device_buffer{}, 0); } template ())> diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 58e21fc97ab..4b3f80fc6e2 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -331,7 +331,7 @@ std::unique_ptr round(column_view const& input, if (input.is_empty()) { if (is_fixed_point(input.type())) { auto const type = data_type{input.type().id(), numeric::scale_type{-decimal_places}}; - return std::make_unique(type, 0, rmm::device_buffer{}); + return make_empty_column(type); } return empty_like(input); } diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 3eebce92c69..cb18d0e0ecf 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -268,13 +268,19 @@ std::unique_ptr load_vocabulary_file( auto cp_metadata = detail::get_codepoint_metadata(stream); auto const cp_metadata_size = static_cast(cp_metadata.size()); - result.cp_metadata = std::make_unique( - cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release()); + result.cp_metadata = std::make_unique(cudf::data_type{cudf::type_id::UINT32}, + cp_metadata_size, + cp_metadata.release(), + rmm::device_buffer{}, + 0); auto aux_cp_table = detail::get_aux_codepoint_data(stream); auto const aux_cp_table_size = static_cast(aux_cp_table.size()); - result.aux_cp_table = std::make_unique( - cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release()); + result.aux_cp_table = std::make_unique(cudf::data_type{cudf::type_id::UINT64}, + aux_cp_table_size, + aux_cp_table.release(), + rmm::device_buffer{}, + 0); return std::make_unique(std::move(result)); } diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 4693fc8e342..36d12d522c5 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -621,14 +621,15 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous) for (auto& m : validity_bit) { m = this->generate(); } - auto gold_mask = - std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end())); + auto [gold_mask, null_count] = + cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()); rmm::device_buffer copy_mask{gold_mask, cudf::get_default_stream()}; cudf::column original{t, num_elements, rmm::device_buffer{num_elements * sizeof(int), cudf::get_default_stream()}, - std::move(copy_mask)}; + std::move(copy_mask), + null_count}; std::vector indices{0, 104, 104, diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 33e89a45250..b278e4928e5 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -92,7 +92,8 @@ void verify_column_views(cudf::column col) TYPED_TEST(TypedColumnTest, DefaultNullCountNoMask) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_FALSE(col.nullable()); EXPECT_FALSE(col.has_nulls()); EXPECT_EQ(0, col.null_count()); @@ -100,7 +101,8 @@ TYPED_TEST(TypedColumnTest, DefaultNullCountNoMask) TYPED_TEST(TypedColumnTest, DefaultNullCountEmptyMask) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_FALSE(col.nullable()); EXPECT_FALSE(col.has_nulls()); EXPECT_EQ(0, col.null_count()); @@ -109,7 +111,7 @@ TYPED_TEST(TypedColumnTest, DefaultNullCountEmptyMask) TYPED_TEST(TypedColumnTest, DefaultNullCountAllValid) { cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; EXPECT_TRUE(col.nullable()); EXPECT_FALSE(col.has_nulls()); EXPECT_EQ(0, col.null_count()); @@ -126,8 +128,11 @@ TYPED_TEST(TypedColumnTest, ExplicitNullCountAllValid) TYPED_TEST(TypedColumnTest, DefaultNullCountAllNull) { - cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_null_mask)}; + cudf::column col{this->type(), + this->num_elements(), + std::move(this->data), + std::move(this->all_null_mask), + this->num_elements()}; EXPECT_TRUE(col.nullable()); EXPECT_TRUE(col.has_nulls()); EXPECT_EQ(this->num_elements(), col.null_count()); @@ -147,13 +152,15 @@ TYPED_TEST(TypedColumnTest, ExplicitNullCountAllNull) TYPED_TEST(TypedColumnTest, SetNullCountNoMask) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_THROW(col.set_null_count(1), cudf::logic_error); } TYPED_TEST(TypedColumnTest, SetEmptyNullMaskNonZeroNullCount) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; rmm::device_buffer empty_null_mask{}; EXPECT_THROW(col.set_null_mask(std::move(empty_null_mask), this->num_elements()), cudf::logic_error); @@ -161,7 +168,8 @@ TYPED_TEST(TypedColumnTest, SetEmptyNullMaskNonZeroNullCount) TYPED_TEST(TypedColumnTest, SetInvalidSizeNullMaskNonZeroNullCount) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; auto invalid_size_null_mask = create_null_mask(std::min(this->num_elements() - 50, 0), cudf::mask_state::ALL_VALID); EXPECT_THROW( @@ -171,30 +179,37 @@ TYPED_TEST(TypedColumnTest, SetInvalidSizeNullMaskNonZeroNullCount) TYPED_TEST(TypedColumnTest, SetNullCountEmptyMask) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_THROW(col.set_null_count(1), cudf::logic_error); } TYPED_TEST(TypedColumnTest, SetNullCountAllValid) { cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; EXPECT_NO_THROW(col.set_null_count(0)); EXPECT_EQ(0, col.null_count()); } TYPED_TEST(TypedColumnTest, SetNullCountAllNull) { - cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_null_mask)}; + cudf::column col{this->type(), + this->num_elements(), + std::move(this->data), + std::move(this->all_null_mask), + this->num_elements()}; EXPECT_NO_THROW(col.set_null_count(this->num_elements())); EXPECT_EQ(this->num_elements(), col.null_count()); } TYPED_TEST(TypedColumnTest, ResetNullCountAllNull) { - cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_null_mask)}; + cudf::column col{this->type(), + this->num_elements(), + std::move(this->data), + std::move(this->all_null_mask), + this->num_elements()}; EXPECT_EQ(this->num_elements(), col.null_count()); } @@ -202,13 +217,14 @@ TYPED_TEST(TypedColumnTest, ResetNullCountAllNull) TYPED_TEST(TypedColumnTest, ResetNullCountAllValid) { cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; EXPECT_EQ(0, col.null_count()); } TYPED_TEST(TypedColumnTest, CopyDataNoMask) { - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_EQ(this->type(), col.type()); EXPECT_FALSE(col.nullable()); EXPECT_EQ(0, col.null_count()); @@ -226,7 +242,8 @@ TYPED_TEST(TypedColumnTest, CopyDataNoMask) TYPED_TEST(TypedColumnTest, MoveDataNoMask) { void* original_data = this->data.data(); - cudf::column col{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column col{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; EXPECT_EQ(this->type(), col.type()); EXPECT_FALSE(col.nullable()); EXPECT_EQ(0, col.null_count()); @@ -245,7 +262,8 @@ TYPED_TEST(TypedColumnTest, CopyDataAndMask) cudf::column col{this->type(), this->num_elements(), rmm::device_buffer{this->data, cudf::get_default_stream()}, - rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}}; + rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}, + 0}; EXPECT_EQ(this->type(), col.type()); EXPECT_TRUE(col.nullable()); EXPECT_EQ(0, col.null_count()); @@ -267,7 +285,7 @@ TYPED_TEST(TypedColumnTest, MoveDataAndMask) void* original_data = this->data.data(); void* original_mask = this->all_valid_mask.data(); cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; EXPECT_EQ(this->type(), col.type()); EXPECT_TRUE(col.nullable()); EXPECT_EQ(0, col.null_count()); @@ -284,7 +302,8 @@ TYPED_TEST(TypedColumnTest, MoveDataAndMask) TYPED_TEST(TypedColumnTest, CopyConstructorNoMask) { - cudf::column original{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column original{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; cudf::column copy{original}; verify_column_views(copy); CUDF_TEST_EXPECT_COLUMNS_EQUAL(original, copy); @@ -298,7 +317,7 @@ TYPED_TEST(TypedColumnTest, CopyConstructorNoMask) TYPED_TEST(TypedColumnTest, CopyConstructorWithMask) { cudf::column original{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; cudf::column copy{original}; verify_column_views(copy); CUDF_TEST_EXPECT_COLUMNS_EQUAL(original, copy); @@ -312,7 +331,8 @@ TYPED_TEST(TypedColumnTest, CopyConstructorWithMask) TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) { - cudf::column original{this->type(), this->num_elements(), std::move(this->data)}; + cudf::column original{ + this->type(), this->num_elements(), std::move(this->data), rmm::device_buffer{}, 0}; auto original_data = original.view().head(); @@ -331,7 +351,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) { cudf::column original{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; auto original_data = original.view().head(); auto original_mask = original.view().null_mask(); cudf::column moved_to{std::move(original)}; @@ -354,7 +374,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorNoMask) auto original = cudf::detail::make_device_uvector_async( data, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto original_data = original.data(); - cudf::column moved_to{std::move(original)}; + cudf::column moved_to{std::move(original), rmm::device_buffer{}, 0}; verify_column_views(moved_to); // Verify move @@ -371,7 +391,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorWithMask) data, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto original_data = original.data(); auto original_mask = this->all_valid_mask.data(); - cudf::column moved_to{std::move(original), std::move(this->all_valid_mask)}; + cudf::column moved_to{std::move(original), std::move(this->all_valid_mask), 0}; verify_column_views(moved_to); // Verify move @@ -388,12 +408,14 @@ TYPED_TEST(TypedColumnTest, ConstructWithChildren) cudf::data_type{cudf::type_id::INT8}, 42, rmm::device_buffer{this->data, cudf::get_default_stream()}, - rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()})); + rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}, + 0)); children.emplace_back(std::make_unique( cudf::data_type{cudf::type_id::FLOAT64}, 314, rmm::device_buffer{this->data, cudf::get_default_stream()}, - rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()})); + rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}, + 0)); cudf::column col{this->type(), this->num_elements(), rmm::device_buffer{this->data, cudf::get_default_stream()}, @@ -412,7 +434,7 @@ TYPED_TEST(TypedColumnTest, ConstructWithChildren) TYPED_TEST(TypedColumnTest, ReleaseNoChildren) { cudf::column col{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; auto original_data = col.view().head(); auto original_mask = col.view().null_mask(); @@ -433,12 +455,14 @@ TYPED_TEST(TypedColumnTest, ReleaseWithChildren) this->type(), this->num_elements(), rmm::device_buffer{this->data, cudf::get_default_stream()}, - rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()})); + rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}, + 0)); children.emplace_back(std::make_unique( this->type(), this->num_elements(), rmm::device_buffer{this->data, cudf::get_default_stream()}, - rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()})); + rmm::device_buffer{this->all_valid_mask, cudf::get_default_stream()}, + 0)); cudf::column col{this->type(), this->num_elements(), rmm::device_buffer{this->data, cudf::get_default_stream()}, @@ -462,7 +486,7 @@ TYPED_TEST(TypedColumnTest, ReleaseWithChildren) TYPED_TEST(TypedColumnTest, ColumnViewConstructorWithMask) { cudf::column original{ - this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask)}; + this->type(), this->num_elements(), std::move(this->data), std::move(this->all_valid_mask), 0}; cudf::column_view original_view = original; cudf::column copy{original_view}; verify_column_views(copy); diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 6658bbee283..a36b018bc78 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -24,15 +24,12 @@ #include #include #include -#include +#include #include #include -#include #include #include -#include - #include #include #include @@ -66,10 +63,13 @@ struct TypedColumnTest : public cudf::test::BaseFixture { cudaMemcpyAsync(typed_data, h_data.data(), data.size(), cudaMemcpyDefault, stream.value())); CUDF_CUDA_TRY( cudaMemcpyAsync(typed_mask, h_mask.data(), mask.size(), cudaMemcpyDefault, stream.value())); + _null_count = cudf::detail::null_count( + static_cast(mask.data()), 0, _num_elements, stream); stream.synchronize(); } - cudf::size_type num_elements() { return _num_elements; } + cudf::size_type num_elements() const { return _num_elements; } + cudf::size_type null_count() const { return _null_count; } std::random_device r; std::default_random_engine generator{r()}; @@ -77,6 +77,7 @@ struct TypedColumnTest : public cudf::test::BaseFixture { cudf::size_type _num_elements{distribution(generator)}; rmm::device_buffer data{}; rmm::device_buffer mask{}; + cudf::size_type _null_count{}; rmm::device_buffer all_valid_mask{create_null_mask(num_elements(), cudf::mask_state::ALL_VALID)}; rmm::device_buffer all_null_mask{create_null_mask(num_elements(), cudf::mask_state::ALL_NULL)}; }; @@ -105,7 +106,11 @@ TYPED_TEST(TypedColumnTest, ConcatenateNoColumns) TYPED_TEST(TypedColumnTest, ConcatenateColumnView) { - column original{this->type(), this->num_elements(), std::move(this->data), std::move(this->mask)}; + column original{this->type(), + this->num_elements(), + std::move(this->data), + std::move(this->mask), + this->null_count()}; std::vector indices{0, this->num_elements() / 3, this->num_elements() / 3, diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index e2750d169ae..6bae20efa8c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -51,7 +51,7 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) using namespace cuda::std::chrono; cudf::data_type dtype{cudf::type_to_id()}; - cudf::column col{dtype, 0, rmm::device_buffer{}}; + cudf::column col{dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; EXPECT_THROW(extract_year(col), cudf::logic_error); EXPECT_THROW(extract_month(col), cudf::logic_error); @@ -65,8 +65,7 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) EXPECT_THROW(extract_nanosecond_fraction(col), cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); - EXPECT_THROW(add_calendrical_months( - col, cudf::column{cudf::data_type{cudf::type_id::INT16}, 0, rmm::device_buffer{}}), + EXPECT_THROW(add_calendrical_months(col, *cudf::make_empty_column(cudf::type_id::INT16)), cudf::logic_error); } @@ -215,8 +214,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) auto int16s_dtype = cudf::data_type{cudf::type_to_id()}; auto timestamps_dtype = cudf::data_type{cudf::type_to_id()}; - cudf::column int16s{int16s_dtype, 0, rmm::device_buffer{}}; - cudf::column timestamps{timestamps_dtype, 0, rmm::device_buffer{}}; + cudf::column int16s{int16s_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; + cudf::column timestamps{timestamps_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), int16s); diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index e7370c248c8..b7617fc5724 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -626,8 +626,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild) // because EMPTY columns cannot have a null mask. This test ensures that // we can construct a structs column with a parent null mask and an EMPTY // child and then view it. - auto empty_col = - std::make_unique(cudf::data_type(cudf::type_id::EMPTY), 3, rmm::device_buffer{}); + auto empty_col = std::make_unique( + cudf::data_type(cudf::type_id::EMPTY), 3, rmm::device_buffer{}, rmm::device_buffer{}, 0); int num_rows{empty_col->size()}; vector_of_columns cols; cols.push_back(std::move(empty_col)); diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 70451d49182..9123003216f 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -90,70 +90,91 @@ inline cudf::column make_exp_chrono_column(cudf::type_id type_id) test_timestamps_D.size(), rmm::device_buffer{test_timestamps_D.data(), test_timestamps_D.size() * sizeof(test_timestamps_D.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + + rmm::device_buffer{}, + 0); case cudf::type_id::TIMESTAMP_SECONDS: return cudf::column( cudf::data_type{type_id}, test_timestamps_s.size(), rmm::device_buffer{test_timestamps_s.data(), test_timestamps_s.size() * sizeof(test_timestamps_s.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::TIMESTAMP_MILLISECONDS: return cudf::column( cudf::data_type{type_id}, test_timestamps_ms.size(), rmm::device_buffer{test_timestamps_ms.data(), test_timestamps_ms.size() * sizeof(test_timestamps_ms.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::TIMESTAMP_MICROSECONDS: return cudf::column( cudf::data_type{type_id}, test_timestamps_us.size(), rmm::device_buffer{test_timestamps_us.data(), test_timestamps_us.size() * sizeof(test_timestamps_us.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::TIMESTAMP_NANOSECONDS: return cudf::column( cudf::data_type{type_id}, test_timestamps_ns.size(), rmm::device_buffer{test_timestamps_ns.data(), test_timestamps_ns.size() * sizeof(test_timestamps_ns.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::DURATION_DAYS: return cudf::column( cudf::data_type{type_id}, test_durations_D.size(), rmm::device_buffer{test_durations_D.data(), test_durations_D.size() * sizeof(test_durations_D.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::DURATION_SECONDS: return cudf::column( cudf::data_type{type_id}, test_durations_s.size(), rmm::device_buffer{test_durations_s.data(), test_durations_s.size() * sizeof(test_durations_s.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::DURATION_MILLISECONDS: return cudf::column( cudf::data_type{type_id}, test_durations_ms.size(), rmm::device_buffer{test_durations_ms.data(), test_durations_ms.size() * sizeof(test_durations_ms.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::DURATION_MICROSECONDS: return cudf::column( cudf::data_type{type_id}, test_durations_us.size(), rmm::device_buffer{test_durations_us.data(), test_durations_us.size() * sizeof(test_durations_us.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); case cudf::type_id::DURATION_NANOSECONDS: return cudf::column( cudf::data_type{type_id}, test_durations_ns.size(), rmm::device_buffer{test_durations_ns.data(), test_durations_ns.size() * sizeof(test_durations_ns.front()), - cudf::get_default_stream()}); + cudf::get_default_stream()}, + rmm::device_buffer{}, + 0); default: CUDF_FAIL("Unsupported type_id"); } }; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 51c2e92492f..321a795ac31 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3480,8 +3480,8 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( auto const vec = thrust::host_vector(begin, end); auto buf = rmm::device_buffer{vec.data(), size * sizeof(cudf::size_type), cudf::get_default_stream()}; - auto gather_map_col = std::make_unique(cudf::data_type{cudf::type_id::INT32}, - size, std::move(buf)); + auto gather_map_col = std::make_unique( + cudf::data_type{cudf::type_id::INT32}, size, std::move(buf), rmm::device_buffer{}, 0); // gather the first key in each group to remove duplicated ones. group_by_result_table = cudf::gather(groups.keys->view(), gather_map_col->view()); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 41a7000d492..f1a55fd52c0 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1885,12 +1885,13 @@ std::vector> convert_to_rows( std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), std::back_inserter(ret), [&](auto batch) { auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release()); - auto data = std::make_unique(data_type{type_id::INT8}, - batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch])); + auto offsets = + std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release(), + rmm::device_buffer{}, 0); + auto data = std::make_unique( + data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch]), rmm::device_buffer{}, 0); return make_lists_column( batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index da03e8dcdd1..70667c639bb 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -6,6 +6,8 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move +from rmm._lib.device_buffer cimport device_buffer + cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column @@ -61,10 +63,11 @@ def semi_join(list lhs, list rhs, how=None): cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): - # helple to convert a gather map to a Column + # help to convert a gather map to a Column + cdef device_buffer c_empty cdef size_type size = gather_map.get()[0].size() cdef unique_ptr[column] c_col = make_unique[column]( data_type(type_id.INT32), size, - gather_map.get()[0].release()) + gather_map.get()[0].release(), move(c_empty), 0) return Column.from_unique_ptr(move(c_col))