From 434df44d9fe1c94e8047bcc37266ae663eae8a8d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 5 Dec 2023 07:05:35 -0500 Subject: [PATCH] Deprecate cudf::make_strings_column accepting typed offsets (#14461) Deprecates the following factory functions: ``` std::unique_ptr make_strings_column( cudf::device_span strings, cudf::device_span offsets, cudf::device_span null_mask, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); std::unique_ptr make_strings_column(size_type num_strings, rmm::device_uvector&& offsets, rmm::device_uvector&& chars, rmm::device_buffer&& null_mask, size_type null_count); ``` The first one is not needed, rarely used, and inefficient. The only callers found can use other, more efficient functions which do not require copying the device data. The 2nd one is only used in 8/150 places and can be easily recoded in place to use type-erased factory function instead. Removing these APIs helps reduce the number of functions that require type-specific offsets to make it easier to support offsets of larger types in the future. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14461 --- cpp/benchmarks/common/generate_input.cu | 8 ++-- cpp/include/cudf/column/column_factories.hpp | 16 +++++--- .../cudf/utilities/type_dispatcher.hpp | 14 +++++++ cpp/include/cudf_test/column_wrapper.hpp | 33 ++++++++++----- cpp/src/io/json/legacy/reader_impl.cu | 40 ++++++++++++------- cpp/src/io/json/write_json.cu | 12 ++++-- cpp/src/io/parquet/predicate_pushdown.cpp | 8 ++-- cpp/src/io/text/multibyte_split.cu | 7 +++- cpp/src/strings/strings_column_factories.cu | 4 ++ cpp/tests/strings/contains_tests.cpp | 16 +++++--- cpp/tests/strings/factories_test.cu | 30 +++++++++----- java/src/main/native/src/row_conversion.cu | 11 +++-- 12 files changed, 138 insertions(+), 61 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index aef3d92b4f5..cf597e644aa 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -536,10 +536,10 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, - std::move(offsets), - std::move(chars), - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, - null_count); + std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), + std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + null_count, + profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } /** diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 68d7df7e4eb..ce5772dcf3c 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -415,6 +415,8 @@ std::unique_ptr make_strings_column( * span of byte offsets identifying individual strings within the char vector, and an optional * null bitmask. * + * @deprecated Since 24.02 + * * `offsets.front()` must always be zero. * * The total number of char bytes must not exceed the maximum size of size_type. Use the @@ -435,7 +437,7 @@ std::unique_ptr make_strings_column( * columns' device memory * @return Constructed strings column */ -std::unique_ptr make_strings_column( +[[deprecated]] std::unique_ptr make_strings_column( cudf::device_span strings, cudf::device_span offsets, cudf::device_span null_mask, @@ -470,6 +472,8 @@ std::unique_ptr make_strings_column(size_type num_strings, * @brief Construct a STRING type column given offsets, columns, and optional null count and null * mask. * + * @deprecated Since 24.02 + * * @param[in] num_strings The number of strings the column represents. * @param[in] offsets The offset values for this column. The number of elements is one more than the * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the @@ -481,11 +485,11 @@ std::unique_ptr make_strings_column(size_type num_strings, * @param[in] null_count The number of null string entries. * @return Constructed strings column */ -std::unique_ptr make_strings_column(size_type num_strings, - rmm::device_uvector&& offsets, - rmm::device_uvector&& chars, - rmm::device_buffer&& null_mask, - size_type null_count); +[[deprecated]] std::unique_ptr make_strings_column(size_type num_strings, + rmm::device_uvector&& offsets, + rmm::device_uvector&& chars, + rmm::device_buffer&& null_mask, + size_type null_count); /** * @brief Construct a LIST type column given offsets column, child column, null mask and null diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index a80cd15d3c6..1aad197b1e3 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -186,6 +186,20 @@ CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64) CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128) CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT) +/** + * @brief Specialization to map 'char' type to type_id::INT8 + * + * Required when passing device_uvector to column constructor. + * Possibly can be removed when PR 14202 is merged. + * + * @return id for 'char' type + */ +template <> // CUDF_TYPE_MAPPING(char,INT8) causes duplicate id_to_type_impl definition +constexpr inline type_id type_to_id() +{ + return type_id::INT8; +} + /** * @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the * underlying stored type. diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index df62e47e4c4..abcd89c3035 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -756,14 +756,21 @@ class strings_column_wrapper : public detail::column_wrapper { template strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { + size_type num_strings = std::distance(begin, end); auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - auto d_offsets = cudf::detail::make_device_uvector_sync( - offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_chars = std::make_unique( + cudf::detail::make_device_uvector_sync( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); + auto d_offsets = std::make_unique( + cudf::detail::make_device_uvector_sync( + offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); wrapped = - cudf::make_strings_column(d_chars, d_offsets, {}, 0, cudf::test::get_default_stream()); + cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {}); } /** @@ -801,14 +808,20 @@ class strings_column_wrapper : public detail::column_wrapper { size_type num_strings = std::distance(begin, end); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - auto d_offsets = cudf::detail::make_device_uvector_sync( - offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_chars = std::make_unique( + cudf::detail::make_device_uvector_sync( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); + auto d_offsets = std::make_unique( + cudf::detail::make_device_uvector_sync( + offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); wrapped = cudf::make_strings_column( - d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); + num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release()); } /** diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 1ae7ccf71c1..205a6b96aa9 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -530,21 +530,31 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = - make_strings_column(cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()), - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - {}, - 0, - stream); - auto repl = make_strings_column(cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()), - cudf::detail::make_device_uvector_async( - repl_offsets, stream, rmm::mr::get_current_device_resource()), - {}, - 0, - stream); + auto target = make_strings_column( + static_cast(target_offsets.size() - 1), + std::make_unique( + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + std::make_unique(cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + 0, + {}); + auto repl = make_strings_column( + static_cast(repl_offsets.size() - 1), + std::make_unique(cudf::detail::make_device_uvector_async( + repl_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + std::make_unique(cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + 0, + {}); auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); std::vector> out_columns; diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 938f9728fe8..f1a43baa9b0 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -763,10 +763,16 @@ std::unique_ptr make_strings_column_from_host(host_span{}, [](auto& str) { return str.size(); }); - auto d_offsets = - cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()); + auto d_offsets = std::make_unique( + cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); return cudf::make_strings_column( - host_strings.size(), std::move(d_offsets), std::move(d_chars), {}, 0); + host_strings.size(), + std::move(d_offsets), + std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + 0, + {}); } std::unique_ptr make_column_names_column(host_span column_names, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index a5851de3c20..9c8b03886b5 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -188,11 +188,11 @@ struct stats_caster { auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); return cudf::make_strings_column( val.size(), - std::move(d_offsets), - std::move(d_chars), + std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), + std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + null_count, rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, - null_count); + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); } return std::make_unique( dtype, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 772bcad8ada..deaab5995af 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -549,7 +549,12 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source }); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); } else { - return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars), {}, 0); + return cudf::make_strings_column( + string_count, + std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), + std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + 0, + {}); } } diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 6e3b93973d2..0b55e18b00a 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -121,6 +121,8 @@ std::unique_ptr make_strings_column(size_type num_strings, { CUDF_FUNC_RANGE(); + if (num_strings == 0) { return make_empty_column(type_id::STRING); } + if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); CUDF_EXPECTS(num_strings == offsets_column->size() - 1, "Invalid offsets column size for strings column."); @@ -146,6 +148,8 @@ std::unique_ptr make_strings_column(size_type num_strings, { CUDF_FUNC_RANGE(); + if (num_strings == 0) { return make_empty_column(type_id::STRING); } + auto const offsets_size = static_cast(offsets.size()); auto const chars_size = static_cast(chars.size()); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 0cb5023a32e..13459197aa3 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -297,11 +297,17 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = cudf::detail::make_device_uvector_sync( - ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto d_offsets = cudf::detail::make_device_uvector_sync( - offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto input = cudf::make_strings_column(d_chars, d_offsets, {}, 0); + auto d_chars = std::make_unique( + cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); + auto d_offsets = std::make_unique( + cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); + auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {}); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index a3d392cfed0..1066738df72 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -143,13 +143,20 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = cudf::detail::make_device_uvector_sync( - h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto d_offsets = cudf::detail::make_device_uvector_sync( - h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_buffer = std::make_unique( + cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); + auto d_offsets = std::make_unique( + cudf::detail::make_device_uvector_sync( + h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); auto d_nulls = cudf::detail::make_device_uvector_sync( h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count); + auto column = cudf::make_strings_column( + count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(2, column->num_children()); @@ -186,12 +193,17 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { - rmm::device_uvector d_chars{0, cudf::get_default_stream()}; - auto d_offsets = cudf::detail::make_zeroed_device_uvector_sync( - 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_chars = std::make_unique( + rmm::device_uvector{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0); + auto d_offsets = std::make_unique( + cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0); rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; - auto results = cudf::make_strings_column(d_chars, d_offsets, d_nulls, 0); + auto results = + cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); rmm::device_uvector> d_strings{ diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index d93d38c7758..e5d3930843f 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -2260,10 +2260,13 @@ std::unique_ptr convert_from_rows(lists_column_view const &input, // stuff real string column auto const null_count = string_row_offset_columns[string_idx]->null_count(); auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = - make_strings_column(num_rows, std::move(string_col_offsets[string_idx]), - std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), null_count); + output_columns[i] = make_strings_column( + num_rows, + std::make_unique(std::move(string_col_offsets[string_idx]), + rmm::device_buffer{}, 0), + std::make_unique(std::move(string_data_cols[string_idx]), + rmm::device_buffer{}, 0), + null_count, std::move(*string_data.null_mask.release())); string_idx++; } }