diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp index f4dcbfcce7e..7b821a00b0d 100644 --- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp @@ -41,7 +41,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v rmm::mr::device_memory_resource* mr); /** - * @brief Create an empty lists column + * @brief Create an empty lists column. * * A list column requires a child type and so cannot be created with `make_empty_column`. * @@ -53,6 +53,19 @@ std::unique_ptr make_empty_lists_column(data_type child_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Create a lists column with all null rows. + * + * @param size Size of the output lists column + * @param child_type The type used for the empty child column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr make_all_nulls_lists_column(size_type size, + data_type child_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 875bf67133f..754735f5a5b 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,22 @@ std::unique_ptr make_empty_lists_column(data_type child_type, 0, std::move(offsets), std::move(child), 0, rmm::device_buffer{}, stream, mr); } +std::unique_ptr make_all_nulls_lists_column(size_type size, + data_type child_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto offsets = [&] { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + }(); + auto child = make_empty_column(child_type); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_lists_column( + size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); +} + } // namespace detail } // namespace lists diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 791e0ed4ecb..5f89b6d9b3b 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -16,8 +16,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -31,96 +33,133 @@ #include #include +#include + namespace cudf { namespace detail { namespace { -struct byte_list_conversion { - /** - * @brief Function object for converting primitive types and string columns to lists of bytes. - */ - template - std::enable_if_t and !is_floating_point(), std::unique_ptr> - operator()(column_view const&, - flip_endianness, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + +// Data type of the output data column after conversion. +constexpr data_type output_type{type_id::UINT8}; + +template +struct byte_list_conversion_fn { + template + static std::unique_ptr invoke(Args&&...) { CUDF_FAIL("Unsupported non-numeric and non-string column"); } +}; +struct byte_list_conversion_dispatcher { template - std::enable_if_t() or std::is_integral_v, std::unique_ptr> - operator()(column_view const& input_column, - flip_endianness configuration, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(column_view const& input, + flip_endianness configuration, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + return byte_list_conversion_fn::invoke(input, configuration, stream, mr); + } +}; + +template +struct byte_list_conversion_fn()>> { + static std::unique_ptr invoke(column_view const& input, + flip_endianness configuration, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - size_type num_bytes = input_column.size() * sizeof(T); - auto byte_column = make_numeric_column( - data_type{type_id::UINT8}, num_bytes, mask_state::UNALLOCATED, stream, mr); + if (input.size() == 0) { + return cudf::lists::detail::make_empty_lists_column(output_type, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), output_type, stream, mr); + } + + auto const num_bytes = static_cast(input.size() * sizeof(T)); + auto byte_column = + make_numeric_column(output_type, num_bytes, mask_state::UNALLOCATED, stream, mr); - char* d_chars = reinterpret_cast(byte_column->mutable_view().data()); - char const* d_data = reinterpret_cast(input_column.data()); - size_type mask = sizeof(T) - 1; + auto const d_inp = reinterpret_cast(input.data()); + auto const d_out = byte_column->mutable_view().data(); if (configuration == flip_endianness::YES) { thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bytes), - [d_chars, d_data, mask] __device__(auto index) { - d_chars[index] = d_data[index + mask - ((index & mask) << 1)]; + [d_inp, d_out] __device__(auto index) { + constexpr auto mask = static_cast(sizeof(T) - 1); + d_out[index] = d_inp[index + mask - ((index & mask) << 1)]; }); } else { - thrust::copy_n(rmm::exec_policy(stream), d_data, num_bytes, d_chars); + thrust::copy_n(rmm::exec_policy(stream), d_inp, num_bytes, d_out); } - auto begin = thrust::make_constant_iterator(cudf::size_of(input_column.type())); - auto offsets_column = std::get<0>( - cudf::detail::make_offsets_child_column(begin, begin + input_column.size(), stream, mr)); + auto const it = thrust::make_constant_iterator(cudf::size_of(input.type())); + auto offsets_column = + std::get<0>(cudf::detail::make_offsets_child_column(it, it + input.size(), stream, mr)); + + auto result = make_lists_column(input.size(), + std::move(offsets_column), + std::move(byte_column), + input.null_count(), + detail::copy_bitmask(input, stream, mr), + stream, + mr); + + // If any nulls are present, the corresponding lists must be purged so that + // the result is sanitized. + if (auto const result_cv = result->view(); + cudf::detail::has_nonempty_nulls(result_cv, stream)) { + return cudf::detail::purge_nonempty_nulls(result_cv, stream, mr); + } - rmm::device_buffer null_mask = detail::copy_bitmask(input_column, stream, mr); + return result; + } +}; - return make_lists_column(input_column.size(), - std::move(offsets_column), - std::move(byte_column), - input_column.null_count(), - std::move(null_mask), - stream, - mr); +template +struct byte_list_conversion_fn>> { + static std::unique_ptr invoke(column_view const& input, + flip_endianness, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if (input.size() == 0) { + return cudf::lists::detail::make_empty_lists_column(output_type, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), output_type, stream, mr); + } + + auto col_content = std::make_unique(input, stream, mr)->release(); + auto chars_contents = col_content.children[strings_column_view::chars_column_index]->release(); + auto const num_chars = chars_contents.data->size(); + auto uint8_col = std::make_unique( + output_type, num_chars, std::move(*(chars_contents.data)), rmm::device_buffer{}, 0); + + auto result = make_lists_column( + input.size(), + std::move(col_content.children[cudf::strings_column_view::offsets_column_index]), + std::move(uint8_col), + input.null_count(), + detail::copy_bitmask(input, stream, mr), + stream, + mr); + + // If any nulls are present, the corresponding lists must be purged so that + // the result is sanitized. + if (auto const result_cv = result->view(); + cudf::detail::has_nonempty_nulls(result_cv, stream)) { + return cudf::detail::purge_nonempty_nulls(result_cv, stream, mr); + } + + return result; } }; -template <> -std::unique_ptr byte_list_conversion::operator()( - column_view const& input_column, - flip_endianness, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - strings_column_view input_strings(input_column); - auto strings_count = input_strings.size(); - if (strings_count == 0) return cudf::empty_like(input_column); - - auto col_content = std::make_unique(input_column, stream, mr)->release(); - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); - auto null_mask = contents.null_mask.release(); - auto uint8_col = std::make_unique(data_type{type_id::UINT8}, - data->size(), - std::move(*data), - std::move(*null_mask), - UNKNOWN_NULL_COUNT); - - return make_lists_column( - input_column.size(), - std::move(col_content.children[cudf::strings_column_view::offsets_column_index]), - std::move(uint8_col), - input_column.null_count(), - detail::copy_bitmask(input_column, stream, mr), - stream, - mr); -} } // namespace /** @@ -128,13 +167,13 @@ std::unique_ptr byte_list_conversion::operator()( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr byte_cast(column_view const& input_column, +std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return type_dispatcher( - input_column.type(), byte_list_conversion{}, input_column, endian_configuration, stream, mr); + input.type(), byte_list_conversion_dispatcher{}, input, endian_configuration, stream, mr); } } // namespace detail @@ -142,12 +181,12 @@ std::unique_ptr byte_cast(column_view const& input_column, /** * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::mr::device_memory_resource*) */ -std::unique_ptr byte_cast(column_view const& input_column, +std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input_column, endian_configuration, cudf::get_default_stream(), mr); + return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index 6eafc9a2759..309e8341bcf 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -55,18 +56,11 @@ TEST_F(ByteCastTest, int16ValuesWithNulls) cudf::test::fixed_width_column_wrapper const int16_col( {short(0), short(100), short(-100), limits::min(), limits::max()}, {0, 1, 0, 1, 0}); - /* CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT compares underlying values even when specified as null, - * resulting in erroneous test failures. The commented out data tests the case where underlying - * values are different, but are both null. */ - // auto int16_data = - // cudf::test::fixed_width_column_wrapper{0xee, 0xff, 0x00, 0x64, 0xee, 0xff, 0x80, - // 0x00, 0xee, 0xff}; - auto int16_data = cudf::test::fixed_width_column_wrapper{ - 0x00, 0x00, 0x00, 0x64, 0xff, 0x9c, 0x80, 0x00, 0x7f, 0xff}; + auto int16_data = cudf::test::fixed_width_column_wrapper{0x00, 0x64, 0x80, 0x00}; auto int16_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()), + std::move(cudf::test::fixed_width_column_wrapper{0, 0, 2, 2, 4, 4}.release()), std::move(int16_data.release()), 3, cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5)); @@ -106,19 +100,12 @@ TEST_F(ByteCastTest, int32ValuesWithNulls) cudf::test::fixed_width_column_wrapper const int32_col( {0, 100, -100, limits::min(), limits::max()}, {1, 0, 1, 0, 1}); - /* Data commented out below explained by comment in int16ValuesWithNulls test */ - // auto int32_data = - // cudf::test::fixed_width_column_wrapper{0x00, 0x00, 0x00, 0x00, 0xcc, 0xdd, 0xee, - // 0xff, 0xff, 0xff, - // 0xff, 0x9c, 0xcc, 0xdd, 0xee, 0xff, 0x7f, 0xff, 0xff, - // 0xff}; + auto int32_data = cudf::test::fixed_width_column_wrapper{ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0xff, 0xff, - 0xff, 0x9c, 0x80, 0x00, 0x00, 0x00, 0x7f, 0xff, 0xff, 0xff}; + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x9c, 0x7f, 0xff, 0xff, 0xff}; auto int32_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 4, 8, 12, 16, 20}.release()), + std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), std::move(int32_data.release()), 2, cudf::test::detail::make_null_mask(even_validity, even_validity + 5)); @@ -165,19 +152,13 @@ TEST_F(ByteCastTest, int64ValuesWithNulls) cudf::test::fixed_width_column_wrapper const int64_col( {long(0), long(100), long(-100), limits::min(), limits::max()}, {0, 1, 0, 1, 0}); - /* Data commented out below explained by comment in int16ValuesWithNulls test */ - // auto int64_data = cudf::test::fixed_width_column_wrapper{ - // 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // 0x00, 0x64, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x80, 0x00, 0x00, 0x00, - // 0x00, 0x00, 0x00, 0x00, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}; + auto int64_data = cudf::test::fixed_width_column_wrapper{ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x64, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x9c, 0x80, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; auto int64_expected = cudf::make_lists_column( 5, std::move( - cudf::test::fixed_width_column_wrapper{0, 8, 16, 24, 32, 40}.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release()), std::move(int64_data.release()), 3, cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5)); @@ -231,19 +212,12 @@ TEST_F(ByteCastTest, fp32ValuesWithNulls) cudf::test::fixed_width_column_wrapper const fp32_col( {float(0.0), float(100.0), float(-100.0), limits::min(), limits::max()}, {1, 0, 1, 0, 1}); - /* Data commented out below explained by comment in int16ValuesWithNulls test */ - // auto fp32_data = - // cudf::test::fixed_width_column_wrapper{0x00, 0x00, 0x00, 0x00, 0xcc, 0xdd, 0xee, - // 0xff, 0xc2, 0xc8, - // 0x00, 0x00, 0xcc, 0xdd, 0xee, 0xff, 0x7f, 0x7f, 0xff, - // 0xff}; + auto fp32_data = cudf::test::fixed_width_column_wrapper{ - 0x00, 0x00, 0x00, 0x00, 0x42, 0xc8, 0x00, 0x00, 0xc2, 0xc8, - 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x7f, 0x7f, 0xff, 0xff}; + 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc8, 0x00, 0x00, 0x7f, 0x7f, 0xff, 0xff}; auto fp32_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 4, 8, 12, 16, 20}.release()), + std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), std::move(fp32_data.release()), 2, cudf::test::detail::make_null_mask(even_validity, even_validity + 5)); @@ -307,19 +281,13 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls) cudf::test::fixed_width_column_wrapper const fp64_col( {double(0.0), double(100.0), double(-100.0), limits::min(), limits::max()}, {0, 1, 0, 1, 0}); - /* Data commented out below explained by comment in int16ValuesWithNulls test */ - // auto fp64_data = cudf::test::fixed_width_column_wrapper{ - // 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x40, 0x59, 0x00, 0x00, 0x00, 0x00, - // 0x00, 0x00, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x10, 0x00, 0x00, - // 0x00, 0x00, 0x00, 0x00, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}; + auto fp64_data = cudf::test::fixed_width_column_wrapper{ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x59, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0xc0, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x7f, 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + 0x40, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; auto fp64_expected = cudf::make_lists_column( 5, std::move( - cudf::test::fixed_width_column_wrapper{0, 8, 16, 24, 32, 40}.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release()), std::move(fp64_data.release()), 3, cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5)); @@ -328,7 +296,7 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(output_fp64->view(), fp64_expected->view()); } -TEST_F(ByteCastTest, StringValues) +TEST_F(ByteCastTest, StringValuesNoNulls) { cudf::test::strings_column_wrapper const strings_col( {"", "The quick", " brown fox...", "!\"#$%&\'()*+,-./", "0123456789:;<=>?@", "[\\]^_`{|}~"}); @@ -360,3 +328,73 @@ TEST_F(ByteCastTest, StringValues) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output_strings->view(), strings_expected); } + +TEST_F(ByteCastTest, StringValuesWithNulls) +{ + auto const strings_col = [] { + auto output = + cudf::test::strings_column_wrapper( + {"", "The quick", " brown fox...", "!\"#$%&\'()*+,-./", "0123456789:;<=>?@", "[\\]^_`{|}~"}) + .release(); + + // Set nulls by `set_null_mask` so the output column will have non-empty nulls. + // This is intentional. + auto const null_iter = cudf::test::iterators::nulls_at({2, 4}); + output->set_null_mask(cudf::test::detail::make_null_mask(null_iter, null_iter + output->size()), + 2); + return output; + }(); + + auto const strings_expected = cudf::test::lists_column_wrapper{ + {{}, + {0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b}, + {} /*NULL*/, + {0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f}, + {} /*NULL*/, + {0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x7b, 0x7c, 0x7d, 0x7e}}, + cudf::test::iterators::nulls_at({2, 4})}; + + auto const output_strings = cudf::byte_cast(*strings_col, cudf::flip_endianness::YES); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output_strings->view(), strings_expected); +} + +TEST_F(ByteCastTest, int32Empty) +{ + auto const input = cudf::test::fixed_width_column_wrapper{}; + auto const expected = cudf::test::lists_column_wrapper{}; + auto const output = cudf::byte_cast(input, cudf::flip_endianness::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *output); +} + +TEST_F(ByteCastTest, int32sAllNulls) +{ + auto const input = + cudf::test::fixed_width_column_wrapper{{0, 0, 0}, cudf::test::iterators::all_nulls()}; + auto const output = cudf::byte_cast(input, cudf::flip_endianness::YES); + auto const& out_child = output->child(cudf::lists_column_view::child_column_index); + EXPECT_EQ(output->size(), 3); + EXPECT_EQ(output->null_count(), 3); + EXPECT_EQ(out_child.size(), 0); + EXPECT_EQ(out_child.type().id(), cudf::type_id::UINT8); +} + +TEST_F(ByteCastTest, StringEmpty) +{ + auto const input = cudf::test::strings_column_wrapper{}; + auto const expected = cudf::test::lists_column_wrapper{}; + auto const output = cudf::byte_cast(input, cudf::flip_endianness::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *output); +} + +TEST_F(ByteCastTest, StringsAllNulls) +{ + auto const input = + cudf::test::strings_column_wrapper{{"", "", ""}, cudf::test::iterators::all_nulls()}; + auto const output = cudf::byte_cast(input, cudf::flip_endianness::YES); + auto const& out_child = output->child(cudf::lists_column_view::child_column_index); + EXPECT_EQ(output->size(), 3); + EXPECT_EQ(output->null_count(), 3); + EXPECT_EQ(out_child.size(), 0); + EXPECT_EQ(out_child.type().id(), cudf::type_id::UINT8); +}