diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..682f1ac5fca 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -185,8 +185,10 @@ ConfigureBench(STRINGS_BENCH string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp string/split_benchmark.cpp + string/substring_benchmark.cpp string/url_decode_benchmark.cpp) diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? 
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 
+17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp new file mode 100644 index 00000000000..d47c42e45be --- /dev/null +++ b/cpp/benchmarks/string/substring_benchmark.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +class StringSubstring : public cudf::benchmark { +}; + +enum substring_type { position, multi_position, delimiter, multi_delimiter }; + +static void BM_substring(benchmark::State& state, substring_type rt) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto starts_itr = thrust::constant_iterator(1); + auto stops_itr = thrust::constant_iterator(max_str_length / 2); + cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); + 
cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); + auto delim_itr = thrust::constant_iterator(" "); + cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (rt) { + case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; + case multi_position: cudf::strings::slice_strings(input, starts, stops); break; + case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; + case multi_delimiter: + cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringSubstring, name) \ + (::benchmark::State & st) { BM_substring(st, substring_type::name); } \ + BENCHMARK_REGISTER_F(StringSubstring, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(position) +STRINGS_BENCHMARK_DEFINE(multi_position) +STRINGS_BENCHMARK_DEFINE(delimiter) +STRINGS_BENCHMARK_DEFINE(multi_delimiter) diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. 
- * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. 
+ * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9', '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9].
+ * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. + */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const 
d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. 
+ * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? 
std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + 
strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. + * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is signed. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is signed + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type.
* diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - 
transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, 
rmm::cuda_stream_default, mr); -} - -std::unique_ptr is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if 
(d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + 
thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 68080c0eb89..f712b0cb6aa 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,17 +43,25 @@ namespace { * using the provided start, stop, and step parameters. 
*/ struct substring_fn { - const column_device_view d_column; - numeric_scalar_device_view d_start, d_stop, d_step; - const int32_t* d_offsets{}; + column_device_view const d_column; + numeric_scalar_device_view const d_start; + numeric_scalar_device_view const d_stop; + numeric_scalar_device_view const d_step; + int32_t* d_offsets{}; char* d_chars{}; - __device__ cudf::size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); - if (length == 0) return 0; // empty string + if (length == 0) { + if (!d_chars) d_offsets[idx] = 0; + return; + } size_type const step = d_step.is_valid() ? d_step.value() : 1; auto const begin = [&] { // always inclusive // when invalid, default depends on step @@ -88,7 +96,7 @@ struct substring_fn { if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer); itr += step; } - return bytes; + if (!d_chars) d_offsets[idx] = bytes; } }; @@ -103,42 +111,26 @@ std::unique_ptr slice_strings( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto d_start = get_scalar_device_view(const_cast&>(start)); - auto d_stop = get_scalar_device_view(const_cast&>(stop)); - auto d_step = get_scalar_device_view(const_cast&>(step)); - - // copy the null mask - rmm::device_buffer null_mask = 
cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars}); + auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_start = get_scalar_device_view(const_cast&>(start)); + auto const d_stop = get_scalar_device_view(const_cast&>(stop)); + auto const d_step = get_scalar_device_view(const_cast&>(step)); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step}, + strings.size(), + strings.null_count(), + stream, + mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -166,25 +158,33 @@ namespace { * This both calculates the output size and executes the substring. 
*/ struct substring_from_fn { - const column_device_view d_column; - const cudf::detail::input_indexalator starts; - const cudf::detail::input_indexalator stops; - const int32_t* d_offsets{}; + column_device_view const d_column; + cudf::detail::input_indexalator const starts; + cudf::detail::input_indexalator const stops; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); auto const start = starts[idx]; - if (start >= length) return 0; // empty string + if (start >= length) { + if (!d_chars) d_offsets[idx] = 0; + return; + } auto const stop = stops[idx]; auto const end = (((stop < 0) || (stop > length)) ? length : stop); - string_view d_substr = d_str.substr(start, end - start); - if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); - return d_substr.size_bytes(); + auto const d_substr = d_str.substr(start, end - start); + if (d_chars) + memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); + else + d_offsets[idx] = d_substr.size_bytes(); } }; @@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c auto strings_count = d_column.size(); // Copy the null mask - rmm::device_buffer null_mask{0, stream, mr}; - if (d_column.nullable()) - null_mask = rmm::device_buffer( - d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - - // Build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - 
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // Build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); + rmm::device_buffer null_mask = + !d_column.nullable() + ? rmm::device_buffer{0, stream, mr} + : rmm::device_buffer( + d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); + + auto children = make_strings_children( + substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr); return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + std::move(children.first), + std::move(children.second), null_count, std::move(null_mask), stream, diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index fa3bde8cb52..451fa82d5a3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls) } } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + + auto const a = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}}; + auto const b = fp_wrapper{{5, 7, 10}, scale_type{-1}}; + auto const table_a = cudf::table_view(std::vector{a}); + auto const 
table_b = cudf::table_view(std::vector{b}); + auto const tables = std::vector{table_a, table_b}; + + auto const key_cols = std::vector{0}; + auto const order = std::vector{cudf::order::ASCENDING}; + + auto const exp = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}}; + auto const exp_table = cudf::table_view(std::vector{exp}); + + auto const result = cudf::merge(tables, key_cols, order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", 
- "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = 
cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +} diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, 
results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7385b55d0df..0dc529d423f 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -183,6 +183,9 @@ public long getDeviceMemorySize() { private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); + private static native long[] partition(long inputTable, long partitionView, + int numberOfPartitions, int[] outputOffsets); + private static native long[] hashPartition(long inputTable, int[] columnsToHash, int hashTypeId, @@ -516,6 +519,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int 
index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -1253,6 +1260,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) { return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount)); } + /** + * Partition this table using the mapping in partitionMap. partitionMap must be an integer + * column. The number of rows in partitionMap must be the same as this table. Each row + * in the map will indicate which partition the rows in the table belong to. + * @param partitionMap the partitions for each row. + * @param numberOfPartitions number of partitions + * @return {@link PartitionedTable} Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) { + int[] partitionOffsets = new int[numberOfPartitions]; + return new PartitionedTable(new Table(partition( + getNativeView(), + partitionMap.getNativeView(), + partitionOffsets.length, + partitionOffsets)), partitionOffsets); + } + /** * Find smallest indices in a sorted table where values should be inserted to maintain order. *
@@ -1725,7 +1750,7 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    * Example:
    * input:  [[5,10,15], 100],
    *         [[20,25],   200],
-   *         [[30],      300],
+   *         [[30],      300]
    * index: 0
    * output: [5,         100],
    *         [10,        100],
@@ -1737,12 +1762,12 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    *
    * Nulls propagate in different ways depending on what is null.
    * 
-   *     [[5,null,15], 100],
-   *     [null,        200]
-   * returns:
-   *     [5,           100],
-   *     [null,        100],
-   *     [15,          100]
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5,           100],
+   *         [null,        100],
+   *         [15,          100]
    * 
    * Note that null lists are completely removed from the output
    * and nulls inside lists are pulled out and remain.
@@ -1763,27 +1788,26 @@ public Table explode(int index) {
    * in the output. The corresponding rows for other columns in the input are duplicated. A position
    * column is added that has the index inside the original list for each row. Example:
    * 
-   * [[5,10,15], 100],
-   * [[20,25],   200],
-   * [[30],      300],
-   * returns
-   * [0,   5,    100],
-   * [1,   10,   100],
-   * [2,   15,    100],
-   * [0,   20,    200],
-   * [1,   25,    200],
-   * [0,   30,    300],
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
    * 
    *
    * Nulls and empty lists propagate in different ways depending on what is null or empty.
    * 
-   * [[5,null,15], 100],
-   * [null,        200],
-   * [[],          300],
-   * returns
-   * [0,    5,     100],
-   * [1,    null,  100],
-   * [2,    15,    100],
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [0,     5,    100],
+   *         [1,  null,    100],
+   *         [2,    15,    100]
    * 
    *
    * Note that null lists are not included in the resulting table, but nulls inside
@@ -1799,6 +1823,96 @@ public Table explodePosition(int index) {
     return new Table(explodePosition(nativeHandle, index));
   }
 
+  /**
+   * Explodes a list column's elements.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded
+   * into new rows in the output. The corresponding rows for other columns in the input
+   * are duplicated.
+   *
+   * 
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [5,         100],
+   *         [10,        100],
+   *         [15,        100],
+   *         [20,        200],
+   *         [25,        200],
+   *         [30,        300]
+   * 
+   *
+   * Nulls propagate in different ways depending on what is null.
+   * 
+   *  input:  [[5,null,15], 100],
+   *          [null,        200]
+   * index: 0
+   * output:  [5,           100],
+   *          [null,        100],
+   *          [15,          100],
+   *          [null,        200]
+   * 
+   * Note that null lists are retained in the output as a single null entry
+   * and nulls inside lists are pulled out and remain.
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with explode_col exploded.
+   */
+  public Table explodeOuter(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuter(nativeHandle, index));
+  }
+
+  /**
+   * Explodes a list column's elements retaining any null entries or empty lists and includes a
+   * position column.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+   * in the output. The corresponding rows for other columns in the input are duplicated. A position
+   * column is added that has the index inside the original list for each row.
+   *
+   * 
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
+   * 
+   *
+   * Nulls and empty lists propagate as null entries in the result.
+   * 
+   * input:  [[5,null,15], 100],
+   *         [null,        200],
+   *         [[],          300]
+   * index: 0
+   * output: [0,     5,    100],
+   *         [1,  null,    100],
+   *         [2,    15,    100],
+   *         [0,  null,    200],
+   *         [0,  null,    300]
+   * 
+   *
+   * Note that null and empty lists each yield a single output row with a null value.
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with exploded value and position. The column order of return table is
+   *         [cols before explode_input, explode_position, explode_value, cols after explode_input].
+   */
+  public Table explodeOuterPosition(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuterPosition(nativeHandle, index));
+  }
+
   /**
    * Gathers the rows of this table according to `gatherMap` such that row "i"
    * in the resulting table's columns will contain row "gatherMap[i]" from this table.
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 0ce9d6303e4..ac14e1605d7 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -37,7 +37,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 4548156055a..81b9882104f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1614,6 +1614,39 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // for whatever reason partition returns the length of the result at the
+      // end and hash partition/round robin do not, so skip the last entry for
+      // consistency
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
@@ -2052,4 +2085,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass,
+                                                                    jlong input_jtable,
+                                                                    jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast(input_jtable);
+    cudf::size_type col_index = static_cast(column_index);
+    std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass,
+                                                                            jlong input_jtable,
+                                                                            jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast(input_jtable);
+    cudf::size_type col_index = static_cast(column_index);
+    std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 626f7828012..c075f074068 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1773,6 +1773,23 @@ void testPartStability() {
     }
   }
 
+  @Test
+  void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
   @Test
   void testIdentityHashPartition() {
     final int count = 1024 * 1024;
@@ -4635,7 +4652,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out
     }
   }
 
-  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
+  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) {
     StructType nestedType = new StructType(true,
         new BasicType(false, DType.INT32), new BasicType(false, DType.STRING));
     try (Table input = new Table.TestBuilder()
@@ -4644,23 +4661,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
             Arrays.asList(struct(4, "k4"), struct(5, "k5")),
             Arrays.asList(struct(6, "k6")),
             Arrays.asList(new HostColumnVector.StructData((List) null)),
-            Arrays.asList())
+            null)
         .column("s1", "s2", "s3", "s4", "s5")
         .column(1, 3, 5, 7, 9)
         .column(12.0, 14.0, 13.0, 11.0, 15.0)
         .build()) {
       Table.TestBuilder expectedBuilder = new Table.TestBuilder();
       if (pos) {
-        expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        if (!outer)
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        else
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0);
       }
-      try (Table expected = expectedBuilder
-          .column(nestedType,
+      List expectedData = new ArrayList(){{
+        if (!outer) {
+          this.add(new HostColumnVector.StructData[]{
+              struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
+              struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
+              new HostColumnVector.StructData((List) null)});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0});
+        } else {
+          this.add(new HostColumnVector.StructData[]{
               struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
               struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
-              new HostColumnVector.StructData((List) null))
-          .column("s1", "s1", "s1", "s2", "s2", "s3", "s4")
-          .column(1, 1, 1, 3, 3, 5, 7)
-          .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0)
+              new HostColumnVector.StructData((List) null), null});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0});
+        }
+      }};
+      try (Table expected = expectedBuilder
+          .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0))
+          .column((String[]) expectedData.get(1))
+          .column((Integer[]) expectedData.get(2))
+          .column((Double[]) expectedData.get(3))
           .build()) {
         return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())};
       }
@@ -4679,7 +4715,7 @@ void testExplode() {
     }
 
     // Child is nested type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false);
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explode(0)) {
@@ -4689,7 +4725,7 @@ void testExplode() {
   }
 
   @Test
-  void testPosExplode() {
+  void testExplodePosition() {
     // Child is primitive type
     Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false);
     try (Table input = testTables[0];
@@ -4699,8 +4735,8 @@ void testPosExplode() {
       }
     }
 
-    // Child is primitive type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true);
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explodePosition(0)) {
@@ -4709,4 +4745,45 @@ void testPosExplode() {
     }
   }
 
+  @Test
+  void testExplodeOuter() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
+
+  @Test
+  void testExplodeOuterPosition() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
 }
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ad798a73ed2..e5501428624 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from libcpp cimport bool
-from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 from libc.stdint cimport int32_t, int64_t
@@ -24,6 +24,10 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.lists.gather cimport (
+    segmented_gather as cpp_segmented_gather
+)
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -704,3 +708,22 @@ def sample(Table input, size_type n,
             else input._index_names
         )
     )
+
+
+def segmented_gather(Column source_column, Column gather_map):
+    # NOTE(review): the views are held through shared_ptr, presumably because
+    # lists_column_view cannot be default-constructed by Cython -- confirm.
+    cdef shared_ptr[lists_column_view] c_source_lists = (
+        make_shared[lists_column_view](source_column.view())
+    )
+    cdef shared_ptr[lists_column_view] c_gather_lists = (
+        make_shared[lists_column_view](gather_map.view())
+    )
+    cdef unique_ptr[column] c_gathered
+
+    with nogil:
+        c_gathered = move(cpp_segmented_gather(
+            c_source_lists.get()[0], c_gather_lists.get()[0]
+        ))
+
+    return Column.from_unique_ptr(move(c_gathered))
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
new file mode 100644
index 00000000000..ea664eee82e
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] segmented_gather(
+        const lists_column_view source_column,
+        const lists_column_view gather_map_list
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
index ad675027c10..934269c6f25 100644
--- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
         string_character_types types_to_remove,
         string_scalar replacement,
         string_character_types types_to_keep) except +
-
-    cdef unique_ptr[column] is_integer(
-        column_view source_strings
-    ) except +
-
-    cdef unique_ptr[column] is_float(
-        column_view source_strings
-    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
index baee01b8f99..55a84b60efd 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
 
     cdef unique_ptr[column] from_floats(
         column_view input_col) except +
+
+    cdef unique_ptr[column] is_float(
+        column_view source_strings
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
index 92f99a2f5cb..6e45d4ba869 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
     cdef unique_ptr[column] from_integers(
         column_view input_col) except +
 
+    cdef unique_ptr[column] is_integer(
+        column_view source_strings
+    ) except +
+
     cdef unique_ptr[column] hex_to_integers(
         column_view input_col,
         data_type output_type) except +
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 5d8d1522418..1890e98f956 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport (
     all_characters_of_type as cpp_all_characters_of_type,
     filter_characters_of_type as cpp_filter_characters_of_type,
     string_character_types as string_character_types,
-    is_integer as cpp_is_integer,
-    is_float as cpp_is_float,
 )
 
 
@@ -191,35 +189,3 @@ def is_space(Column source_strings):
         ))
 
     return Column.from_unique_ptr(move(c_result))
-
-
-def is_integer(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have intergers.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_integer(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def is_float(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have floats.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_float(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
new file mode 100644
index 00000000000..195d9b71f6e
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_floats cimport (
+    is_float as cpp_is_float,
+)
+
+
+def is_float(Column source_strings):
+    """
+    Returns a Column of boolean values with True for each entry of
+    `source_strings` that the C++ `is_float` identifies as a float.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_float(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
new file mode 100644
index 00000000000..d1bae1edd37
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_integers cimport (
+    is_integer as cpp_is_integer,
+)
+
+
+def is_integer(Column source_strings):
+    """
+    Returns a Column of boolean values with True for each entry of
+    `source_strings` that the C++ `is_integer` identifies as an integer.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_integer(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index a60fe627acb..1d3f73822a9 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,14 +2,16 @@
 
 import pickle
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
+from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import count_elements
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column
+from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
-from cudf.utils.dtypes import is_list_dtype
+from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype
 
 
 class ListColumn(ColumnBase):
@@ -228,3 +230,58 @@ def len(self):
         dtype: int32
         """
         return self._return_or_inplace(count_elements(self._column))
+
+    def take(self, lists_indices):
+        """
+        Collect elements from each list row at the given per-row indices.
+
+        Parameters
+        ----------
+        lists_indices : list type array of integer indices
+            One index list per row (same size as column); no nulls.
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1         None
+        2       [4, 5]
+        dtype: list
+        >>> s.list.take([[0, 1], [], []])
+        0    [1, 2]
+        1      None
+        2        []
+        dtype: list
+        """
+
+        lists_indices_col = as_column(lists_indices)
+        if not isinstance(lists_indices_col, ListColumn):
+            raise ValueError("lists_indices should be list type array.")
+        if not lists_indices_col.size == self._column.size:
+            raise ValueError(
+                "lists_indices and list column is of different " "size."
+            )
+        if not is_numerical_dtype(
+            lists_indices_col.children[1].dtype
+        ) or not np.issubdtype(
+            lists_indices_col.children[1].dtype, np.integer
+        ):
+            raise TypeError(
+                "lists_indices should be column of values of index types."
+            )
+
+        try:
+            res = self._return_or_inplace(
+                segmented_gather(self._column, lists_indices_col)
+            )
+        except RuntimeError as e:
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null.") from e
+            raise
+        else:
+            return res
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ea01aa07b91..11dd7556812 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -70,13 +70,15 @@
     is_alpha as cpp_is_alpha,
     is_decimal as cpp_is_decimal,
     is_digit as cpp_is_digit,
-    is_float as cpp_is_float,
-    is_integer as cpp_is_integer,
     is_lower as cpp_is_lower,
     is_numeric as cpp_is_numeric,
     is_space as cpp_isspace,
     is_upper as cpp_is_upper,
 )
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
+from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 25f57748765..9672ab3002f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1518,11 +1518,7 @@ def fallback(col, fn):
                 else:
                     if col not in df_cols:
                         r_opr = other_cols[col]
-                        l_opr = Series(
-                            column_empty(
-                                len(self), masked=True, dtype=other.dtype
-                            )
-                        )
+                        l_opr = Series(as_column(np.nan, length=len(self)))
                     if col not in other_cols_keys:
                         r_opr = None
                         l_opr = self[col]
@@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None):
         return self._apply_op("rpow", other, fill_value)
 
     def __rpow__(self, other):
-        return self._apply_op("__pow__", other)
+        return self._apply_op("__rpow__", other)
 
     def floordiv(self, other, axis="columns", level=None, fill_value=None):
         """
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 5e7121c0488..b06fef178f6 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1501,9 +1501,7 @@ def _binaryop(
         If ``reflect`` is ``True``, swap the order of the operands.
         """
         if isinstance(other, cudf.DataFrame):
-            # TODO: fn is not the same as arg expected by _apply_op
-            # e.g. for fn = 'and', _apply_op equivalent is '__and__'
-            return other._apply_op(self, fn)
+            return NotImplemented
 
         result_name = utils.get_result_name(self, other)
         if isinstance(other, Series):
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 4e5e4ce1987..535e497e8dc 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -8,7 +8,9 @@
 from pandas.core.tools.datetimes import _unit_map
 
 import cudf
-from cudf._lib.strings.char_types import is_integer as cpp_is_integer
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
 from cudf.core import column
 from cudf.core.index import as_index
 from cudf.utils.dtypes import is_scalar
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 77548b95277..5f4d571e8c5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4996,13 +4996,13 @@ def test_cov_nans():
 @pytest.mark.parametrize(
     "gsr",
     [
-        cudf.Series([1, 2, 3]),
-        cudf.Series([1, 2, 3], index=["a", "b", "c"]),
-        cudf.Series([1, 2, 3], index=["a", "b", "d"]),
-        cudf.Series([1, 2], index=["a", "b"]),
-        cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
+        cudf.Series([4, 2, 3]),
+        cudf.Series([4, 2, 3], index=["a", "b", "c"]),
+        cudf.Series([4, 2, 3], index=["a", "b", "d"]),
+        cudf.Series([4, 2], index=["a", "b"]),
+        cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
         pytest.param(
-            cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
+            cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
             marks=pytest.mark.xfail,
         ),
     ],
@@ -5017,32 +5017,32 @@ def test_cov_nans():
         operator.truediv,
         operator.mod,
         operator.pow,
-        # comparison ops will temporarily XFAIL
-        # see PR  https://github.com/rapidsai/cudf/pull/7491
-        pytest.param(operator.eq, marks=pytest.mark.xfail()),
-        pytest.param(operator.lt, marks=pytest.mark.xfail()),
-        pytest.param(operator.le, marks=pytest.mark.xfail()),
-        pytest.param(operator.gt, marks=pytest.mark.xfail()),
-        pytest.param(operator.ge, marks=pytest.mark.xfail()),
-        pytest.param(operator.ne, marks=pytest.mark.xfail()),
+        operator.eq,
+        operator.lt,
+        operator.le,
+        operator.gt,
+        operator.ge,
+        operator.ne,
     ],
 )
 def test_df_sr_binop(gsr, colnames, op):
-    data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]]
+    data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]]
     data = dict(zip(colnames, data))
 
+    gsr = gsr.astype("float64")
+
     gdf = cudf.DataFrame(data)
-    pdf = pd.DataFrame.from_dict(data)
+    pdf = gdf.to_pandas(nullable=True)
 
-    psr = gsr.to_pandas()
+    psr = gsr.to_pandas(nullable=True)
 
     expect = op(pdf, psr)
-    got = op(gdf, gsr)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gdf, gsr).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
     expect = op(psr, pdf)
-    got = op(psr, pdf)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gsr, gdf).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..33812cfa7a7 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -112,3 +112,50 @@ def test_len(data):
     got = gsr.list.len()
 
     assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "idx"),
+    [
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]),
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]),
+        ([[1, 2, 3], []], [[0, 1], []]),
+        ([[1, 2, 3], [None]], [[0, 1], []]),
+        ([[1, None, 3], None], [[0, 1], []]),
+    ],
+)
+def test_take(data, idx):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+
+    expected = pd.Series(zip(ps, idx)).map(
+        lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None
+    )
+    got = gs.list.take(idx)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    ("invalid", "exception"),
+    [
+        ([[0]], pytest.raises(ValueError, match="different size")),
+        ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")),
+        (
+            [["a", "b"], ["c"]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        (
+            [[[1], [0]], [[0]]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        ([[0, 1], None], pytest.raises(ValueError, match="contains null")),
+    ],
+)
+def test_take_invalid(invalid, exception):
+    gs = cudf.Series([[0, 1], [2, 3]])
+    with exception:
+        gs.list.take(invalid)