diff --git a/build.sh b/build.sh index e6169b2036b..bee66d819b4 100755 --- a/build.sh +++ b/build.sh @@ -315,9 +315,11 @@ if buildAll || hasArg libcudf; then LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}') MSG="${MSG}
libcudf.so size: $LIBCUDF_FS" fi - echo "$MSG" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html - cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log + BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"} + echo "Metrics output dir: [$BMR_DIR]" + mkdir -p ${BMR_DIR} + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3b45b3ce2e7..b68c2bdbef6 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. set -euo pipefail @@ -14,3 +14,29 @@ rapids-logger "Begin cpp build" rapids-mamba-retry mambabuild conda/recipes/libcudf rapids-upload-conda-to-s3 cpp + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then + ls -l ${RAPIDS_ARTIFACTS_DIR} +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 0be72486319..983a63d4ce9 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -66,21 +66,5 @@ for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do fi done -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then - rapids-logger "Memcheck gtests with rmm_mode=cuda" - export GTEST_CUDF_RMM_MODE=cuda - COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" - for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do - test_name=$(basename ${gt}) - if [[ "$test_name" == "ERROR_TEST" ]]; then - continue - fi - echo "Running gtest $test_name" - ${COMPUTE_SANITIZER_CMD} ${gt} | tee "${RAPIDS_TESTS_DIR}${test_name}.cs.log" - done - unset GTEST_CUDF_RMM_MODE - # TODO: test-results/*.cs.log are processed in CI -fi - rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index b0b86b427b7..fbfcf6e71a2 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -27,6 +27,7 @@ build: - SCCACHE_IDLE_TIMEOUT - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY + - RAPIDS_ARTIFACTS_DIR requirements: build: diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 0f005c462cc..1b3f4190680 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt) static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; + int constexpr min_rows = 1 << 12; + int constexpr max_rows = 1 << 24; + int constexpr row_mult = 8; + int constexpr min_rowlen = 1 << 5; + int constexpr max_rowlen = 1 << 13; + int constexpr len_mult = 2; for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index fd4c049e2fc..a0ef2155f7d 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -112,11 +112,21 @@ class datasource { /** * @brief Creates a source from a host memory buffer. * + * @deprecated Since 23.04 + * * @param[in] buffer Host buffer object * @return Constructed datasource object */ static std::unique_ptr<datasource> create(host_buffer const& buffer); + /** + * @brief Creates a source from a host memory buffer. + * + * @param[in] buffer Host buffer object + * @return Constructed datasource object + */ + static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer); + /** * @brief Creates a source from a device memory buffer. * diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 06b52563e19..6f97eb768d9 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -150,6 +150,8 @@ struct table_with_metadata { /** * @brief Non-owning view of a host memory buffer * + * @deprecated Since 23.04 + * * Used to describe buffer input in `source_info` objects. */ struct host_buffer { @@ -166,6 +168,22 @@ struct host_buffer { host_buffer(const char* data, size_t size) : data(data), size(size) {} }; +/** + * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to + * bytes.
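A minimal illustration of the byte-like trait described above — assuming it is exposed as `cudf::io::is_byte_like_type`, matching the surrounding `cudf::io` declarations; the checks mirror the element types its definition lists (`int8_t`, `char`, `uint8_t`, `unsigned char`, `std::byte`) with cv-qualifiers stripped:

```cpp
// Sketch: compile-time checks for the new byte-like trait (assumed to live in cudf::io).
#include <cudf/io/types.hpp>

#include <cstddef>
#include <cstdint>

static_assert(cudf::io::is_byte_like_type<std::byte>());
static_assert(cudf::io::is_byte_like_type<char>());
static_assert(cudf::io::is_byte_like_type<unsigned char>());
static_assert(cudf::io::is_byte_like_type<std::int8_t>());
static_assert(cudf::io::is_byte_like_type<std::uint8_t const>());  // cv-qualifiers are removed first
static_assert(!cudf::io::is_byte_like_type<int>());                // wider integer types are rejected
static_assert(!cudf::io::is_byte_like_type<float>());
```

These are the same element types exercised by the `ByteLikeTypes` list in the Parquet reader-source tests added later in this diff.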
+ * + * @tparam T The representation type + * @return `true` if the type is considered a byte-like type + */ +template +constexpr inline auto is_byte_like_type() +{ + using non_cv_T = std::remove_cv_t; + return std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v; +} + /** * @brief Source information for read interfaces */ @@ -191,21 +209,70 @@ struct source_info { /** * @brief Construct a new source info object for multiple buffers in host memory * + * @deprecated Since 23.04 + * * @param host_buffers Input buffers in host memory */ - explicit source_info(std::vector const& host_buffers) - : _type(io_type::HOST_BUFFER), _host_buffers(host_buffers) + explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const hb) { + return cudf::host_span{ + reinterpret_cast(hb.data), hb.size}; + }); } /** * @brief Construct a new source info object for a single buffer * + * @deprecated Since 23.04 + * * @param host_data Input buffer in host memory * @param size Size of the buffer */ explicit source_info(const char* host_data, size_t size) - : _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}}) + : _type(io_type::HOST_BUFFER), + _host_buffers( + {cudf::host_span(reinterpret_cast(host_data), size)}) + { + } + + /** + * @brief Construct a new source info object for multiple buffers in host memory + * + * @param host_buffers Input buffers in host memory + */ + template >())> + explicit source_info(cudf::host_span> const host_buffers) + : _type(io_type::HOST_BUFFER) + { + if constexpr (not std::is_same_v, std::byte>) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const s) { + return cudf::host_span{ + reinterpret_cast(s.data()), s.size()}; + }); + } else { + _host_buffers.assign(host_buffers.begin(), host_buffers.end()); + } + } + + /** + * @brief Construct a new source info object for a single buffer + * + * @param host_data Input buffer in host memory + */ + template >())> + explicit source_info(cudf::host_span host_data) + : _type(io_type::HOST_BUFFER), + _host_buffers{cudf::host_span( + reinterpret_cast(host_data.data()), host_data.size())} { } @@ -289,7 +356,7 @@ struct source_info { private: io_type _type = io_type::FILEPATH; std::vector _filepaths; - std::vector _host_buffers; + std::vector> _host_buffers; std::vector> _device_buffers; std::vector _user_sources; }; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index c2f7b18d443..71d64900398 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -329,10 +329,16 @@ std::unique_ptr datasource::create(const std::string& filepath, } std::unique_ptr datasource::create(host_buffer const& buffer) +{ + return create( + cudf::host_span{reinterpret_cast(buffer.data), buffer.size}); +} + +std::unique_ptr datasource::create(cudf::host_span buffer) { // Use Arrow IO buffer class for zero-copy reads of host memory return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data), buffer.size)); + reinterpret_cast(buffer.data()), buffer.size())); } std::unique_ptr datasource::create(cudf::device_span buffer) diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index c11d7ad47f9..18599fb568a 100644 --- 
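As a usage sketch for the span-based source overloads above: the snippet below shows how a caller might wrap an in-memory buffer in a `cudf::host_span` and hand it to `cudf::io::source_info` for a Parquet read. It assumes the `source_info(cudf::host_span<T>)` constructor introduced in this diff; the function name and the choice of a `std::vector<char>` buffer are illustrative only.

```cpp
// Sketch: reading Parquet from host memory via the new span-based source_info.
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/utilities/span.hpp>

#include <vector>

cudf::io::table_with_metadata read_parquet_from_memory(std::vector<char> const& buffer)
{
  // Non-owning view of the host memory; no copy is made here.
  auto const span = cudf::host_span<char const>(buffer.data(), buffer.size());

  // Any byte-like element type (char, unsigned char, std::byte, ...) is accepted.
  auto const source  = cudf::io::source_info(span);
  auto const options = cudf::io::parquet_reader_options::builder(source).build();
  return cudf::io::read_parquet(options);
}
```

The new `BufferSourceTypes` test below performs the same round trip for each byte-like element type.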
a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -31,14 +33,10 @@ #include #include -#include -#include -#include #include #include #include #include -#include #include #include @@ -46,321 +44,8 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -/** - * @brief Base class for delimiter-based tokenizers. - * - * These are common methods used by both split and rsplit tokenizer functors. - */ -struct base_split_tokenizer { - __device__ const char* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } - - __device__ string_view const get_string(size_type idx) const - { - return d_strings.element(idx); - } - - __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - - /** - * @brief Initialize token elements for all strings. - * - * The process_tokens() only handles creating tokens for strings that contain - * delimiters. This function will initialize the output tokens for all - * strings by assigning null entries for null and empty strings and the - * string itself for strings with no delimiters. - * - * The tokens are placed in output order so that all tokens for each output - * column are stored consecutively in `d_all_tokens`. - * - * @param idx Index of string in column - * @param column_count Number of columns in output - * @param d_all_tokens Tokens vector for all strings - */ - __device__ void init_tokens(size_type idx, - size_type column_count, - string_index_pair* d_all_tokens) const - { - auto d_tokens = d_all_tokens + idx; - if (is_valid(idx)) { - auto d_str = get_string(idx); - *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()}; - --column_count; - d_tokens += d_strings.size(); - } - // this is like fill() but output needs to be strided - for (size_type col = 0; col < column_count; ++col) - d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0}; - } - - base_split_tokenizer(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) - { - } - - protected: - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type max_tokens; -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - */ -struct split_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The beginning byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, - size_type const* d_token_counts, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // the first delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* str_ptr = d_str.data(); // beginning of the string - const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string - ? (base_ptr + d_positions[idx + col]) // start of next delimiter - : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string - && (col + 1 < token_count) // and this is not the last token - ? next_delim - : str_end_ptr; - // store the token into the output vector - d_tokens[col * d_strings.size()] = - string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; - // point past this delimiter - str_ptr = eptr + d_delimiter.size_bytes(); - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the start of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @param chars_bytes Total number of characters to process. - * @return true if delimiter is found starting at position `idx` - */ - __device__ bool is_delimiter(size_type idx, // chars index - int32_t const* d_offsets, - size_type chars_bytes) const - { - auto d_chars = get_base_ptr() + d_offsets[0]; - if (idx + d_delimiter.size_bytes() > chars_bytes) return false; - return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. 
- * - * @param idx Index of a delimiter - * @param d_positions Start positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, // delimiter index - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - string_view const d_str = get_string(str_idx - 1); - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; // re-count delimiters to compute the token-count - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are ignored too - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - split_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - * - * Same as split_tokenizer_fn except tokens are counted from the end of each string. - */ -struct rsplit_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * The tokens are processed from the end of each string so the `max_tokens` - * is honored correctly. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The ending byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, // delimiter position index - size_type const* d_token_counts, // token counts for each string - size_type const* d_positions, // end of each delimiter - size_type positions_count, // total number of delimiters - size_type const* d_indexes, // string indices for each delimiter - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx) - return; // the last delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* const str_begin_ptr = d_str.data(); // beginning of the string - const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto prev_delim = (idx >= col) // boundary check for delims in first string - ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter - : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string - && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; - // store the token into the output -- building the array backwards - d_tokens[d_strings.size() * (token_count - 1 - col)] = - string_index_pair{sptr, static_cast(str_ptr - sptr)}; - str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the end of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @return true if delimiter is found ending at position `idx` - */ - __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const - { - auto delim_length = d_delimiter.size_bytes(); - if (idx < delim_length - 1) return false; - auto d_chars = get_base_ptr() + d_offsets[0]; - return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. - * - * Token counting starts at the end of the string to honor the `max_tokens` - * appropriately. 
- * - * @param idx Index of a delimiter - * @param d_positions End positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound() - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are also ignored - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - rsplit_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - /** * @brief Generic split function called by split() and rsplit(). * @@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { * @return table of columns for the output of the split */ template -std::unique_ptr split_fn(strings_column_view const& strings_column, +std::unique_ptr
split_fn(strings_column_view const& input, Tokenizer tokenizer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { std::vector> results; - auto const strings_count = strings_column.size(); - if (strings_count == 0) { - results.push_back(make_empty_column(type_id::STRING)); + if (input.size() == input.null_count()) { + results.push_back(std::make_unique(input.parent(), stream, mr)); return std::make_unique
(std::move(results)); } - auto d_offsets = strings_column.offsets_begin(); - auto const chars_bytes = - cudf::detail::get_value( - strings_column.offsets(), strings_column.offset() + strings_count, stream) - - cudf::detail::get_value(strings_column.offsets(), strings_column.offset(), stream); + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); + auto const d_offsets = offsets->view().template data(); + auto const d_tokens = tokens.data(); - // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of every delimiter position in the chars column - rmm::device_uvector delimiter_positions(delimiter_count, stream); - auto d_positions = delimiter_positions.data(); - auto copy_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - delimiter_positions.begin(), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of string indices for each delimiter - rmm::device_uvector string_indices(delimiter_count, stream); // these will - auto d_string_indices = string_indices.data(); // be strings that only contain delimiters - thrust::upper_bound(rmm::exec_policy(stream), - d_offsets, - d_offsets + strings_count, - delimiter_positions.begin(), - copy_end, - string_indices.begin()); - - // compute the number of tokens per string - rmm::device_uvector token_counts(strings_count, stream); - auto d_token_counts = token_counts.data(); - // first, initialize token counts for strings without delimiters in them - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_token_counts, - [tokenizer] __device__(size_type idx) { - // null are 0, all others 1 - return static_cast(tokenizer.is_valid(idx)); - }); - - // now compute the number of tokens in each string - thrust::for_each_n( + // compute the maximum number of tokens for any string + auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( - size_type idx) { - tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts); - }); - - // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); - // boundary case: if no columns, return one null column (custrings issue #119) - if (columns_count == 0) { - results.push_back(std::make_unique( - data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); - } + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; }, + 0, + thrust::maximum{}); - // create working area to hold all token positions - rmm::device_uvector tokens(columns_count 
* strings_count, stream); - string_index_pair* d_tokens = tokens.data(); - // initialize the token positions - // -- accounts for nulls, empty, and strings with no delimiter in them - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [tokenizer, columns_count, d_tokens] __device__(size_type idx) { - tokenizer.init_tokens(idx, columns_count, d_tokens); - }); - - // get the positions for every token using the delimiter positions - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, - d_token_counts, - d_positions, - delimiter_count, - d_string_indices, - d_tokens] __device__(size_type idx) { - tokenizer.process_tokens( - idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens); - }); - - // Create each column. - // - Each pair points to the strings for that column for each row. - // - Create the strings column from the vector using the strings factory. + // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { - auto column_tokens = d_tokens + (col * strings_count); - results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); + auto itr = cudf::detail::make_counting_transform_iterator( + 0, [d_tokens, d_offsets, col] __device__(size_type idx) { + auto const offset = d_offsets[idx]; + auto const token_count = d_offsets[idx + 1] - offset; + return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0}; + }); + results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh new file mode 100644 index 00000000000..41213dac58b --- /dev/null +++ b/cpp/src/strings/split/split.cuh @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::strings::detail { + +/** + * @brief Base class for delimiter-based tokenizers + * + * These are common methods used by both split and rsplit tokenizer functors. + * + * The Derived class is required to implement the `process_tokens` function. + */ +template +struct base_split_tokenizer { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns `true` if the byte at `idx` is the start of the delimiter + * + * @param idx Index of a byte in the chars column + * @param d_offsets Offsets values to locate the chars ranges + * @param chars_bytes Total number of characters to process + * @return true if delimiter is found starting at position `idx` + */ + __device__ bool is_delimiter(size_type idx, + size_type const* d_offsets, + size_type chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_offsets[0]; + if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; } + return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; + } + + /** + * @brief This counts the tokens for strings that contain delimiters + * + * Counting tokens is the same regardless if counting from the left + * or from the right. This logic counts from the left which is simpler. + * The count will be truncated appropriately to the max_tokens value. 
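To make the counting rule above concrete, here is a host-side sketch — an illustration only, not the device code — of the same two rules: a candidate delimiter is skipped when it overlaps the previously counted match, and the result is capped by `max_tokens`. The standalone `count_tokens` helper and its `main` are hypothetical; the expected values match the `MultiByteDelimiters` test added later in this diff. (The device version additionally checks that each delimiter fits inside the string, because its positions come from a scan over the whole chars column.)

```cpp
// Host-side sketch of the token-counting rule (overlap skipping + max_tokens cap).
#include <cassert>
#include <string>

int count_tokens(std::string const& str, std::string const& delim, int max_tokens = 0)
{
  auto const delim_size = static_cast<int>(delim.size());
  int token_count = 1;           // a valid string always yields at least one token
  int last_pos    = -delim_size; // ensures the first match is always counted
  for (auto pos = str.find(delim); pos != std::string::npos; pos = str.find(delim, pos + 1)) {
    // skip matches that overlap the previously counted delimiter
    if (static_cast<int>(pos) - last_pos >= delim_size) {
      ++token_count;
      last_pos = static_cast<int>(pos);
    }
  }
  // a max_tokens of 0 means "no limit"
  return (max_tokens > 0 && token_count > max_tokens) ? max_tokens : token_count;
}

int main()
{
  assert(count_tokens("w:::x", "::") == 2);     // "w", ":x"
  assert(count_tokens("y::::z", "::") == 3);    // "y", "", "z"
  assert(count_tokens(":::c:::", "::") == 3);   // "", ":c", ":"
  assert(count_tokens("y::::z", "::", 2) == 2); // capped by max_tokens
  return 0;
}
```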
+ * + * @param idx Index of input string + * @param d_positions Start positions of all the delimiters + * @param d_delimiter_offsets Offsets per string to delimiters in d_positions + */ + __device__ size_type count_tokens(size_type idx, + size_type const* d_positions, + size_type const* d_delimiter_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const delim_size = d_delimiter.size_bytes(); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr() + delim_size - 1; + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + size_type token_count = 1; // all strings will have at least one token + size_type last_pos = delimiters[0] - delim_size; + for (auto d_pos : delimiters) { + // delimiter must fit in string && overlapping delimiters are ignored + if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) { + ++token_count; + last_pos = d_pos; + } + } + // number of tokens is capped to max_tokens + return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count; + } + + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * in which the delimiter resides + * + * Each token is placed in `d_all_tokens` so they align consecutively + * with other tokens for the same output column. + * + * The actual token extraction is performed in the subclass process_tokens() function. + * + * @param idx Index of the string to tokenize + * @param d_tokens_offsets Token offsets for each string + * @param d_positions The beginning byte position of each delimiter + * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string + * @param d_all_tokens All output tokens for the strings column + */ + __device__ void get_tokens(size_type idx, + size_type const* d_tokens_offsets, + size_type const* d_positions, + size_type const* d_delimiter_offsets, + string_index_pair* d_all_tokens) const + { + auto const d_tokens = // this string's tokens output + cudf::device_span(d_all_tokens + d_tokens_offsets[idx], + d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]); + + if (!is_valid(idx)) { return; } + + auto const d_str = get_string(idx); + + // max_tokens already included in token counts + if (d_tokens.size() == 1) { + d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + return; + } + + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + auto& derived = static_cast(*this); + derived.process_tokens(d_str, delimiters, d_tokens); + } + + base_split_tokenizer(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) + { + } + + protected: + column_device_view const d_strings; // strings to split + string_view const d_delimiter; // delimiter for split + size_type max_tokens; // maximum number of tokens to identify +}; + +/** + * @brief The tokenizer functions for forward splitting + */ +struct split_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the beginning of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
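Worked example for the forward pass: splitting "y::::z" on "::", the column-wide scan reports candidate positions 1, 2, and 3 (byte offsets within the string). Position 2 begins inside the match already consumed at position 1, so `process_tokens` skips it and emits the tokens "y", "" and "z", which is exactly what the new `MultiByteDelimiters` test expects from `split_record`.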
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto str_ptr = d_str.data(); + auto const str_end = str_ptr + d_str.size_bytes(); // end of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + size_type token_idx = 0; + for (auto d_pos : d_delimiters) { + auto const next_delim = base_ptr + d_pos; + if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; } + auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end; + + // store the token into the output vector + d_tokens[token_idx++] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, end_ptr))}; + + // setup for next token + str_ptr = end_ptr + delim_size; + } + // include anything leftover + if (token_idx < token_count) { + d_tokens[token_idx] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, str_end))}; + } + } + + split_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief The tokenizer functions for backwards splitting + * + * Same as split_tokenizer_fn except delimiters are searched from the end of each string. + */ +struct rsplit_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the end of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. + * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr + auto const str_begin = d_str.data(); // beginning of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_count = static_cast(d_delimiters.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + auto str_ptr = str_begin + d_str.size_bytes(); + size_type token_idx = 0; + for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left + auto const prev_delim = base_ptr + d_delimiters[d] + delim_size; + if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; } + auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin; + + // store the token into the output vector right-to-left + d_tokens[token_count - token_idx - 1] = + string_index_pair{start_ptr, static_cast(thrust::distance(start_ptr, str_ptr))}; + + // setup for next token + str_ptr = start_ptr - delim_size; + ++token_idx; + } + // include anything leftover (rightover?) 
+ if (token_idx < token_count) { + d_tokens[0] = + string_index_pair{str_begin, static_cast(thrust::distance(str_begin, str_ptr))}; + } + } + + rsplit_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief Helper function used by split/rsplit and split_record/rsplit_record + * + * This function returns all the token/split positions within the input column as processed by + * the given tokenizer. It also returns the offsets for each set of tokens identified per string. + * + * @tparam Tokenizer Type of the tokenizer object + * + * @param input The input column of strings to split + * @param tokenizer Object used for counting and identifying delimiters and tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned objects' device memory. + */ +template +std::pair, rmm::device_uvector> split_helper( + strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_offsets = input.offsets_begin(); + + // count the number of delimiters in the entire column + auto const delimiter_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + // Create a vector of every delimiter position in the chars column. + // These may include overlapping or otherwise out-of-bounds delimiters which + // will be resolved during token processing. 
+ auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto d_positions = delimiter_positions.data(); + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + delimiter_positions.begin(), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + + // create a vector of offsets to each string's delimiter set within delimiter_positions + auto const delimiter_offsets = [&] { + // first, create a vector of string indices for each delimiter + auto string_indices = rmm::device_uvector(delimiter_count, stream); + thrust::upper_bound(rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count, + delimiter_positions.begin(), + copy_end, + string_indices.begin()); + + // compute delimiter offsets per string + auto delimiter_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_delimiter_offsets = delimiter_offsets.data(); + + // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters + CUDF_CUDA_TRY(cudaMemsetAsync( + d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value())); + + // next, count the number of delimiters per string + auto d_string_indices = string_indices.data(); // identifies strings with delimiters only + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + delimiter_count, + [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_delimiter_offsets + str_idx, 1); + }); + // finally, convert the delimiter counts into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + delimiter_offsets.begin(), + delimiter_offsets.end(), + delimiter_offsets.begin()); + return delimiter_offsets; + }(); + auto const d_delimiter_offsets = delimiter_offsets.data(); + + // compute the number of tokens per string + auto token_counts = rmm::device_uvector(strings_count, stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + token_counts.begin(), + [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type { + return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets); + }); + + // create offsets from the counts for return to the caller + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr)); + auto const total_tokens = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_tokens_offsets = offsets->view().data(); + + // build a vector of all the token positions for all the strings + auto tokens = rmm::device_uvector(total_tokens, stream); + auto d_tokens = tokens.data(); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__( + size_type idx) { + tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens); + }); + + return std::make_pair(std::move(offsets), std::move(tokens)); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d935ad0b1da..5b79fdefb5a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -23,14 +25,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -38,108 +38,43 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -enum class Dir { FORWARD, BACKWARD }; - -/** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. - * - * The number of tokens is the same regardless if counting from the beginning - * or the end of the string. - */ -struct token_counter_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type const max_tokens = std::numeric_limits::max(); - - __device__ size_type operator()(size_type idx) const - { - if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type start_pos = 0; - while (token_count < max_tokens - 1) { - auto const delimiter_pos = d_str.find(d_delimiter, start_pos); - if (delimiter_pos == string_view::npos) break; - token_count++; - start_pos = delimiter_pos + d_delimiter.length(); - } - return token_count + 1; // always at least one token - } -}; - -/** - * @brief Identify the tokens from the `idx'th` string element of `d_strings`. - */ -template -struct token_reader_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - int32_t* d_token_offsets{}; // for locating tokens in d_tokens - string_index_pair* d_tokens{}; - - __device__ string_index_pair resolve_token(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const - { - if (dir == Dir::FORWARD) { - auto const byte_offset = d_str.byte_offset(start_pos); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(delimiter_pos) - byte_offset}; - } else { - auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(end_pos) - byte_offset}; - } +template +std::unique_ptr split_record_fn(strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::LIST); } + if (input.size() == input.null_count()) { + auto offsets = std::make_unique(input.offsets(), stream, mr); + auto results = make_empty_column(type_id::STRING); + return make_lists_column(input.size(), + std::move(offsets), + std::move(results), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); } - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return; } + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); - auto const token_offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // Pandas str.split("") for non-whitespace delimiter is an empty string - *d_result = 
string_index_pair{"", 0}; - return; - } + // build a strings column from the tokens + auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr); - size_type token_idx = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward - while (token_idx < token_count - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos == string_view::npos) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - d_result[token_idx] = token; - start_pos = delimiter_pos + d_delimiter.length(); - } else { - d_result[token_count - 1 - token_idx] = token; - end_pos = delimiter_pos; - } - token_idx++; - } + return make_lists_column(input.size(), + std::move(offsets), + std::move(strings_child), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} - // set last token to remainder of the string - if (dir == Dir::FORWARD) { - auto const offset_bytes = d_str.byte_offset(start_pos); - d_result[token_idx] = - string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; - } else { - d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; - } - } -}; +enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. @@ -196,7 +131,7 @@ struct whitespace_token_reader_fn { whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; position_pair token{0, 0}; - if (dir == Dir::FORWARD) { + if constexpr (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { token = tokenizer.get_token(); d_result[token_idx++] = @@ -224,11 +159,11 @@ struct whitespace_token_reader_fn { // The output is one list item per string template -std::unique_ptr split_record_fn(strings_column_view const& strings, - TokenCounter counter, - TokenReader reader, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr whitespace_split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); @@ -244,7 +179,7 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); reader.d_token_offsets = d_offsets; @@ -277,18 +212,21 @@ std::unique_ptr split_record(strings_column_view const& strings, auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return split_record_fn(strings, - whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - stream, - mr); + return whitespace_split_record_fn( + strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + 
whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, - token_reader_fn{*d_strings_column_ptr, d_delimiter}, - stream, - mr); + if (dir == Dir::FORWARD) { + return split_record_fn( + strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } else { + return split_record_fn( + strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } } } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 21752196430..48f69e3ecd3 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -357,6 +357,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id()}; } }; +template +struct ParquetReaderSourceTest : public ParquetReaderTest { +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; @@ -369,6 +373,8 @@ using SupportedTimestampTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); +using ByteLikeTypes = cudf::test::Types; +TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { @@ -5113,4 +5119,72 @@ TEST_P(ParquetSizedTest, DictionaryTest) EXPECT_EQ(nbits, GetParam()); } +TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info( + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span( + reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } +} + +TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + auto full_table = cudf::concatenate(std::vector({*table, *table})); + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, 
result.tbl->view()); + } + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 73d5adab427..714c1ad416a 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, MultiByteDelimiters) +{ + // Overlapping delimiters + auto input = + cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"}); + auto view = cudf::strings_column_view(input); + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("::")); + auto expected_left = LCW({LCW{"u", ""}, + LCW{"w", ":x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{"", ":b"}, + LCW{"", ":c", ":"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("::")); + auto expected_right = LCW({LCW{"u", ""}, + LCW{"w:", "x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{":", "b"}, + LCW{":", "c:", ""}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("::")); + + auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""}); + auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"}); + auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_left = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left); + + result = cudf::strings::rsplit(view, cudf::string_scalar("::")); + + c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"}); + c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"}); + c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1}); + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_right = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right); + } + + // Delimiters that span across adjacent strings + input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"}); + view = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{")); + auto expected = LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("}:{")); + + auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"}); + auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + auto expected = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + + result = cudf::strings::rsplit(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + } +} + TEST_F(StringsSplitTest, SplitRegex) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index db64dcb08c7..937077c89c9 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1045,8 +1045,8 @@ void decimal128Cv() { BigInteger bigInteger2 = new BigInteger("14"); BigInteger bigInteger3 = new BigInteger("152345742357340573405745"); final BigInteger[] bigInts = new BigInteger[] {bigInteger1, bigInteger2, bigInteger3}; - try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts)) { - HostColumnVector hostColumnVector = v.copyToHost(); + try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts); + HostColumnVector hostColumnVector = v.copyToHost()) { assertEquals(bigInteger1, hostColumnVector.getBigDecimal(0).unscaledValue()); assertEquals(bigInteger2, hostColumnVector.getBigDecimal(1).unscaledValue()); assertEquals(bigInteger3, hostColumnVector.getBigDecimal(2).unscaledValue()); diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 86c340bb321..f4b652a7d03 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,7 +41,7 @@ public void testDoubleClose() { } @Test - public void testIncRef() { + public void testIncRefAndDoubleFree() { Scalar s = Scalar.fromNull(DType.INT32); try (Scalar ignored1 = s) { try (Scalar ignored2 = s.incRefCount()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4f00bc7493d..c31bcf4f78d 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -244,7 +244,7 @@ void testOrderByWithNullsAndStrings() { } @Test - void testTableCreationIncreasesRefCount() { + void testTableCreationIncreasesRefCountWithDoubleFree() { //tests the Table increases the refcount on column vectors assertThrows(IllegalStateException.class, () -> { try (ColumnVector v1 = ColumnVector.build(DType.INT32, 5, Range.appendInts(5)); diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index ac058b1d9a1..c528eb69575 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -72,15 +72,6 @@ endif() include(rapids-cython) if(NOT cudf_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required - # languages for the C++ project even if this project does not require those languages. - include(rapids-cuda) - rapids_cuda_init_architectures(cudf-python) - enable_language(CUDA) - # Since cudf only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_cudf-python_INCLUDE}") - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 965b413e84f..fb1bcf6d673 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -607,7 +607,7 @@ def _scatter_by_slice( start, stop, step = key.indices(len(self)) if start >= stop: return None - num_keys = (stop - start) // step + num_keys = len(range(start, stop, step)) self._check_scatter_key_length(num_keys, value) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 6b2fb90e95b..d59226ee17a 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -347,3 +347,13 @@ def test_series_setitem_upcasting_string_value(): assert_eq(pd.Series([10, 0, 0], dtype=int), sr) with pytest.raises(ValueError): sr[0] = "non-integer" + + +def test_scatter_by_slice_with_start_and_step(): + source = pd.Series([1, 2, 3, 4, 5]) + csource = cudf.from_pandas(source) + target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + ctarget = cudf.from_pandas(target) + target[1::2] = source + ctarget[1::2] = csource + assert_eq(target, ctarget) diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index ebf47ee8469..0c07236682f 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -18,8 +18,6 @@ include(rapids-cmake) include(rapids-cpm) include(rapids-find) -rapids_cuda_init_architectures(udf-cpp) - rapids_cpm_init() rapids_find_package(
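On the Python side, the `_scatter_by_slice` change fixes an off-by-one in the key count: for `target[1::2] = source` on a length-10 series, `key.indices(len(self))` yields `start=1, stop=10, step=2`, so the old `(stop - start) // step` computes `(10 - 1) // 2 = 4` even though the slice covers five positions (1, 3, 5, 7, 9). `len(range(start, stop, step))` returns the correct 5, which is the case the new `test_scatter_by_slice_with_start_and_step` test exercises.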