diff --git a/build.sh b/build.sh
index e6169b2036b..bee66d819b4 100755
--- a/build.sh
+++ b/build.sh
@@ -315,9 +315,11 @@ if buildAll || hasArg libcudf; then
LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}')
MSG="${MSG}
libcudf.so size: $LIBCUDF_FS"
fi
- echo "$MSG"
- python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html
- cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log
+ BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"}
+ echo "Metrics output dir: [$BMR_DIR]"
+ mkdir -p ${BMR_DIR}
+ python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html
+ cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log
fi
if [[ ${INSTALL_TARGET} != "" ]]; then
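
(Aside, not part of the patch: `${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"}` is plain shell parameter expansion, so a local build without the CI variable keeps writing ninja_log.html and ninja.log into the build directory, while CI runs redirect both into the artifacts directory that ci/build_cpp.sh uploads from below.)
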
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 3b45b3ce2e7..b68c2bdbef6 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
set -euo pipefail
@@ -14,3 +14,29 @@ rapids-logger "Begin cpp build"
rapids-mamba-retry mambabuild conda/recipes/libcudf
rapids-upload-conda-to-s3 cpp
+
+echo "++++++++++++++++++++++++++++++++++++++++++++"
+
+if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then
+ ls -l ${RAPIDS_ARTIFACTS_DIR}
+fi
+
+echo "++++++++++++++++++++++++++++++++++++++++++++"
+
+FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log
+if [[ -f $FILE ]]; then
+ echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m"
+ UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log
+ rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}"
+fi
+
+echo "++++++++++++++++++++++++++++++++++++++++++++"
+
+FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html
+if [[ -f $FILE ]]; then
+ echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m"
+ UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html
+ rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}"
+fi
+
+echo "++++++++++++++++++++++++++++++++++++++++++++"
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 0be72486319..983a63d4ce9 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -66,21 +66,5 @@ for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
fi
done
-if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
- rapids-logger "Memcheck gtests with rmm_mode=cuda"
- export GTEST_CUDF_RMM_MODE=cuda
- COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
- for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
- test_name=$(basename ${gt})
- if [[ "$test_name" == "ERROR_TEST" ]]; then
- continue
- fi
- echo "Running gtest $test_name"
- ${COMPUTE_SANITIZER_CMD} ${gt} | tee "${RAPIDS_TESTS_DIR}${test_name}.cs.log"
- done
- unset GTEST_CUDF_RMM_MODE
- # TODO: test-results/*.cs.log are processed in CI
-fi
-
rapids-logger "Test script exiting with value: $EXITCODE"
exit ${EXITCODE}
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index b0b86b427b7..fbfcf6e71a2 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -27,6 +27,7 @@ build:
- SCCACHE_IDLE_TIMEOUT
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
+ - RAPIDS_ARTIFACTS_DIR
requirements:
build:
diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp
index 0f005c462cc..1b3f4190680 100644
--- a/cpp/benchmarks/string/split.cpp
+++ b/cpp/benchmarks/string/split.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt)
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
- int const min_rows = 1 << 12;
- int const max_rows = 1 << 24;
- int const row_mult = 8;
- int const min_rowlen = 1 << 5;
- int const max_rowlen = 1 << 13;
- int const len_mult = 4;
+ int constexpr min_rows = 1 << 12;
+ int constexpr max_rows = 1 << 24;
+ int constexpr row_mult = 8;
+ int constexpr min_rowlen = 1 << 5;
+ int constexpr max_rowlen = 1 << 13;
+ int constexpr len_mult = 2;
for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
// avoid generating combinations that exceed the cudf column limit
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index fd4c049e2fc..a0ef2155f7d 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -112,11 +112,21 @@ class datasource {
/**
* @brief Creates a source from a host memory buffer.
*
+ * @deprecated Since 23.04
+ *
* @param[in] buffer Host buffer object
* @return Constructed datasource object
*/
static std::unique_ptr<datasource> create(host_buffer const& buffer);
+ /**
+ * @brief Creates a source from a host memory buffer.
+ *
+ * @param[in] buffer Host buffer object
+ * @return Constructed datasource object
+ */
+ static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
+
/**
* @brief Creates a source from a device memory buffer.
*
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 06b52563e19..6f97eb768d9 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -150,6 +150,8 @@ struct table_with_metadata {
/**
* @brief Non-owning view of a host memory buffer
*
+ * @deprecated Since 23.04
+ *
* Used to describe buffer input in `source_info` objects.
*/
struct host_buffer {
@@ -166,6 +168,22 @@ struct host_buffer {
host_buffer(const char* data, size_t size) : data(data), size(size) {}
};
+/**
+ * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to
+ * bytes.
+ *
+ * @tparam T The representation type
+ * @return `true` if the type is considered a byte-like type
+ */
+template <typename T>
+constexpr inline auto is_byte_like_type()
+{
+  using non_cv_T = std::remove_cv_t<T>;
+  return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
+         std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
+         std::is_same_v<non_cv_T, std::byte>;
+}
+
/**
* @brief Source information for read interfaces
*/
@@ -191,21 +209,70 @@ struct source_info {
/**
* @brief Construct a new source info object for multiple buffers in host memory
*
+ * @deprecated Since 23.04
+ *
* @param host_buffers Input buffers in host memory
*/
- explicit source_info(std::vector<host_buffer> const& host_buffers)
-   : _type(io_type::HOST_BUFFER), _host_buffers(host_buffers)
+ explicit source_info(std::vector<host_buffer> const& host_buffers) : _type(io_type::HOST_BUFFER)
{
+ _host_buffers.reserve(host_buffers.size());
+ std::transform(host_buffers.begin(),
+ host_buffers.end(),
+ std::back_inserter(_host_buffers),
+ [](auto const hb) {
+ return cudf::host_span<std::byte const>{
+   reinterpret_cast<std::byte const*>(hb.data), hb.size};
+ });
}
/**
* @brief Construct a new source info object for a single buffer
*
+ * @deprecated Since 23.04
+ *
* @param host_data Input buffer in host memory
* @param size Size of the buffer
*/
explicit source_info(const char* host_data, size_t size)
- : _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}})
+ : _type(io_type::HOST_BUFFER),
+ _host_buffers(
+ {cudf::host_span<std::byte const>(reinterpret_cast<std::byte const*>(host_data), size)})
+ {
+ }
+
+ /**
+ * @brief Construct a new source info object for multiple buffers in host memory
+ *
+ * @param host_buffers Input buffers in host memory
+ */
+ template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
+ explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
+ : _type(io_type::HOST_BUFFER)
+ {
+ if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
+ _host_buffers.reserve(host_buffers.size());
+ std::transform(host_buffers.begin(),
+ host_buffers.end(),
+ std::back_inserter(_host_buffers),
+ [](auto const s) {
+ return cudf::host_span<std::byte const>{
+   reinterpret_cast<std::byte const*>(s.data()), s.size()};
+ });
+ } else {
+ _host_buffers.assign(host_buffers.begin(), host_buffers.end());
+ }
+ }
+
+ /**
+ * @brief Construct a new source info object for a single buffer
+ *
+ * @param host_data Input buffer in host memory
+ */
+ template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
+ explicit source_info(cudf::host_span<T> host_data)
+   : _type(io_type::HOST_BUFFER),
+     _host_buffers{cudf::host_span<std::byte const>(
+       reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
{
}
@@ -289,7 +356,7 @@ struct source_info {
private:
io_type _type = io_type::FILEPATH;
std::vector<std::string> _filepaths;
- std::vector<host_buffer> _host_buffers;
+ std::vector<cudf::host_span<std::byte const>> _host_buffers;
std::vector<cudf::device_span<std::byte const>> _device_buffers;
std::vector<cudf::io::datasource*> _user_sources;
};
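
Sketch of how the new span constructors are called; any element type accepted by is_byte_like_type() works. The helper functions here are illustrative only:

#include <cudf/io/types.hpp>
#include <cudf/utilities/span.hpp>

#include <cstdint>
#include <vector>

// Single byte-like buffer.
cudf::io::source_info make_single(std::vector<uint8_t> const& a)
{
  return cudf::io::source_info(cudf::host_span<uint8_t const>(a.data(), a.size()));
}

// Several buffers presented as one multi-buffer source.
cudf::io::source_info make_multi(std::vector<cudf::host_span<uint8_t const>>& spans)
{
  return cudf::io::source_info(
    cudf::host_span<cudf::host_span<uint8_t const>>(spans.data(), spans.size()));
}
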
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index c2f7b18d443..71d64900398 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -329,10 +329,16 @@ std::unique_ptr<datasource> datasource::create(const std::string& filepath,
}
std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
+{
+ return create(
+   cudf::host_span<std::byte const>{reinterpret_cast<std::byte const*>(buffer.data), buffer.size});
+}
+
+std::unique_ptr<datasource> datasource::create(cudf::host_span<std::byte const> buffer)
{
// Use Arrow IO buffer class for zero-copy reads of host memory
return std::make_unique<arrow_io_source>(std::make_shared<arrow::io::BufferReader>(
-   reinterpret_cast<const uint8_t*>(buffer.data), buffer.size));
+   reinterpret_cast<const uint8_t*>(buffer.data()), buffer.size()));
}
std::unique_ptr<datasource> datasource::create(cudf::device_span<std::byte const> buffer)
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index c11d7ad47f9..18599fb568a 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include "split.cuh"
+
#include
#include
#include
@@ -31,14 +33,10 @@
#include
#include
-#include
-#include
-#include
#include
#include
#include
#include
-#include
#include
#include
@@ -46,321 +44,8 @@ namespace cudf {
namespace strings {
namespace detail {
-using string_index_pair = thrust::pair<const char*, size_type>;
-
namespace {
-/**
- * @brief Base class for delimiter-based tokenizers.
- *
- * These are common methods used by both split and rsplit tokenizer functors.
- */
-struct base_split_tokenizer {
- __device__ const char* get_base_ptr() const
- {
- return d_strings.child(strings_column_view::chars_column_index).data<char>();
- }
-
- __device__ string_view const get_string(size_type idx) const
- {
- return d_strings.element<string_view>(idx);
- }
-
- __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); }
-
- /**
- * @brief Initialize token elements for all strings.
- *
- * The process_tokens() only handles creating tokens for strings that contain
- * delimiters. This function will initialize the output tokens for all
- * strings by assigning null entries for null and empty strings and the
- * string itself for strings with no delimiters.
- *
- * The tokens are placed in output order so that all tokens for each output
- * column are stored consecutively in `d_all_tokens`.
- *
- * @param idx Index of string in column
- * @param column_count Number of columns in output
- * @param d_all_tokens Tokens vector for all strings
- */
- __device__ void init_tokens(size_type idx,
- size_type column_count,
- string_index_pair* d_all_tokens) const
- {
- auto d_tokens = d_all_tokens + idx;
- if (is_valid(idx)) {
- auto d_str = get_string(idx);
- *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()};
- --column_count;
- d_tokens += d_strings.size();
- }
- // this is like fill() but output needs to be strided
- for (size_type col = 0; col < column_count; ++col)
- d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0};
- }
-
- base_split_tokenizer(column_device_view const& d_strings,
- string_view const& d_delimiter,
- size_type max_tokens)
- : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens)
- {
- }
-
- protected:
- column_device_view const d_strings; // strings to split
- string_view const d_delimiter; // delimiter for split
- size_type max_tokens;
-};
-
-/**
- * @brief The tokenizer functions for split().
- *
- * The methods here count delimiters, tokens, and output token elements
- * for each string in a strings column.
- */
-struct split_tokenizer_fn : base_split_tokenizer {
- /**
- * @brief This will create tokens around each delimiter honoring the string boundaries
- * in which the delimiter resides.
- *
- * Each token is placed in `d_all_tokens` so they align consecutively
- * with other tokens for the same output column.
- * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col`
- * for string at `string_index`.
- *
- * @param idx Index of the delimiter in the chars column
- * @param d_token_counts Token counts for each string
- * @param d_positions The beginning byte position of each delimiter
- * @param positions_count Number of delimiters
- * @param d_indexes Indices of the strings for each delimiter
- * @param d_all_tokens All output tokens for the strings column
- */
- __device__ void process_tokens(size_type idx,
- size_type const* d_token_counts,
- size_type const* d_positions,
- size_type positions_count,
- size_type const* d_indexes,
- string_index_pair* d_all_tokens) const
- {
- size_type str_idx = d_indexes[idx];
- if ((idx > 0) && d_indexes[idx - 1] == str_idx)
- return; // the first delimiter for the string rules them all
- --str_idx; // all of these are off by 1 from the upper_bound call
- size_type token_count = d_token_counts[str_idx]; // max_tokens already included
- const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr
- // this string's tokens output
- auto d_tokens = d_all_tokens + str_idx;
- // this string
- const string_view d_str = get_string(str_idx);
- const char* str_ptr = d_str.data(); // beginning of the string
- const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string
- // build the index-pair of each token for this string
- for (size_type col = 0; col < token_count; ++col) {
- auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string
- ? (base_ptr + d_positions[idx + col]) // start of next delimiter
- : str_end_ptr; // or end of this string
- auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string
- && (col + 1 < token_count) // and this is not the last token
- ? next_delim
- : str_end_ptr;
- // store the token into the output vector
- d_tokens[col * d_strings.size()] =
- string_index_pair{str_ptr, static_cast<size_type>(eptr - str_ptr)};
- // point past this delimiter
- str_ptr = eptr + d_delimiter.size_bytes();
- }
- }
-
- /**
- * @brief Returns `true` if the byte at `idx` is the start of the delimiter.
- *
- * @param idx Index of a byte in the chars column.
- * @param d_offsets Offsets values to locate the chars ranges.
- * @param chars_bytes Total number of characters to process.
- * @return true if delimiter is found starting at position `idx`
- */
- __device__ bool is_delimiter(size_type idx, // chars index
- int32_t const* d_offsets,
- size_type chars_bytes) const
- {
- auto d_chars = get_base_ptr() + d_offsets[0];
- if (idx + d_delimiter.size_bytes() > chars_bytes) return false;
- return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0;
- }
-
- /**
- * @brief This counts the tokens for strings that contain delimiters.
- *
- * @param idx Index of a delimiter
- * @param d_positions Start positions of all the delimiters
- * @param positions_count The number of delimiters
- * @param d_indexes Indices of the strings for each delimiter
- * @param d_counts The token counts for all the strings
- */
- __device__ void count_tokens(size_type idx, // delimiter index
- size_type const* d_positions,
- size_type positions_count,
- size_type const* d_indexes,
- size_type* d_counts) const
- {
- size_type str_idx = d_indexes[idx];
- if ((idx > 0) && d_indexes[idx - 1] == str_idx)
- return; // first delimiter found handles all of them for this string
- auto const delim_length = d_delimiter.size_bytes();
- string_view const d_str = get_string(str_idx - 1);
- const char* const base_ptr = get_base_ptr();
- size_type delim_count = 0; // re-count delimiters to compute the token-count
- size_type last_pos = d_positions[idx] - delim_length;
- while ((idx < positions_count) && (d_indexes[idx] == str_idx)) {
- // make sure the whole delimiter is inside the string before counting it
- auto d_pos = d_positions[idx];
- if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) &&
- ((d_pos - last_pos) >= delim_length)) {
- ++delim_count; // only count if the delimiter fits
- last_pos = d_pos; // overlapping delimiters are ignored too
- }
- ++idx;
- }
- // the number of tokens is delim_count+1 but capped to max_tokens
- d_counts[str_idx - 1] =
- ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1;
- }
-
- split_tokenizer_fn(column_device_view const& d_strings,
- string_view const& d_delimiter,
- size_type max_tokens)
- : base_split_tokenizer(d_strings, d_delimiter, max_tokens)
- {
- }
-};
-
-/**
- * @brief The tokenizer functions for split().
- *
- * The methods here count delimiters, tokens, and output token elements
- * for each string in a strings column.
- *
- * Same as split_tokenizer_fn except tokens are counted from the end of each string.
- */
-struct rsplit_tokenizer_fn : base_split_tokenizer {
- /**
- * @brief This will create tokens around each delimiter honoring the string boundaries
- * in which the delimiter resides.
- *
- * The tokens are processed from the end of each string so the `max_tokens`
- * is honored correctly.
- *
- * Each token is placed in `d_all_tokens` so they align consecutively
- * with other tokens for the same output column.
- * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col`
- * for string at `string_index`.
- *
- * @param idx Index of the delimiter in the chars column
- * @param d_token_counts Token counts for each string
- * @param d_positions The ending byte position of each delimiter
- * @param positions_count Number of delimiters
- * @param d_indexes Indices of the strings for each delimiter
- * @param d_all_tokens All output tokens for the strings column
- */
- __device__ void process_tokens(size_type idx, // delimiter position index
- size_type const* d_token_counts, // token counts for each string
- size_type const* d_positions, // end of each delimiter
- size_type positions_count, // total number of delimiters
- size_type const* d_indexes, // string indices for each delimiter
- string_index_pair* d_all_tokens) const
- {
- size_type str_idx = d_indexes[idx];
- if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx)
- return; // the last delimiter for the string rules them all
- --str_idx; // all of these are off by 1 from the upper_bound call
- size_type token_count = d_token_counts[str_idx]; // max_tokens already included
- const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr
- // this string's tokens output
- auto d_tokens = d_all_tokens + str_idx;
- // this string
- const string_view d_str = get_string(str_idx);
- const char* const str_begin_ptr = d_str.data(); // beginning of the string
- const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string
- // build the index-pair of each token for this string
- for (size_type col = 0; col < token_count; ++col) {
- auto prev_delim = (idx >= col) // boundary check for delims in first string
- ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter
- : str_begin_ptr; // or the start of this string
- auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string
- && (col + 1 < token_count) // and this is not the last token
- ? prev_delim
- : str_begin_ptr;
- // store the token into the output -- building the array backwards
- d_tokens[d_strings.size() * (token_count - 1 - col)] =
- string_index_pair{sptr, static_cast<size_type>(str_ptr - sptr)};
- str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token
- }
- }
-
- /**
- * @brief Returns `true` if the byte at `idx` is the end of the delimiter.
- *
- * @param idx Index of a byte in the chars column.
- * @param d_offsets Offsets values to locate the chars ranges.
- * @return true if delimiter is found ending at position `idx`
- */
- __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const
- {
- auto delim_length = d_delimiter.size_bytes();
- if (idx < delim_length - 1) return false;
- auto d_chars = get_base_ptr() + d_offsets[0];
- return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0;
- }
-
- /**
- * @brief This counts the tokens for strings that contain delimiters.
- *
- * Token counting starts at the end of the string to honor the `max_tokens`
- * appropriately.
- *
- * @param idx Index of a delimiter
- * @param d_positions End positions of all the delimiters
- * @param positions_count The number of delimiters
- * @param d_indexes Indices of the strings for each delimiter
- * @param d_counts The token counts for all the strings
- */
- __device__ void count_tokens(size_type idx,
- size_type const* d_positions,
- size_type positions_count,
- size_type const* d_indexes,
- size_type* d_counts) const
- {
- size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound()
- if ((idx > 0) && d_indexes[idx - 1] == str_idx)
- return; // first delimiter found handles all of them for this string
- auto const delim_length = d_delimiter.size_bytes();
- const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index
- const char* const base_ptr = get_base_ptr();
- size_type delim_count = 0;
- size_type last_pos = d_positions[idx] - delim_length;
- while ((idx < positions_count) && (d_indexes[idx] == str_idx)) {
- // make sure the whole delimiter is inside the string before counting it
- auto d_pos = d_positions[idx];
- if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) &&
- ((d_pos - last_pos) >= delim_length)) {
- ++delim_count; // only count if the delimiter fits
- last_pos = d_pos; // overlapping delimiters are also ignored
- }
- ++idx;
- }
- // the number of tokens is delim_count+1 but capped to max_tokens
- d_counts[str_idx - 1] =
- ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1;
- }
-
- rsplit_tokenizer_fn(column_device_view const& d_strings,
- string_view const& d_delimiter,
- size_type max_tokens)
- : base_split_tokenizer(d_strings, d_delimiter, max_tokens)
- {
- }
-};
-
/**
* @brief Generic split function called by split() and rsplit().
*
@@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer {
* @return table of columns for the output of the split
*/
template <typename Tokenizer>
-std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
+std::unique_ptr<table> split_fn(strings_column_view const& input,
Tokenizer tokenizer,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
std::vector<std::unique_ptr<column>> results;
- auto const strings_count = strings_column.size();
- if (strings_count == 0) {
- results.push_back(make_empty_column(type_id::STRING));
+ if (input.size() == input.null_count()) {
+ results.push_back(std::make_unique<column>(input.parent(), stream, mr));
return std::make_unique<table>(std::move(results));
}
- auto d_offsets = strings_column.offsets_begin();
- auto const chars_bytes =
- cudf::detail::get_value<size_type>(
- strings_column.offsets(), strings_column.offset() + strings_count, stream) -
- cudf::detail::get_value<size_type>(strings_column.offsets(), strings_column.offset(), stream);
+ // builds the offsets and the vector of all tokens
+ auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
+ auto const d_offsets = offsets->view().template data<size_type>();
+ auto const d_tokens = tokens.data();
- // count the number of delimiters in the entire column
- auto const delimiter_count =
- thrust::count_if(rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- thrust::make_counting_iterator(chars_bytes),
- [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
- return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
- });
-
- // create vector of every delimiter position in the chars column
- rmm::device_uvector<size_type> delimiter_positions(delimiter_count, stream);
- auto d_positions = delimiter_positions.data();
- auto copy_end = thrust::copy_if(rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- thrust::make_counting_iterator(chars_bytes),
- delimiter_positions.begin(),
- [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
- return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
- });
-
- // create vector of string indices for each delimiter
- rmm::device_uvector<size_type> string_indices(delimiter_count, stream); // these will
- auto d_string_indices = string_indices.data(); // be strings that only contain delimiters
- thrust::upper_bound(rmm::exec_policy(stream),
- d_offsets,
- d_offsets + strings_count,
- delimiter_positions.begin(),
- copy_end,
- string_indices.begin());
-
- // compute the number of tokens per string
- rmm::device_uvector<size_type> token_counts(strings_count, stream);
- auto d_token_counts = token_counts.data();
- // first, initialize token counts for strings without delimiters in them
- thrust::transform(rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- thrust::make_counting_iterator(strings_count),
- d_token_counts,
- [tokenizer] __device__(size_type idx) {
- // null are 0, all others 1
- return static_cast<size_type>(tokenizer.is_valid(idx));
- });
-
- // now compute the number of tokens in each string
- thrust::for_each_n(
+ // compute the maximum number of tokens for any string
+ auto const columns_count = thrust::transform_reduce(
rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
- delimiter_count,
- [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__(
- size_type idx) {
- tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts);
- });
-
- // the columns_count is the maximum number of tokens for any string
- auto const columns_count = thrust::reduce(
- rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{});
- // boundary case: if no columns, return one null column (custrings issue #119)
- if (columns_count == 0) {
- results.push_back(std::make_unique<column>(
- data_type{type_id::STRING},
- strings_count,
- rmm::device_buffer{0, stream, mr}, // no data
- cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr),
- strings_count));
- }
+ thrust::make_counting_iterator(input.size()),
+ [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; },
+ 0,
+ thrust::maximum{});
- // create working area to hold all token positions
- rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
- string_index_pair* d_tokens = tokens.data();
- // initialize the token positions
- // -- accounts for nulls, empty, and strings with no delimiter in them
- thrust::for_each_n(rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- strings_count,
- [tokenizer, columns_count, d_tokens] __device__(size_type idx) {
- tokenizer.init_tokens(idx, columns_count, d_tokens);
- });
-
- // get the positions for every token using the delimiter positions
- thrust::for_each_n(
- rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- delimiter_count,
- [tokenizer,
- d_token_counts,
- d_positions,
- delimiter_count,
- d_string_indices,
- d_tokens] __device__(size_type idx) {
- tokenizer.process_tokens(
- idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens);
- });
-
- // Create each column.
- // - Each pair points to the strings for that column for each row.
- // - Create the strings column from the vector using the strings factory.
+ // build strings columns for each token position
for (size_type col = 0; col < columns_count; ++col) {
- auto column_tokens = d_tokens + (col * strings_count);
- results.emplace_back(
- make_strings_column(column_tokens, column_tokens + strings_count, stream, mr));
+ auto itr = cudf::detail::make_counting_transform_iterator(
+ 0, [d_tokens, d_offsets, col] __device__(size_type idx) {
+ auto const offset = d_offsets[idx];
+ auto const token_count = d_offsets[idx + 1] - offset;
+ return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
+ });
+ results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
}
+
return std::make_unique(std::move(results));
}
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
new file mode 100644
index 00000000000..41213dac58b
--- /dev/null
+++ b/cpp/src/strings/split/split.cuh
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cudf::strings::detail {
+
+/**
+ * @brief Base class for delimiter-based tokenizers
+ *
+ * These are common methods used by both split and rsplit tokenizer functors.
+ *
+ * The Derived class is required to implement the `process_tokens` function.
+ */
+template <typename Derived>
+struct base_split_tokenizer {
+ __device__ char const* get_base_ptr() const
+ {
+ return d_strings.child(strings_column_view::chars_column_index).data<char>();
+ }
+
+ __device__ string_view const get_string(size_type idx) const
+ {
+ return d_strings.element<string_view>(idx);
+ }
+
+ __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); }
+
+ /**
+ * @brief Returns `true` if the byte at `idx` is the start of the delimiter
+ *
+ * @param idx Index of a byte in the chars column
+ * @param d_offsets Offsets values to locate the chars ranges
+ * @param chars_bytes Total number of characters to process
+ * @return true if delimiter is found starting at position `idx`
+ */
+ __device__ bool is_delimiter(size_type idx,
+ size_type const* d_offsets,
+ size_type chars_bytes) const
+ {
+ auto const d_chars = get_base_ptr() + d_offsets[0];
+ if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; }
+ return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0;
+ }
+
+ /**
+ * @brief This counts the tokens for strings that contain delimiters
+ *
+ * Counting tokens is the same regardless if counting from the left
+ * or from the right. This logic counts from the left which is simpler.
+ * The count will be truncated appropriately to the max_tokens value.
+ *
+ * @param idx Index of input string
+ * @param d_positions Start positions of all the delimiters
+ * @param d_delimiter_offsets Offsets per string to delimiters in d_positions
+ */
+ __device__ size_type count_tokens(size_type idx,
+ size_type const* d_positions,
+ size_type const* d_delimiter_offsets) const
+ {
+ if (!is_valid(idx)) { return 0; }
+
+ auto const delim_size = d_delimiter.size_bytes();
+ auto const d_str = get_string(idx);
+ auto const d_str_end = d_str.data() + d_str.size_bytes();
+ auto const base_ptr = get_base_ptr() + delim_size - 1;
+ auto const delimiters =
+ cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
+                                    d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+
+ size_type token_count = 1; // all strings will have at least one token
+ size_type last_pos = delimiters[0] - delim_size;
+ for (auto d_pos : delimiters) {
+ // delimiter must fit in string && overlapping delimiters are ignored
+ if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) {
+ ++token_count;
+ last_pos = d_pos;
+ }
+ }
+ // number of tokens is capped to max_tokens
+ return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count;
+ }
+
+ /**
+ * @brief This will create tokens around each delimiter honoring the string boundaries
+ * in which the delimiter resides
+ *
+ * Each token is placed in `d_all_tokens` so they align consecutively
+ * with other tokens for the same output column.
+ *
+ * The actual token extraction is performed in the subclass process_tokens() function.
+ *
+ * @param idx Index of the string to tokenize
+ * @param d_tokens_offsets Token offsets for each string
+ * @param d_positions The beginning byte position of each delimiter
+ * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string
+ * @param d_all_tokens All output tokens for the strings column
+ */
+ __device__ void get_tokens(size_type idx,
+ size_type const* d_tokens_offsets,
+ size_type const* d_positions,
+ size_type const* d_delimiter_offsets,
+ string_index_pair* d_all_tokens) const
+ {
+ auto const d_tokens = // this string's tokens output
+ cudf::device_span<string_index_pair>(d_all_tokens + d_tokens_offsets[idx],
+ d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]);
+
+ if (!is_valid(idx)) { return; }
+
+ auto const d_str = get_string(idx);
+
+ // max_tokens already included in token counts
+ if (d_tokens.size() == 1) {
+ d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()};
+ return;
+ }
+
+ auto const delimiters =
+ cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
+ d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+
+ auto& derived = static_cast<Derived const&>(*this);
+ derived.process_tokens(d_str, delimiters, d_tokens);
+ }
+
+ base_split_tokenizer(column_device_view const& d_strings,
+ string_view const& d_delimiter,
+ size_type max_tokens)
+ : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens)
+ {
+ }
+
+ protected:
+ column_device_view const d_strings; // strings to split
+ string_view const d_delimiter; // delimiter for split
+ size_type max_tokens; // maximum number of tokens to identify
+};
+
+/**
+ * @brief The tokenizer functions for forward splitting
+ */
+struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
+ /**
+ * @brief This will create tokens around each delimiter honoring the string boundaries
+ *
+ * The tokens are processed from the beginning of each string ignoring overlapping
+ * delimiters and honoring the `max_tokens` value.
+ *
+ * @param d_str String to tokenize
+ * @param d_delimiters Positions of delimiters for this string
+ * @param d_tokens Output vector to store tokens for this string
+ */
+ __device__ void process_tokens(string_view const d_str,
+ device_span<size_type const> d_delimiters,
+ device_span<string_index_pair> d_tokens) const
+ {
+ auto const base_ptr = get_base_ptr(); // d_positions values based on this
+ auto str_ptr = d_str.data();
+ auto const str_end = str_ptr + d_str.size_bytes(); // end of the string
+ auto const token_count = static_cast<size_type>(d_tokens.size());
+ auto const delim_size = d_delimiter.size_bytes();
+
+ // build the index-pair of each token for this string
+ size_type token_idx = 0;
+ for (auto d_pos : d_delimiters) {
+ auto const next_delim = base_ptr + d_pos;
+ if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; }
+ auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end;
+
+ // store the token into the output vector
+ d_tokens[token_idx++] =
+ string_index_pair{str_ptr, static_cast<size_type>(thrust::distance(str_ptr, end_ptr))};
+
+ // setup for next token
+ str_ptr = end_ptr + delim_size;
+ }
+ // include anything leftover
+ if (token_idx < token_count) {
+ d_tokens[token_idx] =
+ string_index_pair{str_ptr, static_cast<size_type>(thrust::distance(str_ptr, str_end))};
+ }
+ }
+
+ split_tokenizer_fn(column_device_view const& d_strings,
+ string_view const& d_delimiter,
+ size_type max_tokens)
+ : base_split_tokenizer(d_strings, d_delimiter, max_tokens)
+ {
+ }
+};
+
+/**
+ * @brief The tokenizer functions for backwards splitting
+ *
+ * Same as split_tokenizer_fn except delimiters are searched from the end of each string.
+ */
+struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
+ /**
+ * @brief This will create tokens around each delimiter honoring the string boundaries
+ *
+ * The tokens are processed from the end of each string ignoring overlapping
+ * delimiters and honoring the `max_tokens` value.
+ *
+ * @param d_str String to tokenize
+ * @param d_delimiters Positions of delimiters for this string
+ * @param d_tokens Output vector to store tokens for this string
+ */
+ __device__ void process_tokens(string_view const d_str,
+ device_span<size_type const> d_delimiters,
+ device_span<string_index_pair> d_tokens) const
+ {
+ auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr
+ auto const str_begin = d_str.data(); // beginning of the string
+ auto const token_count = static_cast<size_type>(d_tokens.size());
+ auto const delim_count = static_cast<size_type>(d_delimiters.size());
+ auto const delim_size = d_delimiter.size_bytes();
+
+ // build the index-pair of each token for this string
+ auto str_ptr = str_begin + d_str.size_bytes();
+ size_type token_idx = 0;
+ for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left
+ auto const prev_delim = base_ptr + d_delimiters[d] + delim_size;
+ if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; }
+ auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin;
+
+ // store the token into the output vector right-to-left
+ d_tokens[token_count - token_idx - 1] =
+ string_index_pair{start_ptr, static_cast<size_type>(thrust::distance(start_ptr, str_ptr))};
+
+ // setup for next token
+ str_ptr = start_ptr - delim_size;
+ ++token_idx;
+ }
+ // include anything leftover (rightover?)
+ if (token_idx < token_count) {
+ d_tokens[0] =
+ string_index_pair{str_begin, static_cast<size_type>(thrust::distance(str_begin, str_ptr))};
+ }
+ }
+
+ rsplit_tokenizer_fn(column_device_view const& d_strings,
+ string_view const& d_delimiter,
+ size_type max_tokens)
+ : base_split_tokenizer(d_strings, d_delimiter, max_tokens)
+ {
+ }
+};
+
+/**
+ * @brief Helper function used by split/rsplit and split_record/rsplit_record
+ *
+ * This function returns all the token/split positions within the input column as processed by
+ * the given tokenizer. It also returns the offsets for each set of tokens identified per string.
+ *
+ * @tparam Tokenizer Type of the tokenizer object
+ *
+ * @param input The input column of strings to split
+ * @param tokenizer Object used for counting and identifying delimiters and tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned objects' device memory.
+ */
+template <typename Tokenizer>
+std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
+ strings_column_view const& input,
+ Tokenizer tokenizer,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
+{
+ auto const strings_count = input.size();
+ auto const chars_bytes =
+ cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
+ cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);
+
+ auto d_offsets = input.offsets_begin();
+
+ // count the number of delimiters in the entire column
+ auto const delimiter_count =
+ thrust::count_if(rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0),
+ thrust::make_counting_iterator(chars_bytes),
+ [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
+ return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
+ });
+ // Create a vector of every delimiter position in the chars column.
+ // These may include overlapping or otherwise out-of-bounds delimiters which
+ // will be resolved during token processing.
+ auto delimiter_positions = rmm::device_uvector<size_type>(delimiter_count, stream);
+ auto d_positions = delimiter_positions.data();
+ auto const copy_end =
+ thrust::copy_if(rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0),
+ thrust::make_counting_iterator(chars_bytes),
+ delimiter_positions.begin(),
+ [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
+ return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
+ });
+
+ // create a vector of offsets to each string's delimiter set within delimiter_positions
+ auto const delimiter_offsets = [&] {
+ // first, create a vector of string indices for each delimiter
+ auto string_indices = rmm::device_uvector<size_type>(delimiter_count, stream);
+ thrust::upper_bound(rmm::exec_policy(stream),
+ d_offsets,
+ d_offsets + strings_count,
+ delimiter_positions.begin(),
+ copy_end,
+ string_indices.begin());
+
+ // compute delimiter offsets per string
+ auto delimiter_offsets = rmm::device_uvector<size_type>(strings_count + 1, stream);
+ auto d_delimiter_offsets = delimiter_offsets.data();
+
+ // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters
+ CUDF_CUDA_TRY(cudaMemsetAsync(
+ d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value()));
+
+ // next, count the number of delimiters per string
+ auto d_string_indices = string_indices.data(); // identifies strings with delimiters only
+ thrust::for_each_n(rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0),
+ delimiter_count,
+ [d_string_indices, d_delimiter_offsets] __device__(size_type idx) {
+ auto const str_idx = d_string_indices[idx] - 1;
+ atomicAdd(d_delimiter_offsets + str_idx, 1);
+ });
+ // finally, convert the delimiter counts into offsets
+ thrust::exclusive_scan(rmm::exec_policy(stream),
+ delimiter_offsets.begin(),
+ delimiter_offsets.end(),
+ delimiter_offsets.begin());
+ return delimiter_offsets;
+ }();
+ auto const d_delimiter_offsets = delimiter_offsets.data();
+
+ // compute the number of tokens per string
+ auto token_counts = rmm::device_uvector<size_type>(strings_count, stream);
+ thrust::transform(
+ rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0),
+ thrust::make_counting_iterator(strings_count),
+ token_counts.begin(),
+ [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type {
+ return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets);
+ });
+
+ // create offsets from the counts for return to the caller
+ auto offsets = std::get<0>(
+ cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr));
+ auto const total_tokens =
+ cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
+ auto const d_tokens_offsets = offsets->view().data<size_type>();
+
+ // build a vector of all the token positions for all the strings
+ auto tokens = rmm::device_uvector<string_index_pair>(total_tokens, stream);
+ auto d_tokens = tokens.data();
+ thrust::for_each_n(
+ rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0),
+ strings_count,
+ [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__(
+ size_type idx) {
+ tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens);
+ });
+
+ return std::make_pair(std::move(offsets), std::move(tokens));
+}
+
+} // namespace cudf::strings::detail
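
Note on count_tokens/process_tokens above: a delimiter hit only counts when it fits entirely inside the string and does not overlap a previously accepted hit, which is what produces results such as "y::::z" -> ["y", "", "z"] for delimiter "::" in the tests further below. A standalone host-side restatement of that counting rule (a sketch, not the device code):

#include <cstdio>
#include <string>

// A delimiter occurrence only counts if it fits in the string and does
// not overlap the previously accepted occurrence.
int count_tokens(std::string const& s, std::string const& delim)
{
  int tokens    = 1;
  long last_end = -1;  // end of the previously accepted delimiter
  for (std::size_t pos = 0; pos + delim.size() <= s.size(); ++pos) {
    if (s.compare(pos, delim.size(), delim) == 0 && static_cast<long>(pos) >= last_end) {
      ++tokens;
      last_end = static_cast<long>(pos + delim.size());
    }
  }
  return tokens;
}

int main()
{
  std::printf("%d\n", count_tokens("y::::z", "::"));   // 3 -> {"y", "", "z"}
  std::printf("%d\n", count_tokens(":::c:::", "::"));  // 3 -> {"", ":c", ":"}
  return 0;
}
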
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index d935ad0b1da..5b79fdefb5a 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include "split.cuh"
+
#include
#include
#include
@@ -23,14 +25,12 @@
#include
#include
#include
-#include
#include
#include
#include
#include
-#include
#include
#include
@@ -38,108 +38,43 @@ namespace cudf {
namespace strings {
namespace detail {
-using string_index_pair = thrust::pair<const char*, size_type>;
-
namespace {
-enum class Dir { FORWARD, BACKWARD };
-
-/**
- * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
- *
- * The number of tokens is the same regardless if counting from the beginning
- * or the end of the string.
- */
-struct token_counter_fn {
- column_device_view const d_strings; // strings to split
- string_view const d_delimiter; // delimiter for split
- size_type const max_tokens = std::numeric_limits<size_type>::max();
-
- __device__ size_type operator()(size_type idx) const
- {
- if (d_strings.is_null(idx)) { return 0; }
-
- auto const d_str = d_strings.element<string_view>(idx);
- size_type token_count = 0;
- size_type start_pos = 0;
- while (token_count < max_tokens - 1) {
- auto const delimiter_pos = d_str.find(d_delimiter, start_pos);
- if (delimiter_pos == string_view::npos) break;
- token_count++;
- start_pos = delimiter_pos + d_delimiter.length();
- }
- return token_count + 1; // always at least one token
- }
-};
-
-/**
- * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
- */
-template <Dir dir>
-struct token_reader_fn {
- column_device_view const d_strings; // strings to split
- string_view const d_delimiter; // delimiter for split
- int32_t* d_token_offsets{}; // for locating tokens in d_tokens
- string_index_pair* d_tokens{};
-
- __device__ string_index_pair resolve_token(string_view const& d_str,
- size_type start_pos,
- size_type end_pos,
- size_type delimiter_pos) const
- {
- if (dir == Dir::FORWARD) {
- auto const byte_offset = d_str.byte_offset(start_pos);
- return string_index_pair{d_str.data() + byte_offset,
- d_str.byte_offset(delimiter_pos) - byte_offset};
- } else {
- auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length());
- return string_index_pair{d_str.data() + byte_offset,
- d_str.byte_offset(end_pos) - byte_offset};
- }
+template <typename Tokenizer>
+std::unique_ptr<column> split_record_fn(strings_column_view const& input,
+ Tokenizer tokenizer,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
+{
+ if (input.is_empty()) { return make_empty_column(type_id::LIST); }
+ if (input.size() == input.null_count()) {
+ auto offsets = std::make_unique<column>(input.offsets(), stream, mr);
+ auto results = make_empty_column(type_id::STRING);
+ return make_lists_column(input.size(),
+ std::move(offsets),
+ std::move(results),
+ input.null_count(),
+ copy_bitmask(input.parent(), stream, mr),
+ stream,
+ mr);
}
- __device__ void operator()(size_type idx)
- {
- if (d_strings.is_null(idx)) { return; }
+ // builds the offsets and the vector of all tokens
+ auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
- auto const token_offset = d_token_offsets[idx];
- auto const token_count = d_token_offsets[idx + 1] - token_offset;
- auto d_result = d_tokens + token_offset;
- auto const d_str = d_strings.element<string_view>(idx);
- if (d_str.empty()) {
- // Pandas str.split("") for non-whitespace delimiter is an empty string
- *d_result = string_index_pair{"", 0};
- return;
- }
+ // build a strings column from the tokens
+ auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
- size_type token_idx = 0;
- size_type start_pos = 0; // updates only if moving forward
- size_type end_pos = d_str.length(); // updates only if moving backward
- while (token_idx < token_count - 1) {
- auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos)
- : d_str.rfind(d_delimiter, start_pos, end_pos);
- if (delimiter_pos == string_view::npos) break;
- auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos);
- if (dir == Dir::FORWARD) {
- d_result[token_idx] = token;
- start_pos = delimiter_pos + d_delimiter.length();
- } else {
- d_result[token_count - 1 - token_idx] = token;
- end_pos = delimiter_pos;
- }
- token_idx++;
- }
+ return make_lists_column(input.size(),
+ std::move(offsets),
+ std::move(strings_child),
+ input.null_count(),
+ copy_bitmask(input.parent(), stream, mr),
+ stream,
+ mr);
+}
- // set last token to remainder of the string
- if (dir == Dir::FORWARD) {
- auto const offset_bytes = d_str.byte_offset(start_pos);
- d_result[token_idx] =
- string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes};
- } else {
- d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)};
- }
- }
-};
+enum class Dir { FORWARD, BACKWARD };
/**
* @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
@@ -196,7 +131,7 @@ struct whitespace_token_reader_fn {
whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD);
size_type token_idx = 0;
position_pair token{0, 0};
- if (dir == Dir::FORWARD) {
+ if constexpr (dir == Dir::FORWARD) {
while (tokenizer.next_token() && (token_idx < token_count)) {
token = tokenizer.get_token();
d_result[token_idx++] =
@@ -224,11 +159,11 @@ struct whitespace_token_reader_fn {
// The output is one list item per string
template <typename TokenCounter, typename TokenReader>
-std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
- TokenCounter counter,
- TokenReader reader,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> whitespace_split_record_fn(strings_column_view const& strings,
+ TokenCounter counter,
+ TokenReader reader,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
// create offsets column by counting the number of tokens per string
auto strings_count = strings.size();
@@ -244,7 +179,7 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
// last entry is the total number of tokens to be generated
- auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
+ auto total_tokens = cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
// split each string into an array of index-pair values
rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
reader.d_token_offsets = d_offsets;
@@ -277,18 +212,21 @@ std::unique_ptr<column> split_record(strings_column_view const& strings,
auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream);
if (delimiter.size() == 0) {
- return split_record_fn(strings,
- whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens},
- whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens},
- stream,
- mr);
+ return whitespace_split_record_fn(
+ strings,
+ whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens},
+ whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens},
+ stream,
+ mr);
} else {
string_view d_delimiter(delimiter.data(), delimiter.size());
- return split_record_fn(strings,
- token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens},
- token_reader_fn{*d_strings_column_ptr, d_delimiter},
- stream,
- mr);
+ if (dir == Dir::FORWARD) {
+ return split_record_fn(
+ strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr);
+ } else {
+ return split_record_fn(
+ strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr);
+ }
}
}
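
With this change the delimiter-based record splits share the split.cuh tokenizers with split()/rsplit(); only the whitespace paths keep their dedicated counter/reader functors. For orientation, a sketch of the public entry point exercised by the tests below (the wrapper function is illustrative):

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// e.g. "w:::x" -> ["w", ":x"] and ":::c:::" -> ["", ":c", ":"]
std::unique_ptr<cudf::column> split_on_double_colon(cudf::strings_column_view const& input)
{
  return cudf::strings::split_record(input, cudf::string_scalar("::"));
}
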
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 21752196430..48f69e3ecd3 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -357,6 +357,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest {
auto type() { return cudf::data_type{cudf::type_to_id<TypeParam>()}; }
};
+template <typename T>
+struct ParquetReaderSourceTest : public ParquetReaderTest {
+};
+
// Declare typed test cases
// TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
@@ -369,6 +373,8 @@ using SupportedTimestampTypes =
cudf::test::Types<cudf::timestamp_ms, cudf::timestamp_us, cudf::timestamp_ns>;
TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes);
TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes);
+using ByteLikeTypes = cudf::test::Types<int8_t, char, uint8_t, unsigned char, std::byte>;
+TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes);
// Base test fixture for chunked writer tests
struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {
@@ -5113,4 +5119,72 @@ TEST_P(ParquetSizedTest, DictionaryTest)
EXPECT_EQ(nbits, GetParam());
}
+TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes)
+{
+ using T = TypeParam;
+
+ srand(31337);
+ auto table = create_random_fixed_table<int>(5, 5, true);
+
+ std::vector<char> out_buffer;
+ cudf::io::parquet_writer_options out_opts =
+ cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table);
+ cudf::io::write_parquet(out_opts);
+
+ {
+ cudf::io::parquet_reader_options in_opts =
+ cudf::io::parquet_reader_options::builder(cudf::io::source_info(
+ cudf::host_span<T>(reinterpret_cast<T*>(out_buffer.data()), out_buffer.size())));
+ const auto result = cudf::io::read_parquet(in_opts);
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view());
+ }
+
+ {
+ cudf::io::parquet_reader_options in_opts =
+ cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span<T const>(
+   reinterpret_cast<T const*>(out_buffer.data()), out_buffer.size())));
+ const auto result = cudf::io::read_parquet(in_opts);
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view());
+ }
+}
+
+TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes)
+{
+ using T = TypeParam;
+
+ srand(31337);
+ auto table = create_random_fixed_table<int>(5, 5, true);
+
+ std::vector<char> out_buffer;
+ cudf::io::parquet_writer_options out_opts =
+ cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table);
+ cudf::io::write_parquet(out_opts);
+
+ auto full_table = cudf::concatenate(std::vector<table_view>({*table, *table}));
+
+ {
+ auto spans = std::vector<cudf::host_span<T>>{
+   cudf::host_span<T>(reinterpret_cast<T*>(out_buffer.data()), out_buffer.size()),
+   cudf::host_span<T>(reinterpret_cast<T*>(out_buffer.data()), out_buffer.size())};
+ cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder(
+ cudf::io::source_info(cudf::host_span<cudf::host_span<T>>(spans.data(), spans.size())));
+ const auto result = cudf::io::read_parquet(in_opts);
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view());
+ }
+
+ {
+ auto spans = std::vector<cudf::host_span<T const>>{
+   cudf::host_span<T const>(reinterpret_cast<T const*>(out_buffer.data()), out_buffer.size()),
+   cudf::host_span<T const>(reinterpret_cast<T const*>(out_buffer.data()), out_buffer.size())};
+ cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder(
+ cudf::io::source_info(cudf::host_span<cudf::host_span<T const>>(spans.data(), spans.size())));
+ const auto result = cudf::io::read_parquet(in_opts);
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view());
+ }
+}
+
CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 73d5adab427..714c1ad416a 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
}
+TEST_F(StringsSplitTest, MultiByteDelimiters)
+{
+ // Overlapping delimiters
+ auto input =
+ cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"});
+ auto view = cudf::strings_column_view(input);
+ using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+ {
+ auto result = cudf::strings::split_record(view, cudf::string_scalar("::"));
+ auto expected_left = LCW({LCW{"u", ""},
+ LCW{"w", ":x"},
+ LCW{"y", "", "z"},
+ LCW{"", "a"},
+ LCW{"", ":b"},
+ LCW{"", ":c", ":"}});
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left);
+ result = cudf::strings::rsplit_record(view, cudf::string_scalar("::"));
+ auto expected_right = LCW({LCW{"u", ""},
+ LCW{"w:", "x"},
+ LCW{"y", "", "z"},
+ LCW{"", "a"},
+ LCW{":", "b"},
+ LCW{":", "c:", ""}});
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right);
+ }
+ {
+ auto result = cudf::strings::split(view, cudf::string_scalar("::"));
+
+ auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""});
+ auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"});
+ auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1});
+ std::vector<std::unique_ptr<cudf::column>> expected_columns;
+ expected_columns.push_back(c0.release());
+ expected_columns.push_back(c1.release());
+ expected_columns.push_back(c2.release());
+ auto expected_left = std::make_unique<cudf::table>(std::move(expected_columns));
+ CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left);
+
+ result = cudf::strings::rsplit(view, cudf::string_scalar("::"));
+
+ c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"});
+ c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"});
+ c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1});
+ expected_columns.push_back(c0.release());
+ expected_columns.push_back(c1.release());
+ expected_columns.push_back(c2.release());
+ auto expected_right = std::make_unique<cudf::table>(std::move(expected_columns));
+ CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right);
+ }
+
+ // Delimiters that span across adjacent strings
+ input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"});
+ view = cudf::strings_column_view(input);
+ {
+ auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{"));
+ auto expected = LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}});
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+ result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{"));
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+ }
+ {
+ auto result = cudf::strings::split(view, cudf::string_scalar("}:{"));
+
+ auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"});
+ auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1});
+ std::vector<std::unique_ptr<cudf::column>> expected_columns;
+ expected_columns.push_back(c0.release());
+ expected_columns.push_back(c1.release());
+ auto expected = std::make_unique<cudf::table>(std::move(expected_columns));
+ CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+
+ result = cudf::strings::rsplit(view, cudf::string_scalar("}:{"));
+ CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+ }
+}
+
TEST_F(StringsSplitTest, SplitRegex)
{
std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""};
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index db64dcb08c7..937077c89c9 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -1045,8 +1045,8 @@ void decimal128Cv() {
BigInteger bigInteger2 = new BigInteger("14");
BigInteger bigInteger3 = new BigInteger("152345742357340573405745");
final BigInteger[] bigInts = new BigInteger[] {bigInteger1, bigInteger2, bigInteger3};
- try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts)) {
- HostColumnVector hostColumnVector = v.copyToHost();
+ try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts);
+ HostColumnVector hostColumnVector = v.copyToHost()) {
assertEquals(bigInteger1, hostColumnVector.getBigDecimal(0).unscaledValue());
assertEquals(bigInteger2, hostColumnVector.getBigDecimal(1).unscaledValue());
assertEquals(bigInteger3, hostColumnVector.getBigDecimal(2).unscaledValue());
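
(The fix above moves copyToHost() into the try-with-resources header so the HostColumnVector is closed even when an assertion throws, instead of leaking host memory.)
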
diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
index 86c340bb321..f4b652a7d03 100644
--- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
@@ -1,6 +1,6 @@
/*
*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ public void testDoubleClose() {
}
@Test
- public void testIncRef() {
+ public void testIncRefAndDoubleFree() {
Scalar s = Scalar.fromNull(DType.INT32);
try (Scalar ignored1 = s) {
try (Scalar ignored2 = s.incRefCount()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 4f00bc7493d..c31bcf4f78d 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -244,7 +244,7 @@ void testOrderByWithNullsAndStrings() {
}
@Test
- void testTableCreationIncreasesRefCount() {
+ void testTableCreationIncreasesRefCountWithDoubleFree() {
//tests the Table increases the refcount on column vectors
assertThrows(IllegalStateException.class, () -> {
try (ColumnVector v1 = ColumnVector.build(DType.INT32, 5, Range.appendInts(5));
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
index ac058b1d9a1..c528eb69575 100644
--- a/python/cudf/CMakeLists.txt
+++ b/python/cudf/CMakeLists.txt
@@ -72,15 +72,6 @@ endif()
include(rapids-cython)
if(NOT cudf_FOUND)
- # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required
- # languages for the C++ project even if this project does not require those languages.
- include(rapids-cuda)
- rapids_cuda_init_architectures(cudf-python)
- enable_language(CUDA)
- # Since cudf only enables CUDA optionally we need to manually include the file that
- # rapids_cuda_init_architectures relies on `project` including.
- include("${CMAKE_PROJECT_cudf-python_INCLUDE}")
-
set(BUILD_TESTS OFF)
set(BUILD_BENCHMARKS OFF)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 965b413e84f..fb1bcf6d673 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -607,7 +607,7 @@ def _scatter_by_slice(
start, stop, step = key.indices(len(self))
if start >= stop:
return None
- num_keys = (stop - start) // step
+ num_keys = len(range(start, stop, step))
self._check_scatter_key_length(num_keys, value)
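
(The old expression undercounts whenever the slice length is not an exact multiple of the step: for start=1, stop=10, step=2 it gives (10 - 1) // 2 == 4, but the slice addresses five positions: 1, 3, 5, 7, 9. len(range(start, stop, step)) performs the correct ceiling division; the new test below covers exactly this target[1::2] case.)
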
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 6b2fb90e95b..d59226ee17a 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -347,3 +347,13 @@ def test_series_setitem_upcasting_string_value():
assert_eq(pd.Series([10, 0, 0], dtype=int), sr)
with pytest.raises(ValueError):
sr[0] = "non-integer"
+
+
+def test_scatter_by_slice_with_start_and_step():
+ source = pd.Series([1, 2, 3, 4, 5])
+ csource = cudf.from_pandas(source)
+ target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ctarget = cudf.from_pandas(target)
+ target[1::2] = source
+ ctarget[1::2] = csource
+ assert_eq(target, ctarget)
diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt
index ebf47ee8469..0c07236682f 100644
--- a/python/cudf/udf_cpp/CMakeLists.txt
+++ b/python/cudf/udf_cpp/CMakeLists.txt
@@ -18,8 +18,6 @@ include(rapids-cmake)
include(rapids-cpm)
include(rapids-find)
-rapids_cuda_init_architectures(udf-cpp)
-
rapids_cpm_init()
rapids_find_package(