From b1aa5d2027490de7a5e75d3b2375620a15ace515 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Fri, 17 Feb 2023 11:24:57 -0600 Subject: [PATCH 1/6] Fix a bug with `num_keys` in `_scatter_by_slice` (#12749) This PR closes https://github.com/rapidsai/cudf/issues/12748 by changing the `num_keys` computation in `column._scatter_by_slice`. Authors: - H. Thomson Comer (https://github.com/thomcom) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12749 --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/tests/test_setitem.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 965b413e84f..fb1bcf6d673 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -607,7 +607,7 @@ def _scatter_by_slice( start, stop, step = key.indices(len(self)) if start >= stop: return None - num_keys = (stop - start) // step + num_keys = len(range(start, stop, step)) self._check_scatter_key_length(num_keys, value) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 6b2fb90e95b..d59226ee17a 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -347,3 +347,13 @@ def test_series_setitem_upcasting_string_value(): assert_eq(pd.Series([10, 0, 0], dtype=int), sr) with pytest.raises(ValueError): sr[0] = "non-integer" + + +def test_scatter_by_slice_with_start_and_step(): + source = pd.Series([1, 2, 3, 4, 5]) + csource = cudf.from_pandas(source) + target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + ctarget = cudf.from_pandas(target) + target[1::2] = source + ctarget[1::2] = csource + assert_eq(target, ctarget) From ec8704a45c93915067f987e35bfb37bfec2e05ae Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 17 Feb 2023 15:27:57 -0600 Subject: [PATCH 2/6] Fix a leak in a test 
and clarify some test names (#12781) Fix a small leak of host memory in a java unit test. Also updates some tests that verify that a double free is safe, but don't make it clear from the logs that the double free is expected to be there. This is so we don't have to spend too much time debugging if this double free is expected or not. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Kuhu Shukla (https://github.com/kuhushukla) - Jim Brennan (https://github.com/jbrennan333) - Jason Lowe (https://github.com/jlowe) - Raza Jafri (https://github.com/razajafri) - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/12781 --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 4 ++-- java/src/test/java/ai/rapids/cudf/ScalarTest.java | 4 ++-- java/src/test/java/ai/rapids/cudf/TableTest.java | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index db64dcb08c7..937077c89c9 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1045,8 +1045,8 @@ void decimal128Cv() { BigInteger bigInteger2 = new BigInteger("14"); BigInteger bigInteger3 = new BigInteger("152345742357340573405745"); final BigInteger[] bigInts = new BigInteger[] {bigInteger1, bigInteger2, bigInteger3}; - try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts)) { - HostColumnVector hostColumnVector = v.copyToHost(); + try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts); + HostColumnVector hostColumnVector = v.copyToHost()) { assertEquals(bigInteger1, hostColumnVector.getBigDecimal(0).unscaledValue()); assertEquals(bigInteger2, hostColumnVector.getBigDecimal(1).unscaledValue()); assertEquals(bigInteger3, hostColumnVector.getBigDecimal(2).unscaledValue()); diff --git 
a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 86c340bb321..f4b652a7d03 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,7 @@ public void testDoubleClose() { } @Test - public void testIncRef() { + public void testIncRefAndDoubleFree() { Scalar s = Scalar.fromNull(DType.INT32); try (Scalar ignored1 = s) { try (Scalar ignored2 = s.incRefCount()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4f00bc7493d..c31bcf4f78d 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -244,7 +244,7 @@ void testOrderByWithNullsAndStrings() { } @Test - void testTableCreationIncreasesRefCount() { + void testTableCreationIncreasesRefCountWithDoubleFree() { //tests the Table increases the refcount on column vectors assertThrows(IllegalStateException.class, () -> { try (ColumnVector v1 = ColumnVector.build(DType.INT32, 5, Range.appendInts(5)); From 94bbc82a117a96979d1c5d7949ba213ade88c3be Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Sun, 19 Feb 2023 11:51:52 -0500 Subject: [PATCH 3/6] Add build metrics report as artifact to cpp-build workflow (#12750) Adds the Build Metrics Report into the GitHub Actions workflow by uploading a link to the generated report to the CI downloads S3 and publishing a link to the report in the build output. A follow-on PR will provide a more direct mechanism for locating the report link. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12750 --- build.sh | 8 +++++--- ci/build_cpp.sh | 28 +++++++++++++++++++++++++++- ci/test_cpp.sh | 16 ---------------- conda/recipes/libcudf/meta.yaml | 1 + 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/build.sh b/build.sh index d9e088c765e..22a62df7182 100755 --- a/build.sh +++ b/build.sh @@ -315,9 +315,11 @@ if buildAll || hasArg libcudf; then LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}') MSG="${MSG}
libcudf.so size: $LIBCUDF_FS" fi - echo "$MSG" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html - cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log + BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"} + echo "Metrics output dir: [$BMR_DIR]" + mkdir -p ${BMR_DIR} + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3b45b3ce2e7..b68c2bdbef6 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. set -euo pipefail @@ -14,3 +14,29 @@ rapids-logger "Begin cpp build" rapids-mamba-retry mambabuild conda/recipes/libcudf rapids-upload-conda-to-s3 cpp + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then + ls -l ${RAPIDS_ARTIFACTS_DIR} +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 0be72486319..983a63d4ce9 100755 --- a/ci/test_cpp.sh +++ 
b/ci/test_cpp.sh @@ -66,21 +66,5 @@ for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do fi done -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then - rapids-logger "Memcheck gtests with rmm_mode=cuda" - export GTEST_CUDF_RMM_MODE=cuda - COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" - for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do - test_name=$(basename ${gt}) - if [[ "$test_name" == "ERROR_TEST" ]]; then - continue - fi - echo "Running gtest $test_name" - ${COMPUTE_SANITIZER_CMD} ${gt} | tee "${RAPIDS_TESTS_DIR}${test_name}.cs.log" - done - unset GTEST_CUDF_RMM_MODE - # TODO: test-results/*.cs.log are processed in CI -fi - rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index b0b86b427b7..fbfcf6e71a2 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -27,6 +27,7 @@ build: - SCCACHE_IDLE_TIMEOUT - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY + - RAPIDS_ARTIFACTS_DIR requirements: build: From c2f016122cf9fc769516dee785a9a68f44905693 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 20 Feb 2023 23:02:52 -0500 Subject: [PATCH 4/6] Changing `cudf::io::source_info` to use `cudf::host_span` in a non-breaking form (#12730) Closes #12576 This change converts `cudf::io::source_info` to take a `host_span`. This version deprecates the original API, but leaves it intact to avoid breaking changes. After being deprecated for a few releases, they will be removed. 
Authors: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12730 --- cpp/include/cudf/io/datasource.hpp | 10 ++++ cpp/include/cudf/io/types.hpp | 75 +++++++++++++++++++++++++++-- cpp/src/io/utilities/datasource.cpp | 8 ++- cpp/tests/io/parquet_test.cpp | 74 ++++++++++++++++++++++++++++ 4 files changed, 162 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index fd4c049e2fc..a0ef2155f7d 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -112,11 +112,21 @@ class datasource { /** * @brief Creates a source from a host memory buffer. * + # @deprecated Since 23.04 + * * @param[in] buffer Host buffer object * @return Constructed datasource object */ static std::unique_ptr create(host_buffer const& buffer); + /** + * @brief Creates a source from a host memory buffer. + * + * @param[in] buffer Host buffer object + * @return Constructed datasource object + */ + static std::unique_ptr create(cudf::host_span buffer); + /** * @brief Creates a source from a device memory buffer. * diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 06b52563e19..6f97eb768d9 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -150,6 +150,8 @@ struct table_with_metadata { /** * @brief Non-owning view of a host memory buffer * + * @deprecated Since 23.04 + * * Used to describe buffer input in `source_info` objects. */ struct host_buffer { @@ -166,6 +168,22 @@ struct host_buffer { host_buffer(const char* data, size_t size) : data(data), size(size) {} }; +/** + * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to + * bytes. 
+ * + * @tparam T The representation type + * @return `true` if the type is considered a byte-like type + */ +template +constexpr inline auto is_byte_like_type() +{ + using non_cv_T = std::remove_cv_t; + return std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v; +} + /** * @brief Source information for read interfaces */ @@ -191,21 +209,70 @@ struct source_info { /** * @brief Construct a new source info object for multiple buffers in host memory * + * @deprecated Since 23.04 + * * @param host_buffers Input buffers in host memory */ - explicit source_info(std::vector const& host_buffers) - : _type(io_type::HOST_BUFFER), _host_buffers(host_buffers) + explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const hb) { + return cudf::host_span{ + reinterpret_cast(hb.data), hb.size}; + }); } /** * @brief Construct a new source info object for a single buffer * + * @deprecated Since 23.04 + * * @param host_data Input buffer in host memory * @param size Size of the buffer */ explicit source_info(const char* host_data, size_t size) - : _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}}) + : _type(io_type::HOST_BUFFER), + _host_buffers( + {cudf::host_span(reinterpret_cast(host_data), size)}) + { + } + + /** + * @brief Construct a new source info object for multiple buffers in host memory + * + * @param host_buffers Input buffers in host memory + */ + template >())> + explicit source_info(cudf::host_span> const host_buffers) + : _type(io_type::HOST_BUFFER) + { + if constexpr (not std::is_same_v, std::byte>) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const s) { + return cudf::host_span{ + reinterpret_cast(s.data()), s.size()}; + }); 
+ } else { + _host_buffers.assign(host_buffers.begin(), host_buffers.end()); + } + } + + /** + * @brief Construct a new source info object for a single buffer + * + * @param host_data Input buffer in host memory + */ + template >())> + explicit source_info(cudf::host_span host_data) + : _type(io_type::HOST_BUFFER), + _host_buffers{cudf::host_span( + reinterpret_cast(host_data.data()), host_data.size())} { } @@ -289,7 +356,7 @@ struct source_info { private: io_type _type = io_type::FILEPATH; std::vector _filepaths; - std::vector _host_buffers; + std::vector> _host_buffers; std::vector> _device_buffers; std::vector _user_sources; }; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index c2f7b18d443..71d64900398 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -329,10 +329,16 @@ std::unique_ptr datasource::create(const std::string& filepath, } std::unique_ptr datasource::create(host_buffer const& buffer) +{ + return create( + cudf::host_span{reinterpret_cast(buffer.data), buffer.size}); +} + +std::unique_ptr datasource::create(cudf::host_span buffer) { // Use Arrow IO buffer class for zero-copy reads of host memory return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data), buffer.size)); + reinterpret_cast(buffer.data()), buffer.size())); } std::unique_ptr datasource::create(cudf::device_span buffer) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 21752196430..48f69e3ecd3 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -357,6 +357,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id()}; } }; +template +struct ParquetReaderSourceTest : public ParquetReaderTest { +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5352 using SupportedTypes = cudf::test::Types; @@ -369,6 +373,8 @@ using SupportedTimestampTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); +using ByteLikeTypes = cudf::test::Types; +TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { @@ -5113,4 +5119,72 @@ TEST_P(ParquetSizedTest, DictionaryTest) EXPECT_EQ(nbits, GetParam()); } +TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info( + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span( + reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } +} + +TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + auto full_table = cudf::concatenate(std::vector({*table, *table})); + + 
{ + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 7da233b279bf84a501e9c2e3041cbc6fb335e610 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 21 Feb 2023 08:12:54 -0500 Subject: [PATCH 5/6] Rework logic in cudf::strings::split_record to improve performance (#12729) Updates the `cudf::strings::split_record` logic to match the more optimized code in `cudf::strings::split`. The optimized code performs much better for longer strings (>64 bytes) by parallelizing over the character bytes to find delimiters before determining split tokens. This led to refactoring the code so both APIs can share the optimized code. Also fixes a bug found when using overlapped delimiters. Additional tests were added for multi-byte delimiters which can overlap and span multiple adjacent strings. 
Closes #12694 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/12729 --- cpp/benchmarks/string/split.cpp | 14 +- cpp/src/strings/split/split.cu | 448 ++------------------------ cpp/src/strings/split/split.cuh | 403 +++++++++++++++++++++++ cpp/src/strings/split/split_record.cu | 168 +++------- cpp/tests/strings/split_tests.cpp | 78 ++++- 5 files changed, 565 insertions(+), 546 deletions(-) create mode 100644 cpp/src/strings/split/split.cuh diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 0f005c462cc..1b3f4190680 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt) static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; + int constexpr min_rows = 1 << 12; + int constexpr max_rows = 1 << 24; + int constexpr row_mult = 8; + int constexpr min_rowlen = 1 << 5; + int constexpr max_rowlen = 1 << 13; + int constexpr len_mult = 2; for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index c11d7ad47f9..18599fb568a 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -31,14 +33,10 @@ #include #include -#include -#include -#include #include #include #include #include -#include #include #include @@ -46,321 +44,8 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -/** - * @brief Base class for delimiter-based tokenizers. - * - * These are common methods used by both split and rsplit tokenizer functors. 
- */ -struct base_split_tokenizer { - __device__ const char* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } - - __device__ string_view const get_string(size_type idx) const - { - return d_strings.element(idx); - } - - __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - - /** - * @brief Initialize token elements for all strings. - * - * The process_tokens() only handles creating tokens for strings that contain - * delimiters. This function will initialize the output tokens for all - * strings by assigning null entries for null and empty strings and the - * string itself for strings with no delimiters. - * - * The tokens are placed in output order so that all tokens for each output - * column are stored consecutively in `d_all_tokens`. - * - * @param idx Index of string in column - * @param column_count Number of columns in output - * @param d_all_tokens Tokens vector for all strings - */ - __device__ void init_tokens(size_type idx, - size_type column_count, - string_index_pair* d_all_tokens) const - { - auto d_tokens = d_all_tokens + idx; - if (is_valid(idx)) { - auto d_str = get_string(idx); - *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()}; - --column_count; - d_tokens += d_strings.size(); - } - // this is like fill() but output needs to be strided - for (size_type col = 0; col < column_count; ++col) - d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0}; - } - - base_split_tokenizer(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) - { - } - - protected: - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type max_tokens; -}; - -/** - * @brief The tokenizer functions for split(). 
- * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - */ -struct split_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. - * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The beginning byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, - size_type const* d_token_counts, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // the first delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* str_ptr = d_str.data(); // beginning of the string - const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto next_delim = ((idx + col) < positions_count) // boundary 
check for delims in last string - ? (base_ptr + d_positions[idx + col]) // start of next delimiter - : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string - && (col + 1 < token_count) // and this is not the last token - ? next_delim - : str_end_ptr; - // store the token into the output vector - d_tokens[col * d_strings.size()] = - string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; - // point past this delimiter - str_ptr = eptr + d_delimiter.size_bytes(); - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the start of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @param chars_bytes Total number of characters to process. - * @return true if delimiter is found starting at position `idx` - */ - __device__ bool is_delimiter(size_type idx, // chars index - int32_t const* d_offsets, - size_type chars_bytes) const - { - auto d_chars = get_base_ptr() + d_offsets[0]; - if (idx + d_delimiter.size_bytes() > chars_bytes) return false; - return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. 
- * - * @param idx Index of a delimiter - * @param d_positions Start positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, // delimiter index - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - string_view const d_str = get_string(str_idx - 1); - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; // re-count delimiters to compute the token-count - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are ignored too - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - split_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - * - * Same as split_tokenizer_fn except tokens are counted from the end of each string. 
- */ -struct rsplit_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * The tokens are processed from the end of each string so the `max_tokens` - * is honored correctly. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. - * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The ending byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, // delimiter position index - size_type const* d_token_counts, // token counts for each string - size_type const* d_positions, // end of each delimiter - size_type positions_count, // total number of delimiters - size_type const* d_indexes, // string indices for each delimiter - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx) - return; // the last delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* const str_begin_ptr = d_str.data(); // beginning of the string - const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string - // build 
the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto prev_delim = (idx >= col) // boundary check for delims in first string - ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter - : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string - && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; - // store the token into the output -- building the array backwards - d_tokens[d_strings.size() * (token_count - 1 - col)] = - string_index_pair{sptr, static_cast(str_ptr - sptr)}; - str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the end of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @return true if delimiter is found ending at position `idx` - */ - __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const - { - auto delim_length = d_delimiter.size_bytes(); - if (idx < delim_length - 1) return false; - auto d_chars = get_base_ptr() + d_offsets[0]; - return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. - * - * Token counting starts at the end of the string to honor the `max_tokens` - * appropriately. 
- * - * @param idx Index of a delimiter - * @param d_positions End positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound() - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are also ignored - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - rsplit_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - /** * @brief Generic split function called by split() and rsplit(). 
* @@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { * @return table of columns for the output of the split */ template -std::unique_ptr split_fn(strings_column_view const& strings_column, +std::unique_ptr
split_fn(strings_column_view const& input, Tokenizer tokenizer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { std::vector> results; - auto const strings_count = strings_column.size(); - if (strings_count == 0) { - results.push_back(make_empty_column(type_id::STRING)); + if (input.size() == input.null_count()) { + results.push_back(std::make_unique(input.parent(), stream, mr)); return std::make_unique
(std::move(results)); } - auto d_offsets = strings_column.offsets_begin(); - auto const chars_bytes = - cudf::detail::get_value( - strings_column.offsets(), strings_column.offset() + strings_count, stream) - - cudf::detail::get_value(strings_column.offsets(), strings_column.offset(), stream); + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); + auto const d_offsets = offsets->view().template data(); + auto const d_tokens = tokens.data(); - // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of every delimiter position in the chars column - rmm::device_uvector delimiter_positions(delimiter_count, stream); - auto d_positions = delimiter_positions.data(); - auto copy_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - delimiter_positions.begin(), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of string indices for each delimiter - rmm::device_uvector string_indices(delimiter_count, stream); // these will - auto d_string_indices = string_indices.data(); // be strings that only contain delimiters - thrust::upper_bound(rmm::exec_policy(stream), - d_offsets, - d_offsets + strings_count, - delimiter_positions.begin(), - copy_end, - string_indices.begin()); - - // compute the number of tokens per string - rmm::device_uvector token_counts(strings_count, stream); - auto d_token_counts = token_counts.data(); - // first, initialize token counts for strings without delimiters in them - 
thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_token_counts, - [tokenizer] __device__(size_type idx) { - // null are 0, all others 1 - return static_cast(tokenizer.is_valid(idx)); - }); - - // now compute the number of tokens in each string - thrust::for_each_n( + // compute the maximum number of tokens for any string + auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( - size_type idx) { - tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts); - }); - - // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); - // boundary case: if no columns, return one null column (custrings issue #119) - if (columns_count == 0) { - results.push_back(std::make_unique( - data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); - } + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; }, + 0, + thrust::maximum{}); - // create working area to hold all token positions - rmm::device_uvector tokens(columns_count * strings_count, stream); - string_index_pair* d_tokens = tokens.data(); - // initialize the token positions - // -- accounts for nulls, empty, and strings with no delimiter in them - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [tokenizer, columns_count, d_tokens] __device__(size_type idx) { - tokenizer.init_tokens(idx, columns_count, d_tokens); - }); - - // get 
the positions for every token using the delimiter positions - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, - d_token_counts, - d_positions, - delimiter_count, - d_string_indices, - d_tokens] __device__(size_type idx) { - tokenizer.process_tokens( - idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens); - }); - - // Create each column. - // - Each pair points to the strings for that column for each row. - // - Create the strings column from the vector using the strings factory. + // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { - auto column_tokens = d_tokens + (col * strings_count); - results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); + auto itr = cudf::detail::make_counting_transform_iterator( + 0, [d_tokens, d_offsets, col] __device__(size_type idx) { + auto const offset = d_offsets[idx]; + auto const token_count = d_offsets[idx + 1] - offset; + return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0}; + }); + results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh new file mode 100644 index 00000000000..41213dac58b --- /dev/null +++ b/cpp/src/strings/split/split.cuh @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::strings::detail { + +/** + * @brief Base class for delimiter-based tokenizers + * + * These are common methods used by both split and rsplit tokenizer functors. + * + * The Derived class is required to implement the `process_tokens` function. 
+ */ +template +struct base_split_tokenizer { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns `true` if the byte at `idx` is the start of the delimiter + * + * @param idx Index of a byte in the chars column + * @param d_offsets Offsets values to locate the chars ranges + * @param chars_bytes Total number of characters to process + * @return true if delimiter is found starting at position `idx` + */ + __device__ bool is_delimiter(size_type idx, + size_type const* d_offsets, + size_type chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_offsets[0]; + if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; } + return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; + } + + /** + * @brief This counts the tokens for strings that contain delimiters + * + * Counting tokens is the same regardless if counting from the left + * or from the right. This logic counts from the left which is simpler. + * The count will be truncated appropriately to the max_tokens value. 
+ * + * @param idx Index of input string + * @param d_positions Start positions of all the delimiters + * @param d_delimiter_offsets Offsets per string to delimiters in d_positions + */ + __device__ size_type count_tokens(size_type idx, + size_type const* d_positions, + size_type const* d_delimiter_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const delim_size = d_delimiter.size_bytes(); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr() + delim_size - 1; + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + size_type token_count = 1; // all strings will have at least one token + size_type last_pos = delimiters[0] - delim_size; + for (auto d_pos : delimiters) { + // delimiter must fit in string && overlapping delimiters are ignored + if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) { + ++token_count; + last_pos = d_pos; + } + } + // number of tokens is capped to max_tokens + return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count; + } + + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * in which the delimiter resides + * + * Each token is placed in `d_all_tokens` so they align consecutively + * with other tokens for the same output column. + * + * The actual token extraction is performed in the subclass process_tokens() function. 
+ * + * @param idx Index of the string to tokenize + * @param d_tokens_offsets Token offsets for each string + * @param d_positions The beginning byte position of each delimiter + * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string + * @param d_all_tokens All output tokens for the strings column + */ + __device__ void get_tokens(size_type idx, + size_type const* d_tokens_offsets, + size_type const* d_positions, + size_type const* d_delimiter_offsets, + string_index_pair* d_all_tokens) const + { + auto const d_tokens = // this string's tokens output + cudf::device_span(d_all_tokens + d_tokens_offsets[idx], + d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]); + + if (!is_valid(idx)) { return; } + + auto const d_str = get_string(idx); + + // max_tokens already included in token counts + if (d_tokens.size() == 1) { + d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + return; + } + + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + auto& derived = static_cast(*this); + derived.process_tokens(d_str, delimiters, d_tokens); + } + + base_split_tokenizer(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) + { + } + + protected: + column_device_view const d_strings; // strings to split + string_view const d_delimiter; // delimiter for split + size_type max_tokens; // maximum number of tokens to identify +}; + +/** + * @brief The tokenizer functions for forward splitting + */ +struct split_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the beginning of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto str_ptr = d_str.data(); + auto const str_end = str_ptr + d_str.size_bytes(); // end of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + size_type token_idx = 0; + for (auto d_pos : d_delimiters) { + auto const next_delim = base_ptr + d_pos; + if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; } + auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end; + + // store the token into the output vector + d_tokens[token_idx++] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, end_ptr))}; + + // setup for next token + str_ptr = end_ptr + delim_size; + } + // include anything leftover + if (token_idx < token_count) { + d_tokens[token_idx] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, str_end))}; + } + } + + split_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief The tokenizer functions for backwards splitting + * + * Same as split_tokenizer_fn except delimiters are searched from the end of each string. + */ +struct rsplit_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the end of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr + auto const str_begin = d_str.data(); // beginning of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_count = static_cast(d_delimiters.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + auto str_ptr = str_begin + d_str.size_bytes(); + size_type token_idx = 0; + for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left + auto const prev_delim = base_ptr + d_delimiters[d] + delim_size; + if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; } + auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin; + + // store the token into the output vector right-to-left + d_tokens[token_count - token_idx - 1] = + string_index_pair{start_ptr, static_cast(thrust::distance(start_ptr, str_ptr))}; + + // setup for next token + str_ptr = start_ptr - delim_size; + ++token_idx; + } + // include anything leftover (rightover?) + if (token_idx < token_count) { + d_tokens[0] = + string_index_pair{str_begin, static_cast(thrust::distance(str_begin, str_ptr))}; + } + } + + rsplit_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief Helper function used by split/rsplit and split_record/rsplit_record + * + * This function returns all the token/split positions within the input column as processed by + * the given tokenizer. It also returns the offsets for each set of tokens identified per string. 
+ * + * @tparam Tokenizer Type of the tokenizer object + * + * @param input The input column of strings to split + * @param tokenizer Object used for counting and identifying delimiters and tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned objects' device memory. + */ +template +std::pair, rmm::device_uvector> split_helper( + strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_offsets = input.offsets_begin(); + + // count the number of delimiters in the entire column + auto const delimiter_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + // Create a vector of every delimiter position in the chars column. + // These may include overlapping or otherwise out-of-bounds delimiters which + // will be resolved during token processing. 
+ auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto d_positions = delimiter_positions.data(); + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + delimiter_positions.begin(), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + + // create a vector of offsets to each string's delimiter set within delimiter_positions + auto const delimiter_offsets = [&] { + // first, create a vector of string indices for each delimiter + auto string_indices = rmm::device_uvector(delimiter_count, stream); + thrust::upper_bound(rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count, + delimiter_positions.begin(), + copy_end, + string_indices.begin()); + + // compute delimiter offsets per string + auto delimiter_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_delimiter_offsets = delimiter_offsets.data(); + + // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters + CUDF_CUDA_TRY(cudaMemsetAsync( + d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value())); + + // next, count the number of delimiters per string + auto d_string_indices = string_indices.data(); // identifies strings with delimiters only + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + delimiter_count, + [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_delimiter_offsets + str_idx, 1); + }); + // finally, convert the delimiter counts into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + delimiter_offsets.begin(), + delimiter_offsets.end(), + delimiter_offsets.begin()); + return delimiter_offsets; + }(); + auto const d_delimiter_offsets = delimiter_offsets.data(); + + // compute the number 
of tokens per string + auto token_counts = rmm::device_uvector(strings_count, stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + token_counts.begin(), + [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type { + return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets); + }); + + // create offsets from the counts for return to the caller + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr)); + auto const total_tokens = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_tokens_offsets = offsets->view().data(); + + // build a vector of all the token positions for all the strings + auto tokens = rmm::device_uvector(total_tokens, stream); + auto d_tokens = tokens.data(); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__( + size_type idx) { + tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens); + }); + + return std::make_pair(std::move(offsets), std::move(tokens)); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d935ad0b1da..5b79fdefb5a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "split.cuh" + #include #include #include @@ -23,14 +25,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -38,108 +38,43 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -enum class Dir { FORWARD, BACKWARD }; - -/** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. - * - * The number of tokens is the same regardless if counting from the beginning - * or the end of the string. - */ -struct token_counter_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type const max_tokens = std::numeric_limits::max(); - - __device__ size_type operator()(size_type idx) const - { - if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type start_pos = 0; - while (token_count < max_tokens - 1) { - auto const delimiter_pos = d_str.find(d_delimiter, start_pos); - if (delimiter_pos == string_view::npos) break; - token_count++; - start_pos = delimiter_pos + d_delimiter.length(); - } - return token_count + 1; // always at least one token - } -}; - -/** - * @brief Identify the tokens from the `idx'th` string element of `d_strings`. 
- */ -template -struct token_reader_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - int32_t* d_token_offsets{}; // for locating tokens in d_tokens - string_index_pair* d_tokens{}; - - __device__ string_index_pair resolve_token(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const - { - if (dir == Dir::FORWARD) { - auto const byte_offset = d_str.byte_offset(start_pos); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(delimiter_pos) - byte_offset}; - } else { - auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(end_pos) - byte_offset}; - } +template +std::unique_ptr split_record_fn(strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::LIST); } + if (input.size() == input.null_count()) { + auto offsets = std::make_unique(input.offsets(), stream, mr); + auto results = make_empty_column(type_id::STRING); + return make_lists_column(input.size(), + std::move(offsets), + std::move(results), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); } - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return; } + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); - auto const token_offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // Pandas str.split("") for non-whitespace delimiter is an empty string - *d_result = string_index_pair{"", 0}; - return; - } + // build a strings column from the tokens + auto 
strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr); - size_type token_idx = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward - while (token_idx < token_count - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos == string_view::npos) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - d_result[token_idx] = token; - start_pos = delimiter_pos + d_delimiter.length(); - } else { - d_result[token_count - 1 - token_idx] = token; - end_pos = delimiter_pos; - } - token_idx++; - } + return make_lists_column(input.size(), + std::move(offsets), + std::move(strings_child), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} - // set last token to remainder of the string - if (dir == Dir::FORWARD) { - auto const offset_bytes = d_str.byte_offset(start_pos); - d_result[token_idx] = - string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; - } else { - d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; - } - } -}; +enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. 
@@ -196,7 +131,7 @@ struct whitespace_token_reader_fn { whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; position_pair token{0, 0}; - if (dir == Dir::FORWARD) { + if constexpr (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { token = tokenizer.get_token(); d_result[token_idx++] = @@ -224,11 +159,11 @@ struct whitespace_token_reader_fn { // The output is one list item per string template -std::unique_ptr split_record_fn(strings_column_view const& strings, - TokenCounter counter, - TokenReader reader, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr whitespace_split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); @@ -244,7 +179,7 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); reader.d_token_offsets = d_offsets; @@ -277,18 +212,21 @@ std::unique_ptr split_record(strings_column_view const& strings, auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return split_record_fn(strings, - whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - stream, - mr); + return whitespace_split_record_fn( + strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + 
whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, - token_reader_fn{*d_strings_column_ptr, d_delimiter}, - stream, - mr); + if (dir == Dir::FORWARD) { + return split_record_fn( + strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } else { + return split_record_fn( + strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } } } diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 73d5adab427..714c1ad416a 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, MultiByteDelimiters) +{ + // Overlapping delimiters + auto input = + cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"}); + auto view = cudf::strings_column_view(input); + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("::")); + auto expected_left = LCW({LCW{"u", ""}, + LCW{"w", ":x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{"", ":b"}, + LCW{"", ":c", ":"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("::")); + auto expected_right = LCW({LCW{"u", ""}, + LCW{"w:", "x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{":", "b"}, + LCW{":", "c:", ""}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("::")); + + auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""}); + auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"}); + auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_left = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left); + + result = cudf::strings::rsplit(view, cudf::string_scalar("::")); + + c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"}); + c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"}); + c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1}); + expected_columns.push_back(c0.release()); + 
expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_right = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right); + } + + // Delimiters that span across adjacent strings + input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"}); + view = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{")); + auto expected = LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("}:{")); + + auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"}); + auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + auto expected = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + + result = cudf::strings::rsplit(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + } +} + TEST_F(StringsSplitTest, SplitRegex) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; From a308b24a478f4888ccce5d64b10a6107b97b7da9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Feb 2023 09:42:54 -0800 Subject: [PATCH 6/6] Remove now redundant cuda initialization (#12758) Prior to #11452 cuDF Python did not require CUDA for compilation. When libcudf is not found by CMake, however, it triggers a compilation of the C++ library, which does require CUDA for compilation. 
In order to support this behavior, we included some extra logic in cuDF's CMake to ensure that the appropriate CUDA architectures are compiled for (respecting the extra options like `RAPIDS` and `NATIVE` that `rapids-cmake` offers). However, with the merge of #11452 this conditional is now redundant because cuDF requires CUDA compilation unconditionally, so we can remove the extra code. Authors: - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12758 --- python/cudf/CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 7457b770b13..7210d398c6b 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -72,15 +72,6 @@ endif() include(rapids-cython) if(NOT cudf_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required - # languages for the C++ project even if this project does not require those languages. - include(rapids-cuda) - rapids_cuda_init_architectures(cudf-python) - enable_language(CUDA) - # Since cudf only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_cudf-python_INCLUDE}") - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF)