diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index b301c56a999..00000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/rapidsai/projects/51 - github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }} diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml deleted file mode 100644 index cf9b0c379f1..00000000000 --- a/.github/workflows/new-issues-to-triage-projects.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Auto Assign New Issues to Triage Project - -on: - issues: - types: [opened] - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - -jobs: - assign_one_project: - runs-on: ubuntu-latest - name: Assign to New Issues to Triage Project - steps: - - name: Process bug issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process feature issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process other issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request')) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d4abc28cf13..9fb991f9075 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9a98e400e6d..9ba0dd8dc38 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 25b3f19de77..b1f5b083e06 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -22,6 +22,9 @@ gbench_version: gtest_version: - ">=1.13.0" +aws_sdk_cpp_version: + - "<1.11" + libarrow_version: - "=12" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..28357f0d96d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,6 +74,7 @@ requirements: - gtest {{ gtest_version }} - gmock {{ gtest_version }} - zlib {{ zlib_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} outputs: - name: libcudf @@ -107,6 +108,7 @@ outputs: - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} test: commands: - test -f $PREFIX/lib/libcudf.so diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 0319577f6b9..f3fd5cc5729 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); + auto const separator = cudf::string_scalar("_"); for (auto _ : state) { cuda_event_timer raii(state, true); switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 423fe667b05..b556a84c541 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state) auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); }); } else if (tokenize_type == "ngrams") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + auto const delimiter = cudf::string_scalar(""); + auto const separator = cudf::string_scalar("_"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator); + }); } else if (tokenize_type == "characters") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 4731c4919e3..6532dae3695 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -16,14 +16,13 @@ #pragma once +#include + #include #include #include #include -#include -#include -#include #include #include #include @@ -32,193 +31,6 @@ namespace cudf { namespace detail { -/** - * @brief The base class for the input or output index normalizing iterator. - * - * This implementation uses CRTP to define the `input_indexalator` and the - * `output_indexalator` classes. This is so this class can manipulate the - * uniquely typed subclass member variable `p_` directly without requiring - * virtual functions since iterator instances will be copied to device memory. - * - * The base class mainly manages updating the `p_` member variable while the - * subclasses handle accessing individual elements in device memory. - * - * @tparam T The derived class type for the iterator. - */ -template -struct base_indexalator { - using difference_type = ptrdiff_t; - using value_type = size_type; - using pointer = size_type*; - using iterator_category = std::random_access_iterator_tag; - - base_indexalator() = default; - base_indexalator(base_indexalator const&) = default; - base_indexalator(base_indexalator&&) = default; - base_indexalator& operator=(base_indexalator const&) = default; - base_indexalator& operator=(base_indexalator&&) = default; - - /** - * @brief Prefix increment operator. - */ - CUDF_HOST_DEVICE inline T& operator++() - { - T& derived = static_cast(*this); - derived.p_ += width_; - return derived; - } - - /** - * @brief Postfix increment operator. - */ - CUDF_HOST_DEVICE inline T operator++(int) - { - T tmp{static_cast(*this)}; - operator++(); - return tmp; - } - - /** - * @brief Prefix decrement operator. - */ - CUDF_HOST_DEVICE inline T& operator--() - { - T& derived = static_cast(*this); - derived.p_ -= width_; - return derived; - } - - /** - * @brief Postfix decrement operator. - */ - CUDF_HOST_DEVICE inline T operator--(int) - { - T tmp{static_cast(*this)}; - operator--(); - return tmp; - } - - /** - * @brief Compound assignment by sum operator. - */ - CUDF_HOST_DEVICE inline T& operator+=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ += offset * width_; - return derived; - } - - /** - * @brief Increment by offset operator. - */ - CUDF_HOST_DEVICE inline T operator+(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ += (offset * width_); - return tmp; - } - - /** - * @brief Addition assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ += (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compound assignment by difference operator. - */ - CUDF_HOST_DEVICE inline T& operator-=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ -= offset * width_; - return derived; - } - - /** - * @brief Decrement by offset operator. - */ - CUDF_HOST_DEVICE inline T operator-(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ -= (offset * width_); - return tmp; - } - - /** - * @brief Subtraction assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ -= (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compute offset from iterator difference operator. - */ - CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const - { - return (static_cast(*this).p_ - rhs.p_) / width_; - } - - /** - * @brief Equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const - { - return rhs.p_ == static_cast(*this).p_; - } - /** - * @brief Not equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const - { - return rhs.p_ != static_cast(*this).p_; - } - /** - * @brief Less than operator. - */ - CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const - { - return static_cast(*this).p_ < rhs.p_; - } - /** - * @brief Greater than operator. - */ - CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const - { - return static_cast(*this).p_ > rhs.p_; - } - /** - * @brief Less than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const - { - return static_cast(*this).p_ <= rhs.p_; - } - /** - * @brief Greater than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const - { - return static_cast(*this).p_ >= rhs.p_; - } - - protected: - /** - * @brief Constructor assigns width and type member variables for base class. - */ - base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {} - - int width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - /** * @brief The index normalizing input iterator. * @@ -244,65 +56,7 @@ struct base_indexalator { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -struct input_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = size_type const; // this keeps STL and thrust happy - - input_indexalator() = default; - input_indexalator(input_indexalator const&) = default; - input_indexalator(input_indexalator&&) = default; - input_indexalator& operator=(input_indexalator const&) = default; - input_indexalator& operator=(input_indexalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position. - */ - __device__ inline size_type operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a size_type value from any index type. - */ - struct index_as_size_type { - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `size_type` value. - */ - __device__ inline size_type operator[](size_type idx) const - { - void const* tp = p_ + (idx * width_); - return type_dispatcher(dtype_, index_as_size_type{}, tp); - } - - protected: - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - input_indexalator(void const* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char const* p_; /// pointer to the integer data in device memory -}; +using input_indexalator = input_normalator; /** * @brief The index normalizing output iterator. @@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator { * thrust::less()); * @endcode */ -struct output_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = output_indexalator const&; // required for output iterators - - output_indexalator() = default; - output_indexalator(output_indexalator const&) = default; - output_indexalator(output_indexalator&&) = default; - output_indexalator& operator=(output_indexalator const&) = default; - output_indexalator& operator=(output_indexalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(size_type)` calls. - */ - __device__ inline output_indexalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(size_type)` call in this class. - */ - __device__ inline output_indexalator const operator[](size_type idx) const - { - output_indexalator tmp{*this}; - tmp.p_ += (idx * width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct size_type_to_index { - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign a size_type value to the current iterator position. - */ - __device__ inline output_indexalator const& operator=(size_type const value) const - { - void* tp = p_; - type_dispatcher(dtype_, size_type_to_index{}, tp, value); - return *this; - } - - protected: - /** - * @brief Create an output index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - output_indexalator(void* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char* p_; /// pointer to the integer data in device memory -}; +using output_indexalator = output_normalator; /** * @brief Use this class to create an indexalator instance. @@ -413,7 +95,7 @@ struct indexalator_factory { template ()>* = nullptr> input_indexalator operator()(column_view const& indices) { - return input_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return input_indexalator(indices.data(), indices.type()); } template const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); - return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type()); + return input_indexalator(scalar_impl->data(), index.type()); } template ()>* = nullptr> output_indexalator operator()(mutable_column_view const& indices) { - return output_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return output_indexalator(indices.data(), indices.type()); } template to_arrow_array(cudf::type_id id, Ts&&... args) } } +/** + * @brief Invokes an `operator()` template with the type instantiation based on + * the specified `arrow::DataType`'s `id()`. + * + * This function is analogous to libcudf's type_dispatcher, but instead applies + * to Arrow functions. Its primary use case is to leverage Arrow's + * metaprogramming facilities like arrow::TypeTraits that require translating + * the runtime dtype information into compile-time types. + */ +template +constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, + Functor f, + Ts&&... args) +{ + switch (dtype.id()) { + case arrow::Type::INT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::FLOAT: + return f.template operator()(std::forward(args)...); + case arrow::Type::DOUBLE: + return f.template operator()(std::forward(args)...); + case arrow::Type::BOOL: + return f.template operator()(std::forward(args)...); + case arrow::Type::TIMESTAMP: + return f.template operator()(std::forward(args)...); + case arrow::Type::DURATION: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRING: + return f.template operator()(std::forward(args)...); + case arrow::Type::LIST: + return f.template operator()(std::forward(args)...); + case arrow::Type::DECIMAL128: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRUCT: + return f.template operator()(std::forward(args)...); + default: { + CUDF_FAIL("Invalid type."); + } + } +} + // Converting arrow type to cudf type data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); /** - * @copydoc cudf::to_arrow - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata, @@ -118,13 +172,27 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr); /** - * @copydoc cudf::arrow_to_cudf - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr); +/** + * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) */ std::unique_ptr from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh new file mode 100644 index 00000000000..51b3133f84f --- /dev/null +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief The base class for the input or output normalizing iterator + * + * The base class mainly manages updating the `p_` member variable while the + * subclasses handle accessing individual elements in device memory. + * + * @tparam Derived The derived class type for the iterator + * @tparam Integer The type the iterator normalizes to + */ +template +struct base_normalator { + static_assert(std::is_integral_v); + using difference_type = std::ptrdiff_t; + using value_type = Integer; + using pointer = Integer*; + using iterator_category = std::random_access_iterator_tag; + + base_normalator() = default; + base_normalator(base_normalator const&) = default; + base_normalator(base_normalator&&) = default; + base_normalator& operator=(base_normalator const&) = default; + base_normalator& operator=(base_normalator&&) = default; + + /** + * @brief Prefix increment operator. + */ + CUDF_HOST_DEVICE inline Derived& operator++() + { + Derived& derived = static_cast(*this); + derived.p_ += width_; + return derived; + } + + /** + * @brief Postfix increment operator. + */ + CUDF_HOST_DEVICE inline Derived operator++(int) + { + Derived tmp{static_cast(*this)}; + operator++(); + return tmp; + } + + /** + * @brief Prefix decrement operator. + */ + CUDF_HOST_DEVICE inline Derived& operator--() + { + Derived& derived = static_cast(*this); + derived.p_ -= width_; + return derived; + } + + /** + * @brief Postfix decrement operator. + */ + CUDF_HOST_DEVICE inline Derived operator--(int) + { + Derived tmp{static_cast(*this)}; + operator--(); + return tmp; + } + + /** + * @brief Compound assignment by sum operator. + */ + CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ += offset * width_; + return derived; + } + + /** + * @brief Increment by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ += (offset * width_); + return tmp; + } + + /** + * @brief Addition assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ += (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compound assignment by difference operator. + */ + CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ -= offset * width_; + return derived; + } + + /** + * @brief Decrement by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ -= (offset * width_); + return tmp; + } + + /** + * @brief Subtraction assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ -= (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compute offset from iterator difference operator. + */ + CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const + { + return (static_cast(*this).p_ - rhs.p_) / width_; + } + + /** + * @brief Equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const + { + return rhs.p_ == static_cast(*this).p_; + } + + /** + * @brief Not equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const + { + return rhs.p_ != static_cast(*this).p_; + } + + /** + * @brief Less than operator. + */ + CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const + { + return static_cast(*this).p_ < rhs.p_; + } + + /** + * @brief Greater than operator. + */ + CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const + { + return static_cast(*this).p_ > rhs.p_; + } + + /** + * @brief Less than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const + { + return static_cast(*this).p_ <= rhs.p_; + } + + /** + * @brief Greater than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const + { + return static_cast(*this).p_ >= rhs.p_; + } + + protected: + /** + * @brief Constructor assigns width and type member variables for base class. + */ + explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} + + int width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls +}; + +/** + * @brief The integer normalizing input iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for reading an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. + * Reading specific elements always return a type of `Integer` + * + * @tparam Integer Type returned by all read functions + */ +template +struct input_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = Integer const; // this keeps STL and thrust happy + + input_normalator() = default; + input_normalator(input_normalator const&) = default; + input_normalator(input_normalator&&) = default; + input_normalator& operator=(input_normalator const&) = default; + input_normalator& operator=(input_normalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline Integer operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template >* = nullptr> + __device__ Integer operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template >* = nullptr> + __device__ Integer operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline Integer operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param data_type Type of data in data + */ + input_normalator(void const* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The integer normalizing output iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for writing an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. + * Setting specific elements always accept the `Integer` type values. + * + * @tparam Integer The type used for all write functions + */ +template +struct output_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = output_normalator const&; // required for output iterators + + output_normalator() = default; + output_normalator(output_normalator const&) = default; + output_normalator(output_normalator&&) = default; + output_normalator& operator=(output_normalator const&) = default; + output_normalator& operator=(output_normalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline output_normalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_normalator const operator[](size_type idx) const + { + output_normalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template >* = nullptr> + __device__ void operator()(void* tp, Integer const value) + { + (*static_cast(tp)) = static_cast(value); + } + template >* = nullptr> + __device__ void operator()(void*, Integer const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline output_normalator const& operator=(Integer const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param data_type Type of data in data + */ + output_normalator(void* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index e210179b147..865cc004107 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -126,23 +126,56 @@ struct column_metadata { * * @param input table_view that needs to be converted to arrow Table * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +/** + * @brief Create `arrow::Scalar` from cudf scalar `input` + * + * Converts the `cudf::scalar` to `arrow::Scalar`. + * + * @param input scalar that needs to be converted to arrow Scalar + * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches + * @param ar_mr arrow memory pool to allocate memory for arrow Scalar + * @return arrow Scalar generated from `input` + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input * * @param input arrow:Table that needs to be converted to `cudf::table` + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ std::unique_ptr
from_arrow( arrow::Table const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::scalar` from given arrow Scalar input + * + * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::scalar` + * @return cudf scalar generated from given arrow Scalar + */ + +std::unique_ptr from_arrow( + arrow::Scalar const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 6924e77ae9b..e4e803b2d3c 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -43,6 +44,7 @@ namespace cudf { * @param null_precedence The desired order of null compared to other elements * for each column. Size must be equal to `input.num_columns()` or empty. * If empty, all columns will be sorted in `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the permuted row indices of * `input` if it were sorted @@ -51,6 +53,7 @@ std::unique_ptr sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -65,27 +68,30 @@ std::unique_ptr stable_sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Checks whether the rows of a `table` are sorted in a lexicographical * order. * - * @param[in] table Table whose rows need to be compared for ordering - * @param[in] column_order The expected sort order for each column. Size - * must be equal to `in.num_columns()` or empty. If - * empty, it is expected all columns are in - * ascending order. - * @param[in] null_precedence The desired order of null compared to other - * elements for each column. Size must be equal to - * `input.num_columns()` or empty. If empty, - * `null_order::BEFORE` is assumed for all columns. - * - * @returns bool true if sorted as expected, false if not + * @param table Table whose rows need to be compared for ordering + * @param column_order The expected sort order for each column. Size + * must be equal to `in.num_columns()` or empty. If + * empty, it is expected all columns are in + * ascending order. + * @param null_precedence The desired order of null compared to other + * elements for each column. Size must be equal to + * `input.num_columns()` or empty. If empty, + * `null_order::BEFORE` is assumed for all columns. + * + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns true if sorted as expected, false if not */ bool is_sorted(cudf::table_view const& table, std::vector const& column_order, - std::vector const& null_precedence); + std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Performs a lexicographic sort of the rows of a table @@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table, * elements for each column in `input`. Size must be equal to * `input.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return New table containing the desired sorted order of `input` */ @@ -105,6 +112,7 @@ std::unique_ptr
sort( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,6 +132,7 @@ std::unique_ptr
sort( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -133,6 +142,7 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -154,6 +164,7 @@ std::unique_ptr
sort_by_key( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -163,6 +174,7 @@ std::unique_ptr
stable_sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +201,7 @@ std::unique_ptr
stable_sort_by_key( * @param null_precedence The desired order of null compared to other elements * for column * @param percentage flag to convert ranks to percentage in range (0,1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A column of containing the rank of the each element of the column of `input`. The output * column type will be `size_type`column by default or else `double` when @@ -201,6 +214,7 @@ std::unique_ptr rank( null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -241,6 +255,7 @@ std::unique_ptr rank( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return sorted order of the segment sorted table * @@ -250,6 +265,7 @@ std::unique_ptr segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -262,6 +278,7 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -306,6 +323,7 @@ std::unique_ptr stable_segmented_sorted_order( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return table with elements in each segment sorted * @@ -316,6 +334,7 @@ std::unique_ptr
segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -329,6 +348,7 @@ std::unique_ptr
stable_segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5d66401df9d..46f2c0e7bc9 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -47,19 +47,19 @@ namespace nvtext { * @throw cudf::logic_error if `separator` is invalid * @throw cudf::logic_error if there are not enough strings to generate any ngrams * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_ngrams( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Generates ngrams of characters within each string. @@ -79,15 +79,17 @@ std::unique_ptr generate_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,14 +115,16 @@ std::unique_ptr generate_character_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values */ std::unique_ptr hash_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 17f20f7ea4c..9d76ef8689f 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,22 +66,22 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * An empty string will separate tokens using whitespace. + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr ngrams_tokenize( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& delimiter, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 30cfee97fd8..e39625c92e7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -419,6 +419,52 @@ std::unique_ptr get_column(arrow::Array const& array, : get_empty_type_column(array.length()); } +struct BuilderGenerator { + template && + !std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + return std::make_shared::BuilderType>( + type, arrow::default_memory_pool()); + } + + template || + std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + CUDF_FAIL("Type not supported by BuilderGenerator"); + } +}; + +std::shared_ptr make_builder(std::shared_ptr const& type) +{ + switch (type->id()) { + case arrow::Type::STRUCT: { + std::vector> field_builders; + + for (auto field : type->fields()) { + auto const vt = field->type(); + if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { + field_builders.push_back(make_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared( + type, arrow::default_memory_pool(), field_builders); + } + case arrow::Type::LIST: { + return std::make_shared(arrow::default_memory_pool(), + make_builder(type->field(0)->type())); + } + default: { + return arrow_type_dispatcher(*type, BuilderGenerator{}, type); + } + } +} + } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table, @@ -462,14 +508,54 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, return std::make_unique
(std::move(columns)); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get a builder for the scalar type + auto builder = detail::make_builder(input.type); + + auto status = builder->AppendScalar(input); + if (status != arrow::Status::OK()) { + if (status.IsNotImplemented()) { + // The only known failure case here is for nulls + CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", + std::invalid_argument); + } + CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); + } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + auto array = *maybe_array; + + auto field = arrow::field("", input.type); + + auto table = arrow::Table::Make(arrow::schema({field}), {array}); + + auto cudf_table = detail::from_arrow(*table, stream, mr); + + auto cv = cudf_table->view().column(0); + return get_element(cv, 0, stream); +} + } // namespace detail std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::get_default_stream(), mr); + return detail::from_arrow(input_table, stream, mr); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow(input, stream, mr); +} } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 958a2fcb95f..0cd750bc947 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,14 +15,16 @@ */ #include +#include #include +#include #include #include +#include #include #include #include #include -#include #include #include #include @@ -77,7 +79,10 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), - (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), + (input_view.offset() > 0) + ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) + .data() + : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDefault, stream.value())); @@ -139,29 +144,36 @@ struct dispatch_to_arrow { } }; -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) +// Convert decimal types from libcudf to arrow where those types are not +// directly supported by Arrow. These types must be fit into 128 bits, the +// smallest decimal resolution supported by Arrow. +template +std::shared_ptr unsupported_decimals_to_arrow(column_view input, + int32_t precision, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) { - using DeviceType = int64_t; - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx * 2; - out[out_idx] = in[in_idx]; - out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; - }); + thrust::for_each( + rmm::exec_policy(cudf::get_default_stream()), + count, + count + input.size(), + [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); @@ -169,7 +181,7 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -177,6 +189,28 @@ std::shared_ptr dispatch_to_arrow::operator()( return std::make_shared(data); } +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); +} + +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); +} + template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, @@ -403,14 +437,37 @@ std::shared_ptr to_arrow(table_view input, return result; } + +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + auto const column = cudf::make_column_from_scalar(input, 1, stream); + cudf::table_view const tv{{column->view()}}; + auto const arrow_table = cudf::to_arrow(tv, {metadata}, stream); + auto const ac = arrow_table->column(0); + auto const maybe_scalar = ac->GetScalar(0); + if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } + return maybe_scalar.ValueOrDie(); +} } // namespace detail std::shared_ptr to_arrow(table_view input, std::vector const& metadata, + rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); + return detail::to_arrow(input, metadata, stream, ar_mr); } +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow(input, metadata, stream, ar_mr); +} } // namespace cudf diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index f8e7b4c6126..40a14d805e1 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,12 +36,12 @@ namespace cudf { namespace lists { namespace detail { /** - * @brief Returns a numeric column containing lengths of each element. + * @brief Returns a numeric column containing lengths of each element * - * @param input Input lists column. - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT32 column with lengths. + * @return New size_type column with lengths */ std::unique_ptr count_elements(lists_column_view const& input, rmm::cuda_stream_view stream, @@ -52,7 +52,7 @@ std::unique_ptr count_elements(lists_column_view const& input, // create output column auto output = make_fixed_width_column(data_type{type_to_id()}, input.size(), - copy_bitmask(input.parent()), + cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 260636a61cf..49054ebb046 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,13 +70,13 @@ std::unique_ptr sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), @@ -98,13 +98,13 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 2b48aed2d29..950cb484ddf 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -163,7 +163,9 @@ std::enable_if_t(), std::unique_ptr> clamp auto output = detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); // mask will not change - if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); } + if (input.nullable()) { + output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + } auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 6e69b5157c2..7ac784bef43 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -357,6 +357,16 @@ template struct device_value_accessor { column_device_view const col; ///< column view of column in device + /// Checks that the type used to access device values matches the rep-type + /// of the order-by column. + struct is_correct_range_rep { + template /// Order-by type. + constexpr bool operator()() const + { + return std::is_same_v>; + } + }; + /** * @brief constructor * @@ -364,8 +374,11 @@ struct device_value_accessor { */ explicit __device__ device_value_accessor(column_device_view const& col_) : col{col_} { - cudf_assert(type_id_matches_device_storage_type(col.type().id()) && - "the data type mismatch"); + // For non-timestamp types, T must match the order-by column's type. + // For timestamp types, T must match the range rep type for the order-by column. + cudf_assert((type_id_matches_device_storage_type(col.type().id()) or + cudf::type_dispatcher(col.type(), is_correct_range_rep{})) && + "data type mismatch when accessing the order-by column"); } /** diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 25c594e9e74..39476a2f534 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in, bool is_sorted(cudf::table_view const& in, std::vector const& column_order, - std::vector const& null_precedence) + std::vector const& null_precedence, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); if (in.num_columns() == 0 || in.num_rows() == 0) { return true; } @@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in, "Number of columns in the table doesn't match the vector null_precedence's size .\n"); } - return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream()); + return detail::is_sorted(in, column_order, null_precedence, stream); } } // namespace cudf diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index fd65e38d467..3ead8cfcbaa 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -366,16 +366,11 @@ std::unique_ptr rank(column_view const& input, null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rank(input, - method, - column_order, - null_handling, - null_precedence, - percentage, - cudf::get_default_stream(), - mr); + return detail::rank( + input, method, column_order, null_handling, null_precedence, percentage, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 38d008c120c..d9457341bd2 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,11 +81,12 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
segmented_sort_by_key(table_view const& values, @@ -93,11 +94,12 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 37664f33762..5d11bf055f1 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -166,7 +166,7 @@ std::unique_ptr fast_segmented_sorted_order(column_view const& input, // Unfortunately, CUB's segmented sort functions cannot accept iterators. // We have to build a pre-filled sequence of indices as input. auto sorted_indices = - cudf::detail::sequence(input.size(), numeric_scalar{0}, stream, mr); + cudf::detail::sequence(input.size(), numeric_scalar{0, true, stream}, stream, mr); auto indices_view = sorted_indices->mutable_view(); cudf::type_dispatcher(input.type(), diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 25b95af4f83..46edae798d4 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -109,30 +109,32 @@ std::unique_ptr
sort(table_view const& input, std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu index 40df1b50279..4725d65e05d 100644 --- a/cpp/src/sort/stable_segmented_sort.cu +++ b/cpp/src/sort/stable_segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,11 +55,12 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, @@ -67,11 +68,12 @@ std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 6f5678c4168..cf602dcf1a9 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -62,22 +62,22 @@ std::unique_ptr
stable_sort_by_key(table_view const& values, std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sorted_order( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 938fd45246d..5f2f4d021a4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -150,10 +150,11 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s std::unique_ptr generate_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr); + return detail::generate_ngrams(strings, ngrams, separator, stream, mr); } namespace detail { @@ -317,18 +318,20 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::generate_character_ngrams(strings, ngrams, stream, mr); } std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::hash_character_ngrams(strings, ngrams, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 5b55745c2c7..95324847ea0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -107,7 +107,7 @@ rmm::device_uvector compute_unique_counts(cudf::column_view con * * This is called with a warp per row */ -struct sorted_interset_fn { +struct sorted_intersect_fn { cudf::column_device_view const d_input1; cudf::column_device_view const d_input2; cudf::size_type* d_results; @@ -151,7 +151,7 @@ rmm::device_uvector compute_intersect_counts(cudf::column_view auto const d_input1 = cudf::column_device_view::create(input1, stream); auto const d_input2 = cudf::column_device_view::create(input2, stream); auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()}; + sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), input1.size() * cudf::detail::warp_size, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index fd1cbf99221..73d85513e95 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -265,11 +265,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize( - strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr); + return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d1e50442058..956bfc7c27d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -621,17 +621,20 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) -ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing ) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9a5cc3733af..a898106a5b2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -456,3 +456,98 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000), std::make_tuple(10000, 10000))); + +template +struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(FromArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const arrow_scalar = arrow::MakeScalar(value); + auto const cudf_scalar = cudf::from_arrow(*arrow_scalar); + auto const cudf_numeric_scalar = + dynamic_cast*>(cudf_scalar.get()); + if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } + EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); + EXPECT_EQ(cudf_numeric_scalar->value(), value); +} + +struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(FromArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{8}; + auto const scale{4}; + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + // Arrow offers a minimum of 128 bits for the Decimal type. + auto const cudf_decimal_scalar = + dynamic_cast*>(cudf_scalar.get()); + EXPECT_EQ(cudf_decimal_scalar->type(), + cudf::data_type(cudf::type_to_id(), scale)); + EXPECT_EQ(cudf_decimal_scalar->value(), value); +} + +struct FromArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStringScalarTest, Basic) +{ + auto const value = std::string("hello world"); + auto const arrow_scalar = arrow::StringScalar(value); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); + EXPECT_EQ(cudf_string_scalar->to_string(), value); +} + +struct FromArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowListScalarTest, Basic) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const arrow_scalar = arrow::ListScalar(array); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); + + cudf::test::fixed_width_column_wrapper const lhs( + host_values.begin(), host_values.end(), host_validity.begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view()); +} + +struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + + auto const field = arrow::field("", underlying_arrow_scalar->type); + auto const arrow_type = arrow::struct_({field}); + auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); + + cudf::test::fixed_width_column_wrapper const col({value}); + cudf::table_view const lhs({col}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); +} diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 97d80984272..6bb4cdfd747 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -578,4 +579,106 @@ INSTANTIATE_TEST_CASE_P(ToArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000))); +template +struct ToArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(ToArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const cudf_scalar = cudf::make_fixed_width_scalar(value); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(ToArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + int32_t const scale{4}; + + auto const cudf_scalar = + cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const maybe_ref_arrow_scalar = + arrow::MakeScalar(arrow::decimal128(precision, -scale), value); + if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); } + auto const ref_arrow_scalar = *maybe_ref_arrow_scalar; + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStringScalarTest, Basic) +{ + std::string const value{"hello world"}; + auto const cudf_scalar = cudf::make_string_scalar(value); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowListScalarTest, Basic) +{ + std::vector const host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector const host_validity = {true, true, true, false, true, true, true}; + + cudf::test::fixed_width_column_wrapper const col( + host_values.begin(), host_values.end(), host_validity.begin()); + + auto const cudf_scalar = cudf::make_list_scalar(col); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const ref_arrow_scalar = arrow::ListScalar(array); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + +struct ToArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const field_name{"a"}; + + cudf::test::fixed_width_column_wrapper const col{value}; + cudf::table_view const tbl({col}); + auto const cudf_scalar = cudf::make_struct_scalar(tbl); + + cudf::column_metadata metadata{""}; + metadata.children_meta.emplace_back(field_name); + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + auto const field = arrow::field(field_name, underlying_arrow_scalar->type, false); + auto const arrow_type = arrow::struct_({field}); + auto const ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp new file mode 100644 index 00000000000..7eac9e016eb --- /dev/null +++ b/cpp/tests/streams/interop_test.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +struct ArrowTest : public cudf::test::BaseFixture {}; + +TEST_F(ArrowTest, ToArrow) +{ + int32_t const value{42}; + auto col = cudf::test::fixed_width_column_wrapper{{value}}; + cudf::table_view tbl{{col}}; + + std::vector metadata{{""}}; + cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrow) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto field = arrow::field("", arrow::int32()); + auto schema = arrow::schema({field}); + auto table = arrow::Table::Make(schema, {array}); + cudf::from_arrow(*table, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, ToArrowScalar) +{ + int32_t const value{42}; + auto cudf_scalar = + cudf::make_fixed_width_scalar(value, cudf::test::get_default_stream()); + + cudf::column_metadata metadata{""}; + cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrowScalar) +{ + int32_t const value{42}; + auto arrow_scalar = arrow::MakeScalar(value); + cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp new file mode 100644 index 00000000000..e481f95bded --- /dev/null +++ b/cpp/tests/streams/sorting_test.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class SortingTest : public cudf::test::BaseFixture {}; + +TEST_F(SortingTest, SortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, IsSorted) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Sort) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sort(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Rank) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::rank(column, + cudf::rank_method::AVERAGE, + cudf::order::ASCENDING, + cudf::null_policy::EXCLUDE, + cudf::null_order::AFTER, + false, + cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::stable_segmented_sorted_order( + keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::stable_segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp new file mode 100644 index 00000000000..bce0d2b680b --- /dev/null +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class TextNGramsTest : public cudf::test::BaseFixture {}; + +TEST_F(TextNGramsTest, GenerateNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::generate_ngrams( + cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, GenerateCharacterNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::generate_character_ngrams( + cudf::strings_column_view(input), 3, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, HashCharacterNgrams) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::hash_character_ngrams( + cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, NgramsTokenize) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::ngrams_tokenize( + cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 323b3eed3e2..7b179588385 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams) { cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"}; cudf::strings_column_view strings_view(strings); + auto const separator = cudf::string_scalar("_"); { cudf::test::strings_column_wrapper expected{ "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view); + auto const results = nvtext::generate_ngrams(strings_view, 2, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const separator = cudf::string_scalar("_"); cudf::strings_column_view strings_view(strings); { - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); + auto const separator = cudf::string_scalar("_"); + + auto results = + nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty) TEST_F(TextGenerateNgramsTest, Errors) { cudf::test::strings_column_wrapper strings{""}; + auto const separator = cudf::string_scalar("_"); // invalid parameter value - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); // not enough strings to generate ngrams - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); - std::vector h_strings{"", nullptr, "", nullptr}; - cudf::test::strings_column_wrapper strings_no_tokens( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)), + cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0}); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator), cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)), cudf::logic_error); diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp index 5879bec3e64..c6fb886f7e5 100644 --- a/cpp/tests/text/ngrams_tokenize_tests.cpp +++ b/cpp/tests/text/ngrams_tokenize_tests.cpp @@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize) "mousé_ate", "ate_the", "the_cheese"}; - auto results = nvtext::ngrams_tokenize(strings_view); + auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram) { cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"}; cudf::strings_column_view strings_view(strings); + auto const empty = cudf::string_scalar(""); cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"}; - auto results = nvtext::ngrams_tokenize(strings_view, 1); + auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto results = nvtext::ngrams_tokenize(strings_view); + auto const empty = cudf::string_scalar(""); + auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty); EXPECT_EQ(results->size(), 0); EXPECT_EQ(results->has_nulls(), false); } @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest) { cudf::test::strings_column_wrapper strings{"this column intentionally left blank"}; cudf::strings_column_view strings_view(strings); - EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error); + auto const empty = cudf::string_scalar(""); + EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error); } diff --git a/dependencies.yaml b/dependencies.yaml index 376e43094a7..5586f54348c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -218,6 +218,7 @@ dependencies: - libkvikio==23.10.* - output_types: conda packages: + - aws-sdk-cpp<1.11 - fmt>=9.1.0,<10 - &gbench benchmark==1.8.0 - >est gtest>=1.13.0 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1a780cc9e9f..8a3dbe77787 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1390,10 +1390,21 @@ def _get_numeric_data(self): return self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs): + def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): """ Assign columns to DataFrame from keyword arguments. + Parameters + ---------- + **kwargs: dict mapping string column names to values + The value for each key can either be a literal column (or + something that can be converted to a column), or + a callable of one argument that will be given the + dataframe as an argument and should return the new column + (without modifying the input argument). + Columns are added in-order, so callables can refer to + column names constructed in the assignment. + Examples -------- >>> import cudf @@ -1405,15 +1416,9 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new_df = cudf.DataFrame(index=self.index.copy()) - for name, col in self._data.items(): - if name in kwargs: - new_df[name] = kwargs.pop(name) - else: - new_df._data[name] = col.copy() - + new_df = self.copy(deep=False) for k, v in kwargs.items(): - new_df[k] = v + new_df[k] = v(new_df) if callable(v) else v return new_df @classmethod diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6180162ecdd..2f531afdeb7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1327,6 +1327,25 @@ def test_assign(): np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) +@pytest.mark.parametrize( + "mapping", + [ + {"y": 1, "z": lambda df: df["x"] + df["y"]}, + { + "x": lambda df: df["x"] * 2, + "y": lambda df: 2, + "z": lambda df: df["x"] / df["y"], + }, + ], +) +def test_assign_callable(mapping): + df = pd.DataFrame({"x": [1, 2, 3]}) + cdf = cudf.from_pandas(df) + expect = df.assign(**mapping) + actual = cdf.assign(**mapping) + assert_eq(expect, actual) + + @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) @pytest.mark.parametrize("seed", [None, 42])