diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 2a195c6c81d..a0e193ff0bf 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -15,9 +15,31 @@ ], "hostRequirements": {"gpu": "optional"}, "features": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { + "version": "12.5", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a22d3c5b9cc..1275aad757c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -47,11 +47,23 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: - build_type: pull-request + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} # Use the wheel container so we can skip conda solves and since our # primary static consumers (Spark) are not in conda anyway. 
container_image: "rapidsai/ci-wheel:latest" run_script: "ci/configure_cpp_static.sh" + clang-tidy: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + run_script: "ci/clang_tidy.sh" conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh new file mode 100755 index 00000000000..4d5d3fc3136 --- /dev/null +++ b/ci/clang_tidy.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create clang-tidy conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled. 
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja +cmake --build cpp/build diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 870901d223b..95f36653c2c 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -93,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md # .devcontainer files find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 0819eacf636..2439af5b644 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -23,7 +23,10 @@ compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv -compute-sanitizer --tool memcheck parquet_io +compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2 + exit ${EXITCODE} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 2d4f8c0d80e..12120a5c6d1 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,7 +39,7 @@ Checks: 
-clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '' +WarningsAsErrors: '*' HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f7a5dd2f2fb..32a753c9f40 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,6 +88,7 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) +option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -144,6 +145,58 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() +# ################################################################################################## +# * clang-tidy configuration ---------------------------------------------------------------------- +if(CUDF_CLANG_TIDY) + find_program( + CLANG_TIDY_EXE + NAMES "clang-tidy" + DOC "Path to clang-tidy executable" REQUIRED + ) + + execute_process( + COMMAND ${CLANG_TIDY_EXE} --version + OUTPUT_VARIABLE CLANG_TIDY_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH + "${CLANG_TIDY_OUTPUT}" + ) + # Discard the patch version and allow it to float. Empirically, results between patch versions are + # mostly stable, and looking at available packages on some package managers sometimes patch + # versions are skipped so we don't want to constrain to a patch version that the user can't + # install. 
+ set(LLVM_VERSION "${CMAKE_MATCH_1}") + set(expected_clang_tidy_version 19.1) + if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION) + message( + FATAL_ERROR + "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" + ) + endif() +endif() + +# Turn on the clang-tidy property for a target excluding the files specified in SKIPPED_FILES. +function(enable_clang_tidy target) + set(_tidy_options) + set(_tidy_one_value) + set(_tidy_multi_value SKIPPED_FILES) + cmake_parse_arguments( + _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + ) + + if(CUDF_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + foreach(file IN LISTS _TIDY_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() + endif() +endfunction() + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -315,6 +368,7 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu @@ -713,6 +767,7 @@ target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" ) +enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option @@ -862,15 +917,7 @@ if(CUDF_BUILD_TESTUTIL) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) - add_library( - cudftestutil 
SHARED - tests/io/metadata_utilities.cpp - tests/utilities/column_utilities.cu - tests/utilities/debug_utilities.cu - tests/utilities/random_seed.cpp - tests/utilities/table_utilities.cu - tests/utilities/tdigest_utilities.cu - ) + add_library(cudftestutil INTERFACE) set_target_properties( cudftestutil @@ -879,32 +926,56 @@ if(CUDF_BUILD_TESTUTIL) # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - CUDA_VISIBILITY_PRESET hidden - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON ) target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" + cudftestutil INTERFACE "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" ) target_link_libraries( - cudftestutil - PUBLIC Threads::Threads cudf cudftest_default_stream - PRIVATE GTest::gmock GTest::gtest $ + cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream + $ ) target_include_directories( - cudftestutil PUBLIC "$" - "$" + cudftestutil INTERFACE "$" + "$" ) rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudftestutil_impl INTERFACE) + add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl) + target_sources( + cudftestutil_impl + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) + target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil) + + install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io) + install( + FILES tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu + tests/utilities/random_seed.cpp + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + DESTINATION src/cudftestutil/utilities + ) + endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- @@ -1005,7 +1076,7 @@ install( set(_components_export_string) if(TARGET 
cudftestutil) install( - TARGETS cudftest_default_stream cudftestutil + TARGETS cudftest_default_stream cudftestutil cudftestutil_impl DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) @@ -1045,14 +1116,15 @@ targets: This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: - cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil_impl - C++ and CUDA sources to compile for definitions in cudf::cudftestutil ]=] ) rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string ) @@ -1073,7 +1145,7 @@ endif() rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b0f75b25975..d6fc5dc6039 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3::nvtx3-cpp + cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -65,14 +65,14 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - 
common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp + common/benchmark_utilities.cpp common/nvbench_utilities.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) +target_link_libraries( + cudf_benchmark_common PRIVATE cudf_datagen $ GTest::gmock + GTest::gtest +) + add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main - $ + cudf::cudftestutil_impl $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -127,8 +127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen - nvbench::nvbench $ + ${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench + $ cudf::cudftestutil_impl ) install( TARGETS ${CMAKE_BENCH_NAME} diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,23 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) + +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded 
+add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ +) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 50% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common_utils.cpp index e27cbec4fce..a79ca48af86 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -14,30 +14,27 @@ * limitations under the License. */ -#pragma once +#include "common_utils.hpp" -#include +#include #include #include #include -#include -#include #include #include #include #include -#include -#include +#include #include /** - * @brief Create memory resource for libcudf functions + * @file common_utils.cpp + * @brief Definitions for common utilities for `parquet_io` examples * - * @param pool Whether to use a pool memory resource. 
- * @return Memory resource instance */ + std::shared_ptr create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared(); @@ -48,17 +45,11 @@ std::shared_ptr create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static std::unordered_map const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -69,26 +60,18 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static std::unordered_map const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -97,30 +80,58 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return 
map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get the optional page size stat frequency from they keyword - * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) - */ -[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; +} + +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Tables identical: " << valid << "\n\n"; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Tables identical: false\n\n"); } +} - return std::nullopt; +std::unique_ptr 
concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +std::string current_date_and_time() +{ + auto const time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto const local_time = *std::localtime(&time); + // Stringstream to format the date and time + std::stringstream ss; + ss << std::put_time(&local_time, "%Y-%m-%d-%H-%M-%S"); + return ss.str(); } diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..12896e61a0d --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +/** + * @file common_utils.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param is_pool_used Whether to use a pool memory resource. 
+ * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get boolean from the keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_date_and_time(); diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp new file mode 100644 index 00000000000..019b3f96474 --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} +{ + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp new file mode 100644 index 00000000000..a614d348fae --- /dev/null +++ b/cpp/examples/parquet_io/io_source.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +/** + * @file io_source.hpp + * @brief Utilities for constructing the specified IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource(); + +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template +struct pinned_allocator : public std::allocator { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + // alias for pinned vector + template + using pinned_vector = thrust::host_vector>; + cudf::io::source_info source_info; + std::vector h_buffer; + pinned_vector pinned_buffer; + rmm::device_uvector d_buffer; +}; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9cda22d0695..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,11 +14,15 @@ * limitations under the License. */ -#include "parquet_io.hpp" - #include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include -#include +#include /** * @file parquet_io.cpp @@ -81,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. 
+ */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io \n" + " \n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -97,29 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + case 6: + page_stats = get_boolean(argv[5]) + ? 
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool @@ -130,18 +145,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -149,7 +162,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." 
<< std::endl; + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); @@ -157,23 +170,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join(input->view(), - transcoded_input->view(), - cudf::null_equality::EQUAL, - cudf::get_default_stream(), - resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..6ad4b862240 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. + * + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and checked for validity of the output files against the input + * data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. + * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER + * + */ + +// Type alias for unique ptr to cudf table +using table_t = std::unique_ptr; + +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NO_CONCATENATE, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template +struct read_fn { + std::vector const& input_sources; + std::vector& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + // Tables read by this thread + std::vector tables_this_thread; + + // Sweep the available input files + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); + curr_file_idx += thread_count) { + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); + if constexpr (read_mode != read_mode::NO_CONCATENATE) 
{ + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } + } + + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. + if constexpr (read_mode != read_mode::NO_CONCATENATE) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam read_mode Specifies if to concatenate and return the actual + * tables or discard them and return an empty vector + * + * @param input_sources List of input sources to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables. + */ +template +std::vector read_parquet_multithreaded(std::vector const& input_sources, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector tables(thread_count); + + // Table reading tasks + std::vector> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + // Create threads with tasks + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } + + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. 
+ if (read_mode == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); + } + + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ +struct write_fn { + std::string const& output_path; + std::vector const& table_views; + int const thread_id; + rmm::cuda_stream_view stream; + + void operator()() + { + // Create a sink + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; + + builder.metadata(table_metadata); + auto options = builder.build(); + + // Write parquet data + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); + } +}; + +/** + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least `thread_count` table views to be written + * @param thread_count Number of threads to use for writing tables. 
+ * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout + << "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used. All input parquet files will\n" + " be converted to the specified IO source type before reading\n\n"; +} + +/** + * @brief Function to process comma delimited input paths string to parquet files and/or dirs + * and convert them to specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiply the list by `input_multiplier`, + * make sure to have at least `thread_count` files to satisfy at least one file per parallel thread, + * and convert the final list of files to a list of `io_source` and return. 
+ * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + * @return Vector of input sources for the given paths + */ +std::vector extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector const delimited_paths = [&]() { + std::vector paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. 
+ else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } else { + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; + } + } + } else { + print_usage(); + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count. + // Guarded so that the warning is only printed when cycling actually takes place. + if (thread_count > static_cast<int32_t>(parquet_files.size())) { + std::cout << "Warning: Number of input sources < thread count. 
Cycling from\n" + "and appending to current input sources such that the number of\n" + "input source == thread count\n"; + for (size_t idx = 0; thread_count > static_cast<int32_t>(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + } + + // Vector of io sources + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + stream.synchronize(); + return input_sources; +} + +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = 
rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read. Exiting early.\n"); + } + + // Read the same parquet files specified times with multiple threads and discard the read tables + { + // Print status + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; + + if (io_source_type == io_source_type::FILEPATH) { + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; + } + + cudf::examples::timer timer; + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded( + input_sources, thread_count, stream_pool); + }); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + + // Write parquet files and validate if needed + if (write_and_validate) { + // read_mode::CONCATENATE_THREAD returns a vector of `thread_count` tables + auto const tables = read_parquet_multithreaded( + input_sources, thread_count, stream_pool); + default_stream.synchronize(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet 
+ std::cout << "Writing parquet output files..\n"; + + // Create a directory at the tmpdir path. + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_date_and_time(); + std::filesystem::create_directory({output_path}); + cudf::examples::timer timer; + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output + std::cout << "Verifying output..\n"; + + // Simply concatenate the previously read tables from input sources + auto const input_table = cudf::concatenate(table_views, default_stream); + + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources( + output_path, input_multiplier, thread_count, io_source_type, default_stream); + + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only + auto const transcoded_table = std::move(read_parquet_multithreaded( + written_pq_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_tables_equal(input_table->view(), transcoded_table->view()); + + // Remove the created temp directory and parquet data + std::filesystem::remove_all(output_path); + } + + // Print peak memory + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + return 0; +} diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 272c91133f8..2bd08f410e0 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -36,6 +37,12 @@ namespace CUDF_EXPORT cudf { namespace test { +struct config { + std::string rmm_mode; + std::string stream_mode; + std::string stream_error_mode; +}; + /// MR factory functions inline auto make_cuda() { return std::make_shared(); } @@ -157,10 +164,9 @@ inline auto 
parse_cudf_test_opts(int argc, char** argv) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_memory_resource_adaptor(cudf::test::config const& config) { - auto const rmm_mode = cmd_opts["rmm_mode"].as(); - auto resource = cudf::test::create_memory_resource(rmm_mode); + auto resource = cudf::test::create_memory_resource(config.rmm_mode); cudf::set_current_device_resource(resource.get()); return resource; } @@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_stream_mode_adaptor(cudf::test::config const& config) { auto resource = cudf::get_current_device_resource_ref(); - auto const stream_mode = cmd_opts["stream_mode"].as(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto const error_on_invalid_stream = (config.stream_error_mode == "error"); + auto const check_default_stream = (config.stream_mode == "new_cudf_default"); auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) { cudf::set_current_device_resource(&adaptor); } return adaptor; } +/** + * @brief Should be called in every test program that uses rmm allocators since it maintains the + * lifespan of the rmm default memory resource. 
this function parses the command line to customize + * test behavior, like the allocation mode used for creating the default memory resource. + * + */ +inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {}) +{ + // static lifetime to keep rmm resource alive till tests end + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + cudf::test::config config = config_override; + if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as(); } + + if (config.stream_mode.empty()) { + config.stream_mode = cmd_opts["stream_mode"].as(); + } + + if (config.stream_error_mode.empty()) { + config.stream_error_mode = cmd_opts["stream_error_mode"].as(); + } + + [[maybe_unused]] static auto mr = make_memory_resource_adaptor(config); + [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config); +} + /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. + * maintaining the original functionality. 
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ - [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + init_cudf_test(argc, argv); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py deleted file mode 100644 index e5e57dbf562..00000000000 --- a/cpp/scripts/run-clang-tidy.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import os -import subprocess -import argparse -import json -import multiprocessing as mp -import shutil - - -EXPECTED_VERSION = "16.0.6" -VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") -GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") -SPACES = re.compile(r"\s+") -SEPARATOR = "-" * 16 - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-tidy on a project") - argparser.add_argument("-cdb", type=str, - # TODO This is a hack, needs to be fixed - default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", - help="Path to cmake-generated compilation database" - " file. 
It is always found inside the root of the " - "cmake build folder. So make sure that `cmake` has " - "been run once before running this script!") - argparser.add_argument("-exe", type=str, default="clang-tidy", - help="Path to clang-tidy exe") - argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", - help="Regex used to ignore files from checking") - argparser.add_argument("-select", type=str, default=None, - help="Regex used to select files for checking") - argparser.add_argument("-j", type=int, default=-1, - help="Number of parallel jobs to launch.") - args = argparser.parse_args() - if args.j <= 0: - args.j = mp.cpu_count() - args.ignore_compiled = re.compile(args.ignore) if args.ignore else None - args.select_compiled = re.compile(args.select) if args.select else None - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-tidy version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-tidy exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if not os.path.exists(args.cdb): - raise Exception("Compilation database '%s' missing" % args.cdb) - return args - - -def get_all_commands(cdb): - with open(cdb) as fp: - return json.load(fp) - - -def get_gpu_archs(command): - archs = [] - for loc in range(len(command)): - if command[loc] != "-gencode": - continue - arch_flag = command[loc + 1] - match = GPU_ARCH_REGEX.search(arch_flag) - if match is not None: - archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) - return archs - - -def get_index(arr, item): - try: - return arr.index(item) - except: - return -1 - - -def remove_item(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc] - return loc - - -def remove_item_plus_one(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc + 1] - del arr[loc] - return loc - - 
-def get_clang_includes(exe): - dir = os.getenv("CONDA_PREFIX") - if dir is None: - ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) - ret = ret.decode("utf-8") - dir = os.path.dirname(os.path.dirname(ret)) - header = os.path.join(dir, "include", "ClangHeaders") - return ["-I", header] - - -def get_tidy_args(cmd, exe): - command, file = cmd["command"], cmd["file"] - is_cuda = file.endswith(".cu") - command = re.split(SPACES, command) - # compiler is always clang++! - command[0] = "clang++" - # remove compilation and output targets from the original command - remove_item_plus_one(command, "-c") - remove_item_plus_one(command, "-o") - if is_cuda: - # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." - archs = get_gpu_archs(command) - command.extend(archs) - while True: - loc = remove_item_plus_one(command, "-gencode") - if loc < 0: - break - # "-x cuda" is the right usage in clang - loc = get_index(command, "-x") - if loc >= 0: - command[loc + 1] = "cuda" - remove_item_plus_one(command, "-ccbin") - remove_item(command, "--expt-extended-lambda") - remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") - command.extend(get_clang_includes(exe)) - return command, is_cuda - - -def run_clang_tidy_command(tidy_cmd): - cmd = " ".join(tidy_cmd) - result = subprocess.run(cmd, check=False, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - status = result.returncode == 0 - if status: - out = "" - else: - out = "CMD: " + cmd - out += result.stdout.decode("utf-8").rstrip() - return status, out - - -def run_clang_tidy(cmd, args): - command, is_cuda = get_tidy_args(cmd, args.exe) - tidy_cmd = [args.exe, - "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", - cmd["file"], "--", ] - tidy_cmd.extend(command) - status = True - out = "" - if is_cuda: - tidy_cmd.append("--cuda-device-only") - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - out += out1 - out += "%s" % SEPARATOR - if not ret: - 
status = ret - tidy_cmd[-2] = "--cuda-host-only" - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - else: - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - return status, out, cmd["file"] - - -# yikes! global var :( -results = [] -def collect_result(result): - global results - results.append(result) - - -def print_result(passed, stdout, file): - status_str = "PASSED" if passed else "FAILED" - print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}") - if stdout: - print(stdout) - print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}") - - -def print_results(): - global results - status = True - for passed, stdout, file in results: - print_result(passed, stdout, file) - if not passed: - status = False - return status - - -def run_tidy_for_all_files(args, all_files): - pool = None if args.j == 1 else mp.Pool(args.j) - # actual tidy checker - for cmd in all_files: - # skip files that we don't want to look at - if args.ignore_compiled is not None and \ - re.search(args.ignore_compiled, cmd["file"]) is not None: - continue - if args.select_compiled is not None and \ - re.search(args.select_compiled, cmd["file"]) is None: - continue - if pool is not None: - pool.apply_async(run_clang_tidy, args=(cmd, args), - callback=collect_result) - else: - passed, stdout, file = run_clang_tidy(cmd, args) - collect_result((passed, stdout, file)) - if pool is not None: - pool.close() - pool.join() - return print_results() - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - raise Exception("This needs to always be run from the root of repo") - # Check whether clang-tidy exists - # print(args) - if "exe" not in args and shutil.which("clang-tidy") is not None: - print("clang-tidy not found. 
Exiting...") - return - all_files = get_all_commands(args.cdb) - status = run_tidy_for_all_files(args, all_files) - if not status: - raise Exception("clang-tidy failed! Refer to the errors above.") - - -if __name__ == "__main__": - main() diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index a6b6cbbf0b5..2196ee97fee 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -138,7 +138,7 @@ struct timezone_file { std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); - CUDF_EXPECTS(fin, "Failed to open the timezone file."); + CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'"); auto const file_size = fin.tellg(); fin.seekg(0); diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..b2048a9fbb8 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, 
std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..2bf983e5e90 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f9a80a048b5..0432b9d120a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "flatten_single_pass_aggs.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "groupby_kernels.cuh" +#include "var_hash_functor.cuh" #include #include @@ -110,76 +112,6 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; @@ -330,7 +262,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor{ + var_hash_functor{ set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); @@ -347,40 +279,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final dense_results->add_result(col, agg, std::move(result)); } }; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - 
std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} /** * @brief Gather sparse results into dense using `gather_map` and add to diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 188d0cff3f1..86f4d76487f 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -16,8 +16,6 @@ #pragma once -#include "multi_pass_kernels.cuh" - #include #include #include diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 69% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh index 7043eafdc10..bb55cc9188c 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -21,17 +20,14 @@ #include #include #include -#include #include +#include #include +#include -#include - -namespace cudf { -namespace detail { - -template +namespace cudf::groupby::detail::hash { +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -47,13 +43,13 @@ struct var_hash_functor { column_device_view sum, column_device_view count, size_type ddof) - : set(set), - row_bitmask(row_bitmask), - target(target), - source(source), - sum(sum), - count(count), - ddof(ddof) + : set{set}, + row_bitmask{row_bitmask}, + target{target}, + source{source}, + sum{sum}, + count{count}, + ddof{ddof} { } @@ -64,23 +60,21 @@ struct var_hash_functor { } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -91,8 +85,9 @@ struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if 
(target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { @@ -110,6 +105,4 @@ struct var_hash_functor { } } }; - -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79fb7550044..8ec0904f1ba 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -52,6 +53,30 @@ namespace detail { namespace { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buffer` is left untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . +*/ +void enable_hugepage(ArrowBuffer* buffer) +{ + if (buffer->size_bytes < (1u << 22u)) { // Smaller than 4 MB + return; + } + +#ifdef MADV_HUGEPAGE + auto const pagesize = sysconf(_SC_PAGESIZE); + void* addr = const_cast(buffer->data); + auto length{static_cast(buffer->size_bytes)}; + if (std::align(pagesize, pagesize, addr, length)) { + // Intentionally not checking for errors that may be returned by older kernel versions; + // optimistically tries enabling huge pages. + madvise(addr, length, MADV_HUGEPAGE); + } +#endif +} + struct dispatch_to_arrow_host { cudf::column_view column; rmm::cuda_stream_view stream; @@ -62,6 +87,7 @@ struct dispatch_to_arrow_host { if (!column.has_nulls()) { return NANOARROW_OK; } NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + enable_hugepage(&bitmap->buffer); CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, (column.offset() > 0) ?
cudf::detail::copy_bitmask(column, stream, mr).data() @@ -76,6 +102,7 @@ struct dispatch_to_arrow_host { int populate_data_buffer(device_span input, ArrowBuffer* buffer) const { NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + enable_hugepage(buffer); CUDF_CUDA_TRY(cudaMemcpyAsync( buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); return NANOARROW_OK; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 76816071d8c..69a51fab5dc 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -83,8 +83,7 @@ struct tree_node { void check_input_size(std::size_t input_size) { // Transduce() writes symbol offsets that may be as large input_size-1 - CUDF_EXPECTS(input_size == 0 || - (input_size - 1) <= std::numeric_limits::max(), + CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits::max(), "Given JSON input is too large"); } } // namespace diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 99a5b17bce8..c424d2b3b62 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -351,10 +351,16 @@ table_with_metadata read_json(host_span> sources, * JSON inputs. */ std::size_t const total_source_size = sources_size(sources, 0, 0); - std::size_t chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? 
total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + + // Batching is enabled only for JSONL inputs, not regular JSON files + CUDF_EXPECTS( + reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits::max(), + "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported"); + + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5be75350951..0cb5c382631 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan counts, counts, orc_columns, rowgroup_bounds, str_col_indexes); } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_dictionary_hash_maps_kernel(device_span dictionaries) -{ - auto const dict_map = dictionaries[blockIdx.x].map_slots; - auto const t = threadIdx.x; - for (size_type i = 0; i < dict_map.size(); i += block_size) { - if (t + i < dict_map.size()) { - new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - struct equality_functor { column_device_view const& col; __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const @@ -109,6 +95,9 @@ struct hash_functor { } }; +// Probing scheme to use for the hash map +using probing_scheme_type = cuco::linear_probing; + template CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, @@ -121,26 +110,34 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto 
const& col = columns[dict.column_idx]; // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); auto const hash_fn = hash_functor{col}; auto const equality_fn = equality_functor{col}; + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. + auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::insert` operator + auto has_map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; size_type entry_count{0}; size_type char_count{0}; + // all threads should loop the same number of times for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { auto const is_valid = cur_row < end_row and col.is_valid(cur_row); if (is_valid) { // insert element at cur_row to hash map and count successful insertions - auto const is_unique = - hash_map_mutable.insert(std::pair(cur_row, cur_row), hash_fn, equality_fn); + auto const is_unique = has_map_insert_ref.insert(cuco::pair{cur_row, cur_row}); if (is_unique) { ++entry_count; @@ -175,24 +172,23 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not dict.is_enabled) { return; } auto const t = threadIdx.x; - auto map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto* slot = 
reinterpret_cast(map.begin_slot() + t + i); - auto key = slot->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - dict.data[loc] = key; - slot->second = loc; + auto window = dict.map_slots.begin() + t + i; + // Collect all slots from each window. + for (auto& slot : *window) { + auto const key = slot.first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + dict.data[loc] = key; + slot.second = loc; + } } } } @@ -205,47 +201,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) { auto const col_idx = blockIdx.x; auto const stripe_idx = blockIdx.y; + auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; if (not dict.is_enabled) { return; } - auto const t = threadIdx.x; + // Make a view of the hash map + auto const hash_fn = hash_functor{col}; + auto const equality_fn = equality_functor{col}; + + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::find` operator + auto has_map_find_ref = hash_map_ref.rebind_operators(cuco::find); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; - auto const map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - thread_index_type cur_row = start_row + t; - while (cur_row < end_row) { + for (thread_index_type cur_row = start_row + t; cur_row < end_row; cur_row += block_size) { if (col.is_valid(cur_row)) { - auto const hash_fn = hash_functor{col}; - auto const equality_fn = equality_functor{col}; - auto const found_slot = map.find(cur_row, hash_fn, equality_fn); - cudf_assert(found_slot != map.end() && + auto const found_slot = has_map_find_ref.find(cur_row); + // Fail if we didn't find the previously inserted key.
+ cudf_assert(found_slot != has_map_find_ref.end() && "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto const val_ptr = reinterpret_cast(&found_slot->second); - dict.index[cur_row] = *val_ptr; - } + dict.index[cur_row] = found_slot->second; } - cur_row += block_size; } } -void initialize_dictionary_hash_maps(device_2dspan dictionaries, - rmm::cuda_stream_view stream) -{ - if (dictionaries.count() == 0) { return; } - constexpr int block_size = 1024; - initialize_dictionary_hash_maps_kernel - <<>>(dictionaries.flat_view()); -} - void populate_dictionary_hash_maps(device_2dspan dictionaries, device_span columns, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 8c7ccf0527f..0949fafe9a4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -21,6 +21,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc.hpp" +#include #include #include #include @@ -40,19 +41,27 @@ namespace gpu { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; +using key_type = size_type; +using mapped_type = size_type; +using slot_type = cuco::pair; +auto constexpr map_cg_size = + 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. + ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. +auto constexpr window_size = + 1; ///< Number of concurrent slots (set for best performance) handled by each thread. +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size + ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. 
+using storage_type = cuco::aow_storage, + cudf::detail::cuco_allocator>; +using storage_ref_type = typename storage_type::ref_type; +using window_type = typename storage_type::window_type; +using slot_type = cuco::pair; + auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::legacy::static_map; - -/** - * @brief The alias of `map_type::pair_atomic_type` class. - * - * Declare this struct by trivial subclassing instead of type aliasing so we can have forward - * declaration of this struct somewhere else. - */ -struct slot_type : public map_type::slot_type {}; - struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) @@ -184,11 +193,11 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span map_slots; // hash map storage - uint32_t column_idx = 0; // column index - size_type start_row = 0; // first row in the stripe - size_type start_rowgroup = 0; // first rowgroup in the stripe - size_type num_rows = 0; // number of rows in the stripe + device_span map_slots; // hash map (windows) storage + uint32_t column_idx = 0; // column index + size_type start_row = 0; // first row in the stripe + size_type start_rowgroup = 0; // first rowgroup in the stripe + size_type num_rows = 0; // number of rows in the stripe // output device_span data; // index of elements in the column to include in the dictionary diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 01ee5ad177d..ecf319e75ab 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const [read_begin, read_end] = merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range); + bool stream_synchronized{false}; + for (auto read_idx = read_begin; read_idx 
< read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; @@ -507,6 +509,13 @@ void reader_impl::load_next_stripe_data(read_mode mode) lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { + // `device_read_async` may not use _stream at all. + // Instead, it may use some other stream(s) to sync the H->D memcpy. + // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first. + if (!stream_synchronized) { + _stream.synchronize(); + stream_synchronized = true; + } device_read_tasks.push_back( std::pair(source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 60a64fb0ee6..b09062f700e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -20,6 +20,7 @@ */ #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" @@ -2110,7 +2111,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, bool sort_dictionaries, rmm::cuda_stream_view stream) { - std::vector>> hash_maps_storage( + // Variable to keep track of the current total map storage size + size_t total_map_storage_size = 0; + std::vector> hash_maps_storage_offsets( orc_table.string_column_indices.size()); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2119,14 +2122,21 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, stripe.size == 0 ? 
0 : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; - hash_maps_storage[str_column.str_index()].emplace_back(stripe_num_rows * 1.43, stream); + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); + total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; } + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; + // Create a single bulk storage to use for all sub-dictionaries + auto map_storage = std::make_unique( + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); + // Initialize stripe dictionaries for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2137,7 +2147,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, auto const stripe_idx = stripe.id; auto& sd = stripe_dicts[str_col_idx][stripe_idx]; - sd.map_slots = hash_maps_storage[str_col_idx][stripe_idx]; + sd.map_slots = {map_storage->data() + hash_maps_storage_offsets[str_col_idx][stripe_idx], + hash_maps_storage_offsets[str_col_idx][stripe_idx + 1] - + hash_maps_storage_offsets[str_col_idx][stripe_idx]}; sd.column_idx = col_idx; sd.start_row = segmentation.rowgroups[stripe.first][col_idx].begin; sd.start_rowgroup = stripe.first; @@ -2150,7 +2162,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - gpu::initialize_dictionary_hash_maps(stripe_dicts, stream); + map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host 
stripe_dicts.device_to_host_sync(stream); @@ -2184,8 +2196,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, col_use_dictionary = true; } else { // Clear hash map storage as dictionary encoding is not used for this stripe - hash_maps_storage[str_col_idx][stripe_idx] = rmm::device_uvector(0, stream); - sd.map_slots = {}; + sd.map_slots = {}; } } // If any stripe uses dictionary encoding, allocate index storage for the whole column @@ -2203,7 +2214,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point - hash_maps_storage.clear(); + map_storage.reset(); // Clear map slots and attach order buffers auto dictionaries_flat = stripe_dicts.host_view().flat_view(); diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 1a2a9eac17d..b85ebf2fa1a 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -194,17 +194,12 @@ struct map_find_fn { val_idx += block_size) { // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { + auto const found_slot = map_find_ref.find(val_idx); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); // No need for atomic as this is not going to be modified by any other thread. - chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { - auto const found_slot = map_find_ref.find(val_idx); - - // Fail if we didn't find the previously inserted key. - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - - // Return the found value. 
- return found_slot->second; - }(); + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_slot->second; } } } else { diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 775a2580f60..b923a301f84 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -710,9 +710,7 @@ class regex_parser { std::stack lbra_stack; auto repeat_start_index = -1; - for (std::size_t index = 0; index < in.size(); index++) { - auto const item = in[index]; - + for (auto const item : in) { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { @@ -739,7 +737,7 @@ class regex_parser { auto const m = item.d.count.m; // maximum count assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0 && m == 0) { out.erase(begin, end); } + if (n == 0) { out.erase(begin, end); } std::vector repeat_copy(begin, end); // special handling for quantified capture groups diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4596ec65ce7..799a84cbc37 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} - PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" + PRIVATE cudf::cudftestutil + cudf::cudftestutil_impl + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + nvtx3::nvtx3-cpp + $ + "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -76,6 +83,7 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() + enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # 
################################################################################################## diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a961f73d955..8be7e087b6d 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -256,7 +256,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - std::memset(out.buffer.data, 0, out.buffer.size_bytes); + // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant + std::memset(out.buffer.data, 0, out.buffer.size_bytes); // NOLINT for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 84f04f67038..380d66c53f9 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ * limitations under the License. 
*/ +#include #include -#include - namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 249319da7f7..7b61be113f9 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -123,12 +123,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - // hardcoding the CUDA memory resource to keep from exceeding the pool - auto mr = cudf::test::make_cuda(); - cudf::set_current_device_resource(mr.get()); - auto adaptor = make_stream_mode_adaptor(cmd_opts); - + cudf::test::config config; + config.rmm_mode = "cuda"; + init_cudf_test(argc, argv, config); // create object to automatically be destroyed at the end of main() auto lsd = cudf::test::StringsLargeTest::get_ls_data(); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 216ddfce5f1..cceec1d3537 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,40 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "abc", "XYAZ", "ABC", "ZYXA"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,}"); // should match everything + auto prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 4, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); +
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 3, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, NestedQuantifier) { auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 9847d8d6bb5..abc12b00a81 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -200,6 +200,34 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsReplaceRegexTest, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "123", "XYAZ", "abc", "zéyab"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,5}"); + auto prog = cudf::strings::regex_program::create(pattern); + auto repl = cudf::string_scalar("_"); + auto expected = cudf::test::strings_column_wrapper( + {"_a_", "_", "_1_2_3_", "_X_Y__Z_", "_a_b_c_", "_z_é_y_a_b_"}); + auto results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("[a0-9]{0,2}"); + prog = cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"__", "_", "___", "_X_Y_A_Z_", "__b_c_", "_z_é_y__b_"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("(?:ab){0,3}"); + prog = 
cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"_a_", "_", "_1_2_3_", "_X_Y_A_Z_", "__c_", "_z_é_y__"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceRegexTest, Multiline) { auto const multiline = cudf::strings::regex_flags::MULTILINE; diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index 354c0b1b57e..8e4906408de 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #include +#include #include -#include - namespace cudf::test::detail { void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs) { diff --git a/dependencies.yaml b/dependencies.yaml index ca17917c905..ff97b67f0ce 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,10 +6,18 @@ files: cuda: ["11.8", "12.5"] arch: [x86_64] includes: + # Note that clang-tidy is not included here because cudf's preferred + # version conflicts with the rest of RAPIDS as well as its own + # clang-format version. Until we update our clang-format version we will + # not be able to install both into the same environment. Moreover, using + # this version will break compatibility with other RAPIDS libraries that + # are still using 16.0.6, and as such would break any + # unified environment like that used in unified devcontainers. 
- build_base - build_all - build_cpp - build_python_common + - clang_format - cuda - cuda_version - depends_on_cupy @@ -86,6 +94,16 @@ files: includes: - develop - py_version + clang_tidy: + output: none + includes: + - build_all + - build_base + - clang_tidy + - cuda + - cuda_version + - develop + - py_version docs: output: none includes: @@ -553,11 +571,21 @@ dependencies: # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was # added in 2.5.20, so we need to call out the minimum version needed for our plugins - identify>=2.5.20 + - output_types: conda + packages: + - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_format: + common: - output_types: conda packages: - clang==16.0.6 - clang-tools=16.0.6 - - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_tidy: + common: + - output_types: conda + packages: + - clang==19.1.0 + - clang-tools==19.1.0 docs: common: - output_types: [conda] diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 6300f77d686..58303356336 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -7,3 +7,5 @@ nvtext edit_distance generate_ngrams jaccard + minhash + ngrams_tokenize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. 
automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst new file mode 100644 index 00000000000..ce6db76f889 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst @@ -0,0 +1,6 @@ +=============== +ngrams_tokenize +=============== + +.. automodule:: pylibcudf.nvtext.ngrams_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst index fa05cb7d786..3d07c1271b4 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -9,6 +9,7 @@ convert convert_durations convert_fixed_point convert_floats + convert_integers convert_ipv4 convert_lists convert_urls diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..5e39cafa47b 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,44 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport 
( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column 
seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index dec4f037d98..6521116eafe 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -2,48 +2,22 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def ngrams_tokenize( - Column strings, + Column input, int ngrams, object py_delimiter, object py_separator ): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.ngrams_tokenize.ngrams_tokenize( 
+ input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 93b67bd4c9d..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,28 +2,10 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type @@ -35,10 +17,10 @@ def floating_to_string(Column input_col): return Column.from_pylibcudf(plc_column) -def string_to_floating(Column input_col, object out_type): +def string_to_floating(Column input_col, DataType out_type): plc_column = plc.strings.convert.convert_floats.to_floats( input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type) + out_type ) return Column.from_pylibcudf(plc_column) @@ -72,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -104,36 +86,22 @@ def stof(Column input_col): 
A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -165,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -197,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -229,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -261,7 
+229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -293,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -325,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -357,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -389,7 +357,7 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): @@ -477,8 +445,6 @@ def istimestamp(Column input_col, str format): A Column of boolean values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) plc_column = plc.strings.convert.convert_datetime.is_timestamp( input_col.to_pylibcudf(mode="read"), format @@ -582,7 +548,7 @@ def is_ipv4(Column source_strings): return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -595,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. 
""" - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -618,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -642,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 0def0ba746e..4cbd7244751 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -288,4 +288,4 @@ def poll(self, timeout=None): (default: infinite (None translated into -1 in the library)). (Seconds) """ - return self.ck.poll(timeout) + return self.ck_consumer.poll(timeout) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5f9d145139a..51b2b4cfaa3 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -52,33 +52,27 @@ cpdef Column binary_operation( if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + rhs.view(), + op, + output_type.c_obj ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - dereference(rhs.c_obj), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + dereference(rhs.c_obj), + op, + output_type.c_obj ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - dereference(lhs.c_obj), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + dereference(lhs.c_obj), + rhs.view(), + op, + output_type.c_obj ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 03808f0b664..4e5698566d0 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -138,7 +138,7 @@ cdef class Column: cdef size_type null_count = libcudf_col.get().null_count() - cdef column_contents contents = move(libcudf_col.get().release()) + cdef column_contents contents = libcudf_col.get().release() # Note that when converting to cudf 
Column objects we'll need to pull # out the base object. @@ -247,7 +247,7 @@ cdef class Column: cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + c_result = make_column_from_scalar(dereference(c_scalar), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -269,7 +269,7 @@ cdef class Column: cdef Scalar slr = Scalar.empty_like(like) cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + c_result = make_column_from_scalar(dereference(slr.get()), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -373,7 +373,7 @@ cdef class Column: """Create a copy of the column.""" cdef unique_ptr[column] c_result with nogil: - c_result = move(make_unique[column](self.view())) + c_result = make_unique[column](self.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index fef02359240..d556085ab64 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index e9085e3ea02..ac942a620b5 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -39,29 +39,17 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): if isinstance(type_or_id, TypeId): id = type_or_id with nogil: - result = move( - cpp_make_empty_column( - id - ) - ) + result = cpp_make_empty_column(id) else: raise TypeError( "Must pass a TypeId or DataType" ) elif MakeEmptyColumnOperand is DataType: with nogil: - result = move( - cpp_make_empty_column( - type_or_id.c_obj - ) - ) + result = cpp_make_empty_column(type_or_id.c_obj) elif MakeEmptyColumnOperand is type_id: with nogil: - result = move( - cpp_make_empty_column( - type_or_id - ) - ) + result = cpp_make_empty_column(type_or_id) else: raise TypeError( "Must pass a TypeId or DataType" @@ -92,12 +80,10 @@ cpdef Column make_numeric_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_numeric_column( - type_.c_obj, - size, - state - ) + result = cpp_make_numeric_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -121,12 +107,10 @@ cpdef Column make_fixed_point_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_point_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_point_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -151,12 +135,10 @@ cpdef Column make_timestamp_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_timestamp_column( - 
type_.c_obj, - size, - state - ) + result = cpp_make_timestamp_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -181,12 +163,10 @@ cpdef Column make_duration_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_duration_column( - type_.c_obj, - size, - state - ) + result = cpp_make_duration_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -211,12 +191,10 @@ cpdef Column make_fixed_width_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_width_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_width_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 8bdcc086e0f..10c860d97bb 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -40,14 +40,14 @@ cpdef concatenate(list objects): c_tables.push_back((tbl).view()) with nogil: - c_tbl_result = move(cpp_concatenate.concatenate(c_tables)) + c_tbl_result = cpp_concatenate.concatenate(c_tables) return Table.from_libcudf(move(c_tbl_result)) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: - c_col_result = move(cpp_concatenate.concatenate(c_columns)) + c_col_result = cpp_concatenate.concatenate(c_columns) return Column.from_libcudf(move(c_col_result)) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 9743119d92a..4938f1a3dda 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -67,13 +67,12 @@ cpdef Table gather( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_copying.gather( - source_table.view(), - 
gather_map.view(), - bounds_policy - ) + c_result = cpp_copying.gather( + source_table.view(), + gather_map.view(), + bounds_policy ) + return Table.from_libcudf(move(c_result)) @@ -121,22 +120,18 @@ cpdef Table scatter( cdef vector[reference_wrapper[const scalar]] source_scalars if TableOrListOfScalars is Table: with nogil: - c_result = move( - cpp_copying.scatter( - source.view(), - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source.view(), + scatter_map.view(), + target_table.view(), ) else: source_scalars = _as_vector(source) with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source_scalars, + scatter_map.view(), + target_table.view(), ) return Table.from_libcudf(move(c_result)) @@ -160,11 +155,11 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cdef unique_ptr[column] c_col_result if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.empty_like(input.view())) + c_col_result = cpp_copying.empty_like(input.view()) return Column.from_libcudf(move(c_col_result)) else: with nogil: - c_tbl_result = move(cpp_copying.empty_like(input.view())) + c_tbl_result = cpp_copying.empty_like(input.view()) return Table.from_libcudf(move(c_tbl_result)) @@ -195,13 +190,11 @@ cpdef Column allocate_like( cdef size_type c_size = size if size is not None else input_column.size() with nogil: - c_result = move( - cpp_copying.allocate_like( + c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, ) - ) return Column.from_libcudf(move(c_result)) @@ -298,12 +291,12 @@ cpdef Column copy_range( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_copying.copy_range( + c_result = cpp_copying.copy_range( input_column.view(), target_column.view(), input_begin, input_end, - target_begin) + target_begin ) return Column.from_libcudf(move(c_result)) @@ -337,13 +330,11 @@ cpdef Column shift(Column 
input, size_type offset, Scalar fill_value): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_copying.shift( + c_result = cpp_copying.shift( input.view(), offset, dereference(fill_value.c_obj) ) - ) return Column.from_libcudf(move(c_result)) @@ -378,7 +369,7 @@ cpdef list slice(ColumnOrTable input, list indices): cdef int i if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = cpp_copying.slice(input.view(), c_indices) return [ Column.from_column_view(c_col_result[i], input) @@ -386,7 +377,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = cpp_copying.slice(input.view(), c_indices) return [ Table.from_table_view(c_tbl_result[i], input) @@ -418,7 +409,7 @@ cpdef list split(ColumnOrTable input, list splits): if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = cpp_copying.split(input.view(), c_splits) return [ Column.from_column_view(c_col_result[i], input) @@ -426,7 +417,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = cpp_copying.split(input.view(), c_splits) return [ Table.from_table_view(c_tbl_result[i], input) @@ -472,29 +463,25 @@ cpdef Column copy_if_else( if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else(lhs.view(), rhs.view(), boolean_mask.view()) + result = cpp_copying.copy_if_else( + lhs.view(), + rhs.view(), + boolean_mask.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: - result = move( - cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + lhs.view(), dereference(rhs.c_obj), 
boolean_mask.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), rhs.view(), boolean_mask.view() ) else: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() ) return Column.from_libcudf(move(result)) @@ -541,22 +528,18 @@ cpdef Table boolean_mask_scatter( if TableOrListOfScalars is Table: with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - input.view(), - target.view(), - boolean_mask.view() - ) + result = cpp_copying.boolean_mask_scatter( + input.view(), + target.view(), + boolean_mask.view() ) else: source_scalars = _as_vector(input) with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - source_scalars, - target.view(), - boolean_mask.view(), - ) + result = cpp_copying.boolean_mask_scatter( + source_scalars, + target.view(), + boolean_mask.view(), ) return Table.from_libcudf(move(result)) @@ -586,8 +569,6 @@ cpdef Scalar get_element(Column input_column, size_type index): """ cdef unique_ptr[scalar] c_output with nogil: - c_output = move( - cpp_copying.get_element(input_column.view(), index) - ) + c_output = cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 784d29128bf..ac4335cca56 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -33,7 +33,7 @@ cpdef Column extract_year( cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_year(values.view())) + result = cpp_extract_year(values.view()) return Column.from_libcudf(move(result)) cpdef 
Column extract_datetime_component( @@ -60,5 +60,5 @@ cpdef Column extract_datetime_component( cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_datetime_component(values.view(), component)) + result = cpp_extract_datetime_component(values.view(), component) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 61b430e64aa..0372e1132cc 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -48,13 +48,11 @@ cpdef Column fill( cdef unique_ptr[column] result with nogil: - result = move( - cpp_fill( - destination.view(), - begin, - end, - dereference(( value).c_obj) - ) + result = cpp_fill( + destination.view(), + begin, + end, + dereference(( value).c_obj) ) return Column.from_libcudf(move(result)) @@ -112,12 +110,10 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): cdef unique_ptr[column] result cdef size_type c_size = size with nogil: - result = move( - cpp_sequence( - c_size, - dereference(init.c_obj), - dereference(step.c_obj), - ) + result = cpp_sequence( + c_size, + dereference(init.c_obj), + dereference(step.c_obj), ) return Column.from_libcudf(move(result)) @@ -152,18 +148,14 @@ cpdef Table repeat( if ColumnOrSize is Column: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count.view() - ) + result = cpp_repeat( + input_table.view(), + count.view() ) if ColumnOrSize is size_type: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count - ) + result = cpp_repeat( + input_table.view(), + count ) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index afb95dba5b3..71f9ecb0453 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -176,7 +176,7 @@ cdef class GroupBy: # We rely on libcudf to tell us this rather than checking the types 
beforehand # ourselves. with nogil: - c_res = move(dereference(self.c_obj).aggregate(c_requests)) + c_res = dereference(self.c_obj).aggregate(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple scan(self, list requests): @@ -205,7 +205,7 @@ cdef class GroupBy: cdef pair[unique_ptr[table], vector[aggregation_result]] c_res with nogil: - c_res = move(dereference(self.c_obj).scan(c_requests)) + c_res = dereference(self.c_obj).scan(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple shift(self, Table values, list offset, list fill_values): @@ -234,10 +234,11 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res with nogil: - c_res = move( - dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + c_res = dereference(self.c_obj).shift( + values.view(), + c_offset, + c_fill_values ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), @@ -264,10 +265,10 @@ cdef class GroupBy: cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies with nogil: - c_res = move( - dereference(self.c_obj).replace_nulls(value.view(), c_replace_policies) + c_res = dereference(self.c_obj).replace_nulls( + value.view(), + c_replace_policies ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 1a03fa5b45b..642516a1b90 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -131,7 +131,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[table] c_result with nogil: # The libcudf function here will release the stream. 
- c_result = move(cpp_from_arrow_stream(c_stream)) + c_result = cpp_from_arrow_stream(c_stream) return Table.from_libcudf(move(c_result)) @@ -166,7 +166,7 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_from_arrow_column(c_schema, c_array)) + c_result = cpp_from_arrow_column(c_schema, c_array) # The capsule destructors should release automatically for us, but we # choose to do it explicitly here for clarity. diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 438b0ff1634..fe765b34f82 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -45,7 +45,7 @@ cpdef TableWithMetadata read_avro( for col in columns: c_columns.push_back(str(col).encode()) - cdef avro_reader_options avro_opts = move( + cdef avro_reader_options avro_opts = ( avro_reader_options.builder(source_info.c_obj) .columns(c_columns) .skip_rows(skip_rows) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index b53d6771cd6..2c61cc42d82 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -168,7 +168,7 @@ def read_csv( cdef vector[data_type] c_dtypes_list cdef map[string, data_type] c_dtypes_map - cdef csv_reader_options options = move( + cdef csv_reader_options options = ( csv_reader_options.builder(source_info.c_obj) .compression(compression) .mangle_dupe_cols(mangle_dupe_cols) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 29e49083bc6..65f78f830f1 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -59,7 +59,7 @@ cdef json_reader_options _setup_json_reader_options( json_recovery_mode_t recovery_mode): cdef vector[data_type] types_vec - cdef json_reader_options opts = move( + cdef json_reader_options opts = ( 
json_reader_options.builder(source_info.c_obj) .compression(compression) .lines(lines) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 01a5e4b04a1..70e0a7995a2 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -252,7 +252,7 @@ cpdef TableWithMetadata read_orc( """ cdef orc_reader_options opts cdef vector[vector[size_type]] c_stripes - opts = move( + opts = ( orc_reader_options.builder(source_info.c_obj) .use_index(use_index) .build() diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index e02239d7252..f120b65fb2c 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -33,11 +33,9 @@ cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): cdef string c_tzname = timezone_name.encode() with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) + c_result = cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index b019ed8f099..bc72647ea8e 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -212,5 +212,5 @@ cpdef Table cross_join(Table left, Table right): """ cdef unique_ptr[table] result with nogil: - result = move(cpp_join.cross_join(left.view(), right.view())) + result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index 4a8d11068f9..ebb82f80408 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -143,12 +143,10 @@ cpdef Column get_json_object( cdef cpp_json.get_json_object_options c_options = options.options with nogil: - 
c_result = move( - cpp_json.get_json_object( - col.view(), - dereference(c_json_path), - c_options - ) + c_result = cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b3f6a92d85c..226a9e14172 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -54,14 +54,12 @@ cpdef Column label_bins( ) with nogil: - c_result = move( - cpp_labeling.label_bins( - input.view(), - left_edges.view(), - c_left_inclusive, - right_edges.view(), - c_right_inclusive, - ) + c_result = cpp_labeling.label_bins( + input.view(), + left_edges.view(), + c_left_inclusive, + right_edges.view(), + c_right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index a09b6c01392..def292148c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span from rmm.librmm.device_buffer cimport device_buffer diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar 
constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,13 +1,21 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..69d566b8c49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 6f82124d06e..ecaf62d6895 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -69,7 +69,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cdef unique_ptr[table] c_result with 
nogil: - c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) + c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) return Table.from_libcudf(move(c_result)) @@ -92,7 +92,7 @@ cpdef Column concatenate_rows(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_rows(input.view())) + c_result = cpp_concatenate_rows(input.view()) return Column.from_libcudf(move(c_result)) @@ -123,10 +123,7 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_list_elements( - input.view(), - null_policy, - )) + c_result = cpp_concatenate_list_elements(input.view(), null_policy) return Column.from_libcudf(move(c_result)) @@ -161,12 +158,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move(cpp_contains.contains( + c_result = cpp_contains.contains( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - )) + ) return Column.from_libcudf(move(c_result)) @@ -190,7 +187,7 @@ cpdef Column contains_nulls(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_contains.contains_nulls(list_view.view())) + c_result = cpp_contains.contains_nulls(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -229,13 +226,13 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o ) with nogil: - c_result = move(cpp_contains.index_of( + c_result = cpp_contains.index_of( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), find_option, - )) + ) return Column.from_libcudf(move(c_result)) @@ -258,9 +255,7 @@ cpdef Column reverse(Column input): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = 
move(cpp_reverse.reverse( - list_view.view(), - )) + c_result = cpp_reverse.reverse(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -288,10 +283,10 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cdef ListColumnView list_view2 = gather_map_list.list_view() with nogil: - c_result = move(cpp_gather.segmented_gather( + c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -316,10 +311,10 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_extract_list_element( + c_result = cpp_extract_list_element( list_view.view(), index.view() if ColumnOrSizeType is Column else index, - )) + ) return Column.from_libcudf(move(c_result)) @@ -344,7 +339,7 @@ cpdef Column count_elements(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.view())) + c_result = cpp_count_elements(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -373,17 +368,14 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): if steps is not None: with nogil: - c_result = move(cpp_filling.sequences( + c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), - )) + ) else: with nogil: - c_result = move(cpp_filling.sequences( - starts.view(), - sizes.view(), - )) + c_result = cpp_filling.sequences(starts.view(), sizes.view()) return Column.from_libcudf(move(c_result)) cpdef Column sort_lists( @@ -423,17 +415,17 @@ cpdef Column sort_lists( with nogil: if stable: - c_result = move(cpp_stable_sort_lists( + c_result = cpp_stable_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) else: - c_result = move(cpp_sort_lists( + c_result = cpp_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) return Column.from_libcudf(move(c_result)) @@ -477,12 +469,12 @@ cpdef Column 
difference_distinct( ) with nogil: - c_result = move(cpp_set_operations.difference_distinct( + c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -525,12 +517,12 @@ cpdef Column have_overlap( ) with nogil: - c_result = move(cpp_set_operations.have_overlap( + c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -573,12 +565,12 @@ cpdef Column intersect_distinct( ) with nogil: - c_result = move(cpp_set_operations.intersect_distinct( + c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -622,12 +614,12 @@ cpdef Column union_distinct( ) with nogil: - c_result = move(cpp_set_operations.union_distinct( + c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -652,10 +644,10 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: - c_result = move(cpp_apply_boolean_mask( + c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -690,9 +682,9 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ) with nogil: - c_result = move(cpp_distinct( + c_result = cpp_distinct( list_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 6d707b67449..61a21aafdb2 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -47,12 +47,10 @@ cpdef Table 
merge ( cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_merge.merge( - c_tables_to_merge, - c_key_cols, - c_column_order, - c_null_precedence, - ) + c_result = cpp_merge.merge( + c_tables_to_merge, + c_key_cols, + c_column_order, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index aae39987dac..74180951562 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -38,7 +38,7 @@ cpdef DeviceBuffer copy_bitmask(Column col): cdef device_buffer db with nogil: - db = move(cpp_null_mask.copy_bitmask(col.view())) + db = cpp_null_mask.copy_bitmask(col.view()) return buffer_to_python(move(db)) @@ -90,7 +90,7 @@ cpdef DeviceBuffer create_null_mask( cdef device_buffer db with nogil: - db = move(cpp_null_mask.create_null_mask(size, state)) + db = cpp_null_mask.create_null_mask(size, state) return buffer_to_python(move(db)) @@ -114,7 +114,7 @@ cpdef tuple bitmask_and(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + c_result = cpp_null_mask.bitmask_and(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second @@ -138,6 +138,6 @@ cpdef tuple bitmask_or(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 9913e1fbadb..94df9bbbebb 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5f1762b1e3d..b6659827688 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,9 +1,17 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams, jaccard +from . cimport ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, +) __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash", + "ngrams_tokenize" ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 1c0ddb1e5a4..f74633a3521 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard +from . 
import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash", + "ngrams_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index fc98ccbc50c..dcacb2e1267 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -33,7 +33,7 @@ cpdef Column edit_distance(Column input, Column targets): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) + c_result = cpp_edit_distance(c_strings, c_targets) return Column.from_libcudf(move(c_result)) @@ -58,6 +58,6 @@ cpdef Column edit_distance_matrix(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) + c_result = cpp_edit_distance_matrix(c_strings) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 8c7a8edc01d..09859d09e9e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -40,12 +40,10 @@ cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - ngrams, - c_separator[0] - ) + c_result = cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] ) return Column.from_libcudf(move(c_result)) @@ -72,11 +70,9 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - ngrams, - ) + c_result = cpp_generate_character_ngrams( + c_strings, + ngrams, ) return Column.from_libcudf(move(c_result)) @@ -102,10 +98,8 @@ cpdef Column hash_character_ngrams(Column 
input, size_type ngrams = 2): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - ngrams, - ) + c_result = cpp_hash_character_ngrams( + c_strings, + ngrams, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx index 9334d7ce751..3d8669865d9 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -36,12 +36,10 @@ cpdef Column jaccard_index(Column input1, Column input2, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - width - ) + c_result = cpp_jaccard_index( + c_input1, + c_input2, + width ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..97e8c9dc83c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..f1e012e60e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. 
+ width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64-bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. 
+ + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash64( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..4f791ba1ee9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..8a1854c5f0d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +): + """ + Returns a single column of strings by tokenizing the input strings column + and then producing ngrams of each string. 
+ + For details, see :cpp:func:`ngrams_tokenize` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + delimiter : Scalar + UTF-8 characters used to separate each string into tokens. + An empty string will separate tokens using whitespace. + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_ngrams_tokenize( + input.view(), + ngrams, + dereference(delimiter.get()), + dereference(separator.get()), + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 8fa70daab5a..3cff4843735 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -41,10 +41,10 @@ cpdef tuple[Table, list] hash_partition( cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.hash_partition( - input.view(), c_columns_to_hash, c_num_partitions - ) + c_result = cpp_partitioning.hash_partition( + input.view(), + c_columns_to_hash, + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -74,8 +74,10 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + c_result = cpp_partitioning.partition( + t.view(), + partition_map.view(), + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -111,10 +113,8 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_start_partition = start_partition with nogil: - c_result = move( - cpp_partitioning.round_robin_partition( - input.view(), c_num_partitions, c_start_partition - ) + c_result = 
cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 3a771fbe7ef..7d92b598bd0 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -66,14 +66,12 @@ cpdef Column quantile( ordered_indices_view = ordered_indices.view() with nogil: - c_result = move( - cpp_quantile( - input.view(), - q, - interp, - ordered_indices_view, - exact, - ) + c_result = cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, ) return Column.from_libcudf(move(c_result)) @@ -141,15 +139,13 @@ cpdef Table quantiles( null_precedence_vec = null_precedence with nogil: - c_result = move( - cpp_quantiles( - input.view(), - q, - interp, - is_input_sorted, - column_order_vec, - null_precedence_vec, - ) + c_result = cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index b0212a5b9c1..d9ec3a9bdc4 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -39,12 +39,10 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() with nogil: - result = move( - cpp_reduce.cpp_reduce( - col.view(), - dereference(c_agg), - data_type.c_obj - ) + result = cpp_reduce.cpp_reduce( + col.view(), + dereference(c_agg), + data_type.c_obj ) return Scalar.from_libcudf(move(result)) @@ -71,12 +69,10 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): cdef unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() with nogil: - result = move( - 
cpp_reduce.cpp_scan( - col.view(), - dereference(c_agg), - inclusive, - ) + result = cpp_reduce.cpp_scan( + col.view(), + dereference(c_agg), + inclusive, ) return Column.from_libcudf(move(result)) @@ -99,7 +95,7 @@ cpdef tuple minmax(Column col): """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result with nogil: - result = move(cpp_reduce.cpp_minmax(col.view())) + result = cpp_reduce.cpp_minmax(col.view()) return ( Scalar.from_libcudf(move(result.first)), diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 115dee132fd..f77eba7ace5 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -56,28 +56,23 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), policy) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), policy) return Column.from_libcudf(move(c_result)) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") with nogil: if ReplacementType is Column: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement.view()) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement.view() ) elif ReplacementType is Scalar: - c_result = move( - cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) - ) + c_result = cpp_replace.replace_nulls( + source_column.view(), dereference(replacement.c_obj) ) elif ReplacementType is replace_policy: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), replacement) else: assert False, "Internal error. 
Please contact pylibcudf developers" return Column.from_libcudf(move(c_result)) @@ -109,12 +104,10 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_replace.find_and_replace_all( - source_column.view(), - values_to_replace.view(), - replacement_values.view(), - ) + c_result = cpp_replace.find_and_replace_all( + source_column.view(), + values_to_replace.view(), + replacement_values.view(), ) return Column.from_libcudf(move(c_result)) @@ -156,22 +149,18 @@ cpdef Column clamp( cdef unique_ptr[column] c_result with nogil: if lo_replace is None: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), ) else: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - dereference(lo_replace.c_obj), - dereference(hi_replace.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), + dereference(lo_replace.c_obj), + dereference(hi_replace.c_obj), ) return Column.from_libcudf(move(c_result)) @@ -199,9 +188,7 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if inplace: cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) else: - c_result = move( - cpp_replace.normalize_nans_and_zeros(source_column.view()) - ) + c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) if not inplace: return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index eb1499ebbea..6540b5198ab 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -38,7 +38,7 @@ cpdef Column interleave_columns(Table source_table): cdef unique_ptr[column] c_result with nogil: - c_result = 
move(cpp_interleave_columns(source_table.view())) + c_result = cpp_interleave_columns(source_table.view()) return Column.from_libcudf(move(c_result)) @@ -63,6 +63,6 @@ cpdef Table tile(Table source_table, size_type count): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_tile(source_table.view(), count)) + c_result = cpp_tile(source_table.view(), count) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index a46540d7ffa..4fd0b005431 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -49,24 +49,21 @@ cpdef Column rolling_window( cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() if WindowType is Column: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window.view(), - following_window.view(), - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window.view(), + following_window.view(), + min_periods, + dereference(c_agg), ) else: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window, - following_window, - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window, + following_window, + min_periods, + dereference(c_agg), ) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index dc60d53b07e..689363e652d 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -39,12 +39,10 @@ cpdef Column round( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_round( - source.view(), - decimal_places, - round_method - ) + c_result = cpp_round( + source.view(), + decimal_places, + round_method ) return Column.from_libcudf(move(c_result)) diff --git 
a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 814bc6553d8..1a870248046 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -41,13 +41,11 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.lower_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -82,13 +80,11 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.upper_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -112,10 +108,8 @@ cpdef Column contains(Column haystack, Column needles): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_search.contains( - haystack.view(), - needles.view(), - ) + c_result = cpp_search.contains( + haystack.view(), + needles.view(), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 42289d54bca..fc40f03e1fd 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -36,12 +36,10 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sorted_order( + 
source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -74,12 +72,10 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -118,15 +114,13 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_sorting.rank( - input_view.view(), - method, - column_order, - null_handling, - null_precedence, - percentage, - ) + c_result = cpp_sorting.rank( + input_view.view(), + method, + column_order, + null_handling, + null_precedence, + percentage, ) return Column.from_libcudf(move(c_result)) @@ -154,12 +148,10 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.is_sorted( - tbl.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.is_sorted( + tbl.view(), + c_orders, + c_null_precedence, ) return c_result @@ -197,14 +189,12 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -243,14 +233,12 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = 
null_precedence with nogil: - c_result = move( - cpp_sorting.stable_segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -285,13 +273,11 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -326,13 +312,11 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -360,12 +344,10 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -393,11 +375,9 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort( - source_table.view(), - c_orders, - 
c_null_precedence, - ) + c_result = cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index d5475ea79d5..2145398a191 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -44,10 +44,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -74,10 +72,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -101,10 +97,8 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() - ) + c_result = cpp_stream_compaction.apply_boolean_mask( + source_table.view(), boolean_mask.view() ) return Table.from_libcudf(move(c_result)) @@ -144,10 +138,8 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal - ) + c_result = cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal ) return 
Table.from_libcudf(move(c_result)) @@ -185,10 +177,8 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) @@ -221,10 +211,8 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct_indices( + input.view(), keep, nulls_equal, nans_equal ) return Column.from_libcudf(move(c_result)) @@ -262,10 +250,8 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 36bee7bd1d9..8e46a32835d 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -25,7 +25,7 @@ cpdef Column count_characters(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_characters(source_strings.view())) + c_result = cpp_attributes.count_characters(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -48,7 +48,7 @@ cpdef Column count_bytes(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_bytes(source_strings.view())) + c_result = cpp_attributes.count_bytes(source_strings.view()) return 
Column.from_libcudf(move(c_result)) @@ -71,6 +71,6 @@ cpdef Column code_points(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.code_points(source_strings.view())) + c_result = cpp_attributes.code_points(source_strings.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 6a24d79bc4b..cb04efe5e8f 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -38,12 +38,10 @@ cpdef Column all_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.all_characters_of_type( - source_strings.view(), - types, - verify_types, - ) + c_result = cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, ) return Column.from_libcudf(move(c_result)) @@ -81,13 +79,11 @@ cpdef Column filter_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.filter_characters_of_type( - source_strings.view(), - types_to_remove, - dereference(c_replacement), - types_to_keep, - ) + c_result = cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 82bd1fbea32..d4b1130241d 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -38,10 +38,10 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.contains_re( + result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -71,10 +71,10 @@ cpdef Column count_re( cdef unique_ptr[column] result with nogil: 
- result = move(cpp_contains.count_re( + result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -105,10 +105,10 @@ cpdef Column matches_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.matches_re( + result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -149,19 +149,19 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= if ColumnOrScalar is Column: with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), pattern.view(), dereference(c_escape_character) - )) + ) elif ColumnOrScalar is Scalar: c_pattern = (pattern.c_obj.get()) with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), dereference(c_pattern), dereference(c_escape_character) - )) + ) else: raise ValueError("pattern must be a Column or a Scalar") diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 846070870b1..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -14,7 +14,7 @@ set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx - convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 799532d72c6..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -5,6 +5,7 @@ from . 
cimport ( convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index deb2d8ab74b..aa27a7c8929 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -5,6 +5,7 @@ convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index 0c10f821ab6..dc12b291b11 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -39,11 +39,9 @@ cpdef Column to_booleans(Column input, Scalar true_string): ) with nogil: - c_result = move( - cpp_convert_booleans.to_booleans( - input.view(), - dereference(c_true_string) - ) + c_result = cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) ) return Column.from_libcudf(move(c_result)) @@ -80,12 +78,10 @@ cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_str ) with nogil: - c_result = move( - cpp_convert_booleans.from_booleans( - booleans.view(), - dereference(c_true_string), - dereference(c_false_string), - ) + c_result = cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 76c5809c3d5..31980ace418 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -43,12 +43,10 @@ cpdef Column 
to_durations( cdef string c_format = format.encode() with nogil: - c_result = move( - cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - c_format - ) + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format ) return Column.from_libcudf(move(c_result)) @@ -84,11 +82,9 @@ cpdef Column from_durations( cdef string c_format = format.encode() with nogil: - c_result = move( - cpp_convert_durations.from_durations( - durations.view(), - c_format - ) + c_result = cpp_convert_durations.from_durations( + durations.view(), + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 60a8fca8baf..962a47dfadf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -33,11 +33,9 @@ cpdef Column to_fixed_point(Column input, DataType output_type): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_fixed_point.to_fixed_point( - input.view(), - output_type.c_obj, - ) + c_result = cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, ) return Column.from_libcudf(move(c_result)) @@ -62,11 +60,7 @@ cpdef Column from_fixed_point(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_fixed_point.from_fixed_point( - input.view(), - ) - ) + c_result = cpp_fixed_point.from_fixed_point(input.view()) return Column.from_libcudf(move(c_result)) @@ -97,11 +91,9 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): decimal_type = DataType(type_id.DECIMAL64) with nogil: - c_result = move( - cpp_fixed_point.is_fixed_point( - input.view(), - decimal_type.c_obj, - ) + c_result = cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, ) return Column.from_libcudf(move(c_result)) diff --git 
a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 8081aadb085..1296f4f9db5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -33,11 +33,9 @@ cpdef Column to_floats(Column strings, DataType output_type): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.to_floats( - strings.view(), - output_type.c_obj, - ) + c_result = cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, ) return Column.from_libcudf(move(c_result)) @@ -63,11 +61,7 @@ cpdef Column from_floats(Column floats): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.from_floats( - floats.view(), - ) - ) + c_result = cpp_convert_floats.from_floats(floats.view()) return Column.from_libcudf(move(c_result)) @@ -92,10 +86,6 @@ cpdef Column is_float(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.is_float( - input.view(), - ) - ) + c_result = cpp_convert_floats.is_float(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, cpp:func:`cudf::strings::from_integers`. 
+ + Parameters + ---------- + integers : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType + Integer type used for checking underflow and overflow. + By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index f2a980d4269..834781f95f3 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -26,11 +26,7 @@ cpdef Column ipv4_to_integers(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.ipv4_to_integers( - input.view() - ) - ) + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) return Column.from_libcudf(move(c_result)) @@ -54,11 +50,7 @@ cpdef Column integers_to_ipv4(Column integers): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.integers_to_ipv4( - integers.view() - ) - ) + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) return Column.from_libcudf(move(c_result)) @@ -83,10 +75,6 @@ cpdef Column is_ipv4(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.is_ipv4( - input.view() - ) - ) + c_result = cpp_convert_ipv4.is_ipv4(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index 3fbc08a9ab5..cbfe5f5aa8b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -61,12 +61,10 @@ cpdef Column format_list_column( separators = make_empty_column(type_id.STRING) with nogil: - c_result = move( - cpp_convert_lists.format_list_column( - input.view(), - dereference(c_na_rep), - separators.view() - ) + c_result = cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + 
separators.view() ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index a5e080e53b7..82f8a75f1d9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -26,11 +26,7 @@ cpdef Column url_encode(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_urls.url_encode( - input.view() - ) - ) + c_result = cpp_convert_urls.url_encode(input.view()) return Column.from_libcudf(move(c_result)) @@ -54,10 +50,6 @@ cpdef Column url_decode(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_urls.url_decode( - input.view() - ) - ) + c_result = cpp_convert_urls.url_decode(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index dcb11ca10ce..b56eccc8287 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -33,11 +33,9 @@ cpdef Table extract(Column input, RegexProgram prog): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_extract.extract( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] ) return Table.from_libcudf(move(c_result)) @@ -66,11 +64,9 @@ cpdef Column extract_all_record(Column input, RegexProgram prog): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_extract.extract_all_record( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 22d370bf7e8..6fc6dca24fd 100644 --- 
a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -50,22 +50,18 @@ cpdef Column find( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.find( - input.view(), - target.view(), - start - ) + result = cpp_find.find( + input.view(), + target.view(), + start ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.find( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.find( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) else: raise ValueError(f"Invalid target {target}") @@ -104,13 +100,11 @@ cpdef Column rfind( """ cdef unique_ptr[column] result with nogil: - result = move( - cpp_find.rfind( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.rfind( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) return Column.from_libcudf(move(result)) @@ -149,19 +143,15 @@ cpdef Column contains( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.contains( - input.view(), - target.view() - ) + result = cpp_find.contains( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.contains( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.contains( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -204,19 +194,15 @@ cpdef Column starts_with( if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - target.view() - ) + result = cpp_find.starts_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.starts_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise 
ValueError(f"Invalid target {target}") @@ -256,19 +242,15 @@ cpdef Column ends_with( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - target.view() - ) + result = cpp_find.ends_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.ends_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 413fc1cb79d..672aa606bd0 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -29,11 +29,9 @@ cpdef Column find_multiple(Column input, Column targets): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_find_multiple.find_multiple( - input.view(), - targets.view() - ) + c_result = cpp_find_multiple.find_multiple( + input.view(), + targets.view() ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 5212dc4594d..89fa4302824 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -30,11 +30,9 @@ cpdef Column findall(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.findall( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) @@ -62,11 +60,9 @@ cpdef Column find_re(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.find_re( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = 
cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index 24daaaa3838..f6950eecf60 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -33,13 +33,11 @@ cpdef Column pad(Column input, size_type width, side_type side, str fill_char): cdef string c_fill_char = fill_char.encode("utf-8") with nogil: - c_result = move( - cpp_padding.pad( - input.view(), - width, - side, - c_fill_char, - ) + c_result = cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, ) return Column.from_libcudf(move(c_result)) @@ -65,11 +63,9 @@ cpdef Column zfill(Column input, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_padding.zfill( - input.view(), - width, - ) + c_result = cpp_padding.zfill( + input.view(), + width, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 5f627218f6e..fb2bb13c666 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -31,19 +31,15 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): if ColumnorSizeType is Column: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times.view() - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() ) elif ColumnorSizeType is size_type: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times ) else: raise ValueError("repeat_times must be size_type or integer") diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 9d0ebf4a814..6db7f04fcbb 100644 
--- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -55,12 +55,12 @@ cpdef Column replace( repl_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace( + c_result = cpp_replace( input.view(), target_str[0], repl_str[0], maxrepl, - )) + ) return Column.from_libcudf(move(c_result)) @@ -98,11 +98,11 @@ cpdef Column replace_multiple( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_replace_multiple( + c_result = cpp_replace_multiple( input.view(), target.view(), repl.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -151,11 +151,11 @@ cpdef Column replace_slice( cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace_slice( + c_result = cpp_replace_slice( input.view(), scalar_str[0], start, stop - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index ecc959e65b0..0fb4f186c41 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -45,11 +45,9 @@ cpdef Table partition(Column input, Scalar delimiter=None): ) with nogil: - c_result = move( - cpp_partition.partition( - input.view(), - dereference(c_delimiter) - ) + c_result = cpp_partition.partition( + input.view(), + dereference(c_delimiter) ) return Table.from_libcudf(move(c_result)) @@ -85,11 +83,9 @@ cpdef Table rpartition(Column input, Scalar delimiter=None): ) with nogil: - c_result = move( - cpp_partition.rpartition( - input.view(), - dereference(c_delimiter) - ) + c_result = cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index a7d7f39fc47..e3827f6645e 100644 --- 
a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -44,12 +44,10 @@ cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.split( - strings_column.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -85,12 +83,10 @@ cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.rsplit( - strings_column.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -124,12 +120,10 @@ cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.split_record( - strings.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, ) return Column.from_libcudf(move(c_result)) @@ -165,12 +159,10 @@ cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) ) with nogil: - c_result = move( - cpp_split.rsplit_record( - strings.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, ) return Column.from_libcudf(move(c_result)) @@ -203,12 +195,10 @@ cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_split.split_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -241,12 +231,10 @@ cpdef Table rsplit_re(Column input, RegexProgram 
prog, size_type maxsplit): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_split.rsplit_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -278,12 +266,10 @@ cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_split.split_record_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Column.from_libcudf(move(c_result)) @@ -315,12 +301,10 @@ cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxspli cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_split.rsplit_record_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index a62c7ec4528..d85da8e6cdd 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -62,11 +62,9 @@ cpdef Column translate(Column input, dict chars_table): ) with nogil: - c_result = move( - cpp_translate.translate( - input.view(), - c_chars_table - ) + c_result = cpp_translate.translate( + input.view(), + c_chars_table ) return Column.from_libcudf(move(c_result)) @@ -111,12 +109,10 @@ cpdef Column filter_characters( ) with nogil: - c_result = move( - cpp_translate.filter_characters( - input.view(), - c_characters_to_filter, - keep_characters, - dereference(c_replacement), - ) + c_result = cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), ) return 
Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 11e31f54eee..2ced250f837 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -32,11 +32,9 @@ cpdef Column wrap(Column input, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_wrap.wrap( - input.view(), - width, - ) + c_result = cpp_wrap.wrap( + input.view(), + width, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index 5f77b89a605..d0d6f2343d0 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -49,9 +49,7 @@ cdef class Table: calling libcudf algorithms, and should generally not be needed by users (even direct pylibcudf Cython users). """ - cdef vector[unique_ptr[column]] c_columns = move( - dereference(libcudf_tbl).release() - ) + cdef vector[unique_ptr[column]] c_columns = dereference(libcudf_tbl).release() cdef vector[unique_ptr[column]].size_type i return Table([ diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..4e389a63f90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64 + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py new file mode 100644 index 00000000000..283a009288d --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ 
-0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngrams", [2, 3]) +@pytest.mark.parametrize("delim", ["*", " ", "-"]) +@pytest.mark.parametrize("sep", ["_", "&", ","]) +def test_ngrams_tokenize(input_col, ngrams, delim, sep): + def ngrams_tokenize(strings, ngrams, delim, sep): + tokens = [] + for s in strings: + ss = s.split(delim) + for i in range(len(ss) - ngrams + 1): + token = sep.join(ss[i : i + ngrams]) + tokens.append(token) + return tokens + + result = plc.nvtext.ngrams_tokenize.ngrams_tokenize( + plc.interop.from_arrow(input_col), + ngrams, + plc.interop.from_arrow(pa.scalar(delim)), + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array( + ngrams_tokenize(input_col.to_pylist(), ngrams, delim, sep) + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..6d1d565af30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected) diff --git 
a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 74134caeb78..bce9702752a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -35,7 +35,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.nans_to_nulls(input.view())) + c_result = cpp_transform.nans_to_nulls(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -59,7 +59,7 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.bools_to_mask(input.view())) + c_result = cpp_transform.bools_to_mask(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -88,7 +88,7 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) with nogil: - c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) return Column.from_libcudf(move(c_result)) @@ -119,10 +119,8 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i cdef bool c_is_ptx = is_ptx with nogil: - c_result = move( - cpp_transform.transform( - input.view(), c_unary_udf, output_type.c_obj, c_is_ptx - ) + c_result = cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx ) return Column.from_libcudf(move(c_result)) @@ -144,7 +142,7 @@ cpdef tuple[Table, Column] encode(Table input): cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: - c_result = move(cpp_transform.encode(input.view())) + c_result = cpp_transform.encode(input.view()) return ( Table.from_libcudf(move(c_result.first)), @@ -172,7 +170,7 @@ cpdef Table 
one_hot_encode(Column input, Column categories): cdef Table owner_table with nogil: - c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a708f6cc37f..a24f937ced3 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -29,7 +29,7 @@ cpdef Table transpose(Table input_table): cdef Table owner_table with nogil: - c_result = move(cpp_transpose.transpose(input_table.view())) + c_result = cpp_transpose.transpose(input_table.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 839360ef406..53e8c382b5e 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -34,7 +34,7 @@ cpdef Column unary_operation(Column input, unary_operator op): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.unary_operation(input.view(), op)) + result = cpp_unary.unary_operation(input.view(), op) return Column.from_libcudf(move(result)) @@ -57,7 +57,7 @@ cpdef Column is_null(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_null(input.view())) + result = cpp_unary.is_null(input.view()) return Column.from_libcudf(move(result)) @@ -80,7 +80,7 @@ cpdef Column is_valid(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_valid(input.view())) + result = cpp_unary.is_valid(input.view()) return Column.from_libcudf(move(result)) @@ -105,7 +105,7 @@ cpdef Column cast(Column input, DataType data_type): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.cast(input.view(), 
data_type.c_obj)) + result = cpp_unary.cast(input.view(), data_type.c_obj) return Column.from_libcudf(move(result)) @@ -128,7 +128,7 @@ cpdef Column is_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_nan(input.view())) + result = cpp_unary.is_nan(input.view()) return Column.from_libcudf(move(result)) @@ -151,7 +151,7 @@ cpdef Column is_not_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_not_nan(input.view())) + result = cpp_unary.is_not_nan(input.view()) return Column.from_libcudf(move(result))