diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92e31ea9c4b..c87033238c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,6 +59,7 @@ repos: hooks: - id: mypy args: ["--config-file=python/cudf/setup.cfg", "python/cudf/cudf"] + pass_filenames: false - repo: https://github.com/pycqa/pydocstyle rev: 6.0.0 hooks: diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index ed2484814fb..a9bc1f4c605 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,6 +3,16 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e +ARCH=$(arch) +if [ "${ARCH}" = "x86_64" ]; then + DEFAULT_CUDA_VER="11.0" +elif [ "${ARCH}" = "aarch64" ]; then + DEFAULT_CUDA_VER="11.2" +else + echo "Unsupported arch ${ARCH}" + exit 1 +fi + #Always upload cudf Python package export UPLOAD_CUDF=1 @@ -14,14 +24,14 @@ else fi # upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "11.0" ]]; then +if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then export UPLOAD_CUDF_KAFKA=1 else export UPLOAD_CUDF_KAFKA=0 fi #We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "11.0" ]]; then +if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then export UPLOAD_LIBCUDF_KAFKA=1 else export UPLOAD_LIBCUDF_KAFKA=0 @@ -31,4 +41,4 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both export BUILD_LIBCUDF=1 export BUILD_CUDF=1 -fi \ No newline at end of file +fi diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index b8ae1120b30..5e839589811 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -56,7 +56,7 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools - - transformers + - transformers<=4.10.3 - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml 
b/conda/environments/cudf_dev_cuda11.2.yml index 048d5181aa8..28266b6af87 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -56,7 +56,7 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools - - transformers + - transformers<=4.10.3 - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 982fee640d9..82bc5bfba93 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -473,8 +473,7 @@ target_include_directories(cudf "$" PRIVATE "$" INTERFACE "$" - "$" - "$") + "$") target_compile_definitions(cudf PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" @@ -511,7 +510,7 @@ target_link_libraries(cudf cudf::Thrust rmm::rmm PRIVATE cuco::cuco - ZLIB::ZLIB + ZLIB::ZLIB nvcomp::nvcomp) # Add Conda library, and include paths if specified @@ -692,6 +691,40 @@ following IMPORTED GLOBAL targets: ]=]) +set(common_code_string + [=[ +if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) +endif() + +# nvcc automatically adds the CUDA Toolkit system include paths before any +# system include paths that CMake adds. +# +# CMake implicitly treats all includes on import targets as 'SYSTEM' includes. +# +# To get the cudacxx shipped with cudf to be picked up by consumers instead of the +# version shipped with the CUDA Toolkit we need to make sure it is a non-SYSTEM +# include on the CMake side. 
+# +# To do this currently, we move the includes from the cudf::cudf target to a +# non-import target to ensure they are `-I` instead of `-isystem` + +add_library(cudf_non_system_includes INTERFACE) +target_link_libraries(cudf::cudf INTERFACE cudf_non_system_includes) + +get_target_property(all_includes cudf::cudf INTERFACE_INCLUDE_DIRECTORIES) +set(system_includes ) +set(normal_includes ) +foreach(include IN LISTS all_includes) + if(include MATCHES "/include/libcudf/") + list(APPEND normal_includes "${include}") + else() + list(APPEND system_includes "${include}") + endif() +endforeach() +set_target_properties(cudf::cudf PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${system_includes}") +set_target_properties(cudf_non_system_includes PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${normal_includes}") +]=]) set(install_code_string [=[ set(ArrowCUDA_DIR "${Arrow_DIR}") @@ -705,11 +738,8 @@ if(testing IN_LIST cudf_FIND_COMPONENTS) include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() endif() - -if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) -endif() ]=]) +string(APPEND install_code_string "${common_code_string}") rapids_export(INSTALL cudf EXPORT_SET cudf-exports @@ -728,11 +758,8 @@ endif() if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() - -if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) -endif() ]=]) +string(APPEND build_code_string "${common_code_string}") rapids_export(BUILD cudf EXPORT_SET cudf-exports diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 3309be62dc3..41751c7dd50 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -157,17 +157,40 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) + set(arrow_code_string + [=[ + if (TARGET cudf::arrow_shared AND (NOT 
TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() + ]=] + ) + set(arrow_cuda_code_string + [=[ + if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) + add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) + endif() + if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) + add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) + endif() + ]=] + ) + rapids_export(BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf::) + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string) rapids_export(BUILD ArrowCUDA VERSION ${VERSION} EXPORT_SET arrow_cuda_targets GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static - NAMESPACE cudf::) + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_cuda_code_string) endif() # We generate the arrow-config and arrowcuda-config files # when we built arrow locally, so always do `find_dependency` diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 2512597b28f..47dbc037334 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + GIT_TAG 729857a5698a0e8d8f812e0464f65f37854ae17b OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 501071cafb7..bb5cfa5c6e0 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -36,12 +36,15 @@ #include #include #include +#include #include #include -namespace { +namespace cudf { +namespace detail { + // Compute the count of elements that pass the mask within each block template __global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts, @@ -293,9 +296,9 @@ struct scatter_gather_functor { filter); auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices.begin(), - indices.end(), + indices, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); @@ -304,10 +307,6 @@ struct scatter_gather_functor { } }; -} // namespace - -namespace cudf { -namespace detail { /** * @brief Filters `input` using a Filter function object * diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 74a94f34ad8..d794adceec2 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -152,8 +152,8 @@ __launch_bounds__(block_size) __global__ * @param filter Function of type `FilterFn` which determines for index `i` where to get the * corresponding output value from * @param out_type `cudf::data_type` of the returned column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return A new column that contains the values from either `lhs` or `rhs` as determined * by `filter[i]` */ diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 06de9ff2716..5d649e55389 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -640,8 +640,8 @@ void gather_bitmask(table_view const& source, * use `DONT_CHECK` when they are certain that the gather_map contains only valid indices for * better performance. In case there are out-of-bound indices in the gather map, the behavior * is undefined. Defaults to `DONT_CHECK`. - * @param[in] mr Device memory resource used to allocate the returned table's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return cudf::table Result of the gather */ template diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 268c4878444..01d9c64ba30 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #pragma once #include -#include - #include #include +#include +#include #include @@ -55,10 +55,10 @@ enum class negative_index_policy : bool { ALLOWED, NOT_ALLOWED }; * indices. If `policy` is set to `DONT_CHECK` and there are out-of-bounds indices in `gather_map`, * the behavior is undefined. * @param[in] negative_index_policy Interpret each negative index `i` in the - * gathermap as the positive index `i+num_source_rows`. 
- * @param[in] mr Device memory resource used to allocate the returned table's device memory + * `gather_map` as the positive index `i+num_source_rows`. * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return cudf::table Result of the gather + * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @return Result of the gather */ std::unique_ptr gather( table_view const& source_table, @@ -67,5 +67,21 @@ std::unique_ptr
gather( negative_index_policy neg_indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view + * const&,cudf::out_of_bounds_policy,cudf::detail::negative_index_policy,rmm::cuda_stream_view, + * rmm::mr::device_memory_resource*) + * + * @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`. + */ +std::unique_ptr
gather( + table_view const& source_table, + device_span const gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index d546162fc7a..d0fa4e02440 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -502,17 +502,32 @@ struct indexalator_factory { iter = make_input_iterator(col); } + __device__ thrust::pair operator()(size_type i) const + { + return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)}; + } + }; + + /** + * @brief An index accessor that returns a validity flag along with the index value. + * + * This is suitable as a `pair_iterator`. + */ + struct scalar_nullable_index_accessor { + input_indexalator iter; + bool const is_null; + /** * @brief Create an accessor from a scalar. */ - nullable_index_accessor(scalar const& input) : has_nulls{!input.is_valid()} + scalar_nullable_index_accessor(scalar const& input) : is_null{!input.is_valid()} { iter = indexalator_factory::make_input_iterator(input); } - __device__ thrust::pair operator()(size_type i) const + __device__ thrust::pair operator()(size_type) const { - return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)}; + return {*iter, is_null}; } }; @@ -530,7 +545,75 @@ struct indexalator_factory { static auto make_input_pair_iterator(scalar const& input) { return thrust::make_transform_iterator(thrust::make_constant_iterator(0), - nullable_index_accessor{input}); + scalar_nullable_index_accessor{input}); + } + + /** + * @brief An index accessor that returns an index value if corresponding validity flag is true. + * + * This is suitable as an `optional_iterator`. 
+ */ + struct optional_index_accessor { + input_indexalator iter; + bitmask_type const* null_mask{}; + size_type const offset{}; + bool const has_nulls{}; + + /** + * @brief Create an accessor from a column_view. + */ + optional_index_accessor(column_view const& col, bool has_nulls = false) + : null_mask{col.null_mask()}, offset{col.offset()}, has_nulls{has_nulls} + { + if (has_nulls) { CUDF_EXPECTS(col.nullable(), "Unexpected non-nullable column."); } + iter = make_input_iterator(col); + } + + __device__ thrust::optional operator()(size_type i) const + { + return has_nulls && !bit_is_set(null_mask, i + offset) ? thrust::nullopt + : thrust::make_optional(iter[i]); + } + }; + + /** + * @brief An index accessor that returns an index value if corresponding validity flag is true. + * + * This is suitable as an `optional_iterator`. + */ + struct scalar_optional_index_accessor { + input_indexalator iter; + bool const is_null; + + /** + * @brief Create an accessor from a scalar. + */ + scalar_optional_index_accessor(scalar const& input) : is_null{!input.is_valid()} + { + iter = indexalator_factory::make_input_iterator(input); + } + + __device__ thrust::optional operator()(size_type) const + { + return is_null ? thrust::nullopt : thrust::make_optional(*iter); + } + }; + + /** + * @brief Create an index iterator with a nullable index accessor. + */ + static auto make_input_optional_iterator(column_view const& col) + { + return make_counting_transform_iterator(0, optional_index_accessor{col, col.has_nulls()}); + } + + /** + * @brief Create an index iterator with a nullable index accessor for a scalar. 
+ */ + static auto make_input_optional_iterator(scalar const& input) + { + return thrust::make_transform_iterator(thrust::make_constant_iterator(0), + scalar_optional_index_accessor{input}); } }; diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 01df55dea05..0565f332b48 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -33,8 +33,8 @@ namespace reduction { * * @param col input column to compute sum * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum as scalar of type `output_dtype`. */ std::unique_ptr sum( @@ -52,8 +52,8 @@ std::unique_ptr sum( * * @param col input column to compute minimum. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Minimum element as scalar of type `output_dtype`. */ std::unique_ptr min( @@ -71,8 +71,8 @@ std::unique_ptr min( * * @param col input column to compute maximum. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Maximum element as scalar of type `output_dtype`. 
*/ std::unique_ptr max( @@ -91,8 +91,8 @@ std::unique_ptr max( * * @param col input column to compute any_of. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if any of elements is true when typecasted to bool */ std::unique_ptr any( @@ -111,8 +111,8 @@ std::unique_ptr any( * * @param col input column to compute all_of. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if all of elements is true when typecasted to bool */ std::unique_ptr all( @@ -131,8 +131,8 @@ std::unique_ptr all( * * @param col input column to compute product. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Product as scalar of type `output_dtype`. */ std::unique_ptr product( @@ -151,8 +151,8 @@ std::unique_ptr product( * * @param col input column to compute sum of squares. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum of squares as scalar of type `output_dtype`. */ std::unique_ptr sum_of_squares( @@ -171,8 +171,8 @@ std::unique_ptr sum_of_squares( * * @param col input column to compute mean. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Mean as scalar of type `output_dtype`. */ std::unique_ptr mean( @@ -191,8 +191,8 @@ std::unique_ptr mean( * * @param col input column to compute variance. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Variance as scalar of type `output_dtype`. */ std::unique_ptr variance( @@ -212,8 +212,8 @@ std::unique_ptr variance( * * @param col input column to compute standard deviation. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Standard deviation as scalar of type `output_dtype`. */ std::unique_ptr standard_deviation( @@ -243,8 +243,8 @@ std::unique_ptr standard_deviation( * @param col input column to get nth element from. 
* @param n index of element to get * @param null_handling Indicates if null values will be counted while indexing. - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return nth element as scalar */ std::unique_ptr nth_element( diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index a3b1f95ca0a..e43f8495d07 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include @@ -70,6 +70,20 @@ std::unique_ptr
scatter( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view + * const&,bool,rmm::cuda_stream_view,rmm::mr::device_memory_resource*) + * + * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`. + */ +std::unique_ptr
scatter( + table_view const& source, + device_span const scatter_map, + table_view const& target, + bool check_bounds = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Scatters a row of scalar values into a copy of the target table * according to a scatter map. diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index c2fe2dce1fe..00d2c820b61 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -32,8 +32,8 @@ namespace detail { * @throw cudf::logic_error if dictionary column keys are not all the same type. * * @param columns Vector of dictionary columns to concatenate. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ std::unique_ptr concatenate( diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index 8c037406e45..e1d5f0367e0 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -98,8 +98,8 @@ std::vector> match_dictionaries( * Any null rows are left unchanged. * * @param input Vector of cudf::table_views that include dictionary columns to be matched. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary columns and updated cudf::table_views. 
*/ std::pair>, std::vector> match_dictionaries( diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index d62b54208d5..7c2979c56cd 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -283,6 +283,7 @@ std::unique_ptr gather_list_leaf( /** * @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column, * lists_column_view const& gather_map_list, + * out_of_bounds_policy bounds_policy, * rmm::mr::device_memory_resource* mr) * * @param stream CUDA stream on which to execute kernels @@ -290,6 +291,7 @@ std::unique_ptr gather_list_leaf( std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index 66d7fb137b2..23054b91592 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -17,6 +17,7 @@ #include #include +#include #include namespace cudf { @@ -32,7 +33,7 @@ namespace lists { * * `source_column` with any depth and `gather_map_list` with depth 1 are only supported. * - * * @code{.pseudo} + * @code{.pseudo} * source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}] * gather_map_list : [{0, 1, 3, 2}, {1, 3, 2}, {}] * @@ -44,11 +45,24 @@ namespace lists { * @throws cudf::logic_error if gather_map is not list column of an index type. * * If indices in `gather_map_list` are outside the range `[-n, n)`, where `n` is the number of - * elements in corresponding row of the source column, the behavior is undefined. + * elements in corresponding row of the source column, the behaviour is as follows: + * 1. 
If `bounds_policy` is set to `DONT_CHECK`, the behaviour is undefined. + * 2. If `bounds_policy` is set to `NULLIFY`, the corresponding element in the list row + * is set to null in the output column. + * + * @code{.pseudo} + * source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}] + * gather_map_list : [{0, -1, 4, -5}, {1, 3, 5}, {}] + * + * result_with_nullify : [{"a", "d", null, null}, {"2", "4", null}, {}] + * @endcode * * @param source_column View into the list column to gather from * @param gather_map_list View into a non-nullable list column of integral indices that maps the * element in list of each row in the source columns to rows of lists in the destination columns. + * @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the + * output list row's element, when the gather index falls outside the range `[-n, n)`, + * where `n` is the number of elements in list row corresponding to the gather-map row. * @param mr Device memory resource to allocate any returned objects * @return column with elements in list of rows gathered based on `gather_map_list` * @@ -56,6 +70,7 @@ namespace lists { std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 0740039e896..3512c05a586 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -37,8 +37,8 @@ namespace detail { * ``` * * @param columns List of string columns to concatenate. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ std::unique_ptr concatenate( diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 72f9ac11ceb..b6d34f8d89a 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -47,8 +47,8 @@ namespace detail { * @param rhs_begin Strings of second set of data. Used when filter_fn returns false. * @param filter_fn Called to determine which iterator (lhs or rhs) to retrieve an entry for a * specific row. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 7e8ab6a0b68..05dbdf18b64 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -93,8 +93,8 @@ namespace detail { * @param target_begin The starting index of the target range (inclusive) * @param target_end The index of the last element in the target range * (exclusive) - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return std::unique_ptr The result target column */ template diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 1ddf0ad5cdf..040175af9e5 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -37,8 +37,8 @@ namespace detail { * @param begin First row index to include the new string. * @param end Last row index (exclusive). * @param value String to use when filling the range. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr fill( diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index faae9a50be2..16955b3251b 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -213,8 +213,8 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, * @param map_end End of index iterator. * @param offsets The offset values to be associated with the output chars column. * @param chars_bytes The total number of bytes for the output chars column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New chars column fit for a strings column. */ template @@ -278,8 +278,8 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @param strings Strings instance for this operation. * @param begin Start of index iterator. * @param end End of index iterator. - * @param mr Device memory resource used to allocate the returned column's device memory. 
* @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column containing the gathered strings. */ template @@ -361,8 +361,8 @@ std::unique_ptr gather( * @param begin Start of index iterator. * @param end End of index iterator. * @param nullify_out_of_bounds If true, indices outside the column's range are nullified. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column containing the gathered strings. */ template diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index d632e6c24cb..4657f6c83bd 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -39,8 +39,8 @@ namespace detail { * @param lhs First column. * @param rhs Second column. * @param row_order Indexes for each column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index eb7493f4895..0a53c930bb3 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -48,8 +48,8 @@ namespace detail { * @param scatter_map Iterator of indices into the output column. * @param target The set of columns into which values from the source column * are to be scattered. 
- * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 6894c34a077..efd03d882e6 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -41,8 +41,8 @@ namespace detail { * @tparam Iterator Used as input to scan to set the offset values. * @param begin The beginning of the input sequence * @param end The end of the input sequence - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return offsets child column for strings column */ template @@ -74,8 +74,8 @@ std::unique_ptr make_offsets_child_column( * @tparam Iter Iterator type that returns string_view instances * @param strings_begin Iterator to the beginning of the string_view sequence * @param num_strings The number of string_view instances in the sequence - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Child offsets column */ template @@ -131,8 +131,8 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * chars memory. * @param exec_size Number of rows for executing the `size_and_exec_fn` function. * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. 
* @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned columns' device memory. * @return offsets child column and chars child column for a strings column */ template @@ -190,8 +190,8 @@ auto make_strings_children( * After that, the d_offsets and d_chars are set and this is called again to fill in the * chars memory. * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned columns' device memory. * @return offsets child column and chars child column for a strings column */ template diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 8bc8b7a7e6e..3a27d798487 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,6 +59,8 @@ using duration_us = cuda::std::chrono::duration; static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep), ""); +static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep), ""); +static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep), ""); static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep), ""); static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep), ""); static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep), ""); @@ -85,6 +87,8 @@ namespace std { } DURATION_LIMITS(cudf::duration_D); +DURATION_LIMITS(cudf::duration_h); +DURATION_LIMITS(cudf::duration_m); DURATION_LIMITS(cudf::duration_s); DURATION_LIMITS(cudf::duration_ms); DURATION_LIMITS(cudf::duration_us); diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index ac13dae6a74..8481068ca05 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -16,9 +16,7 @@ #pragma once -#include - -#include +#include /** * @file timestamps.hpp @@ -42,33 +40,37 @@ using timestamp = time_point; */ /** - * @brief Type alias representing an int32_t duration of days since the unix - * epoch. + * @brief Type alias representing a cudf::duration_D (int32_t) since the unix epoch. + */ +using timestamp_D = detail::timestamp; +/** + * @brief Type alias representing a cudf::duration_h (int32_t) since the unix epoch. + */ +using timestamp_h = detail::timestamp; +/** + * @brief Type alias representing a cudf::duration_m (int32_t) since the unix epoch. */ -using timestamp_D = - detail::timestamp>>; +using timestamp_m = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of seconds since the - * unix epoch. + * @brief Type alias representing a cudf::duration_s (int64_t) since the unix epoch. 
*/ -using timestamp_s = detail::timestamp>>; +using timestamp_s = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of milliseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_ms (int64_t) since the unix epoch. */ -using timestamp_ms = detail::timestamp>; +using timestamp_ms = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of microseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_us (int64_t) since the unix epoch. */ -using timestamp_us = detail::timestamp>; +using timestamp_us = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of nanoseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_ns (int64_t) since the unix epoch. */ -using timestamp_ns = detail::timestamp>; +using timestamp_ns = detail::timestamp; static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep), ""); +static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep), ""); +static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep), ""); static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep), ""); static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep), ""); static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep), ""); @@ -95,6 +97,8 @@ namespace std { } TIMESTAMP_LIMITS(cudf::timestamp_D); +TIMESTAMP_LIMITS(cudf::timestamp_h); +TIMESTAMP_LIMITS(cudf::timestamp_m); TIMESTAMP_LIMITS(cudf::timestamp_s); TIMESTAMP_LIMITS(cudf::timestamp_ms); TIMESTAMP_LIMITS(cudf::timestamp_us); diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 8b74c9cde94..faddfef8be2 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -30,8 +30,8 @@ namespace detail { * @param strings Strings column tokenize. 
* @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ std::unique_ptr tokenize( @@ -46,8 +46,8 @@ std::unique_ptr tokenize( * * @param strings Strings column to tokenize. * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ std::unique_ptr tokenize( @@ -63,8 +63,8 @@ std::unique_ptr tokenize( * @param strings Strings column to use for this operation. * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ std::unique_ptr count_tokens( @@ -79,8 +79,8 @@ std::unique_ptr count_tokens( * * @param strings Strings column to use for this operation. * @param delimiters Strings used to separate each string into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ std::unique_ptr count_tokens( diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a1b00a4cd6b..6b03a97c59b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -324,8 +324,8 @@ namespace detail { * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, @@ -351,8 +351,8 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, * @param rhs Right-hand side `scalar` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(column_view const& lhs, @@ -378,8 +378,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr 
Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(column_view const& lhs, @@ -426,8 +426,8 @@ namespace jit { * @param lhs Left-hand side `scalar` used in the binary operation * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column from the binary operation */ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, @@ -496,8 +496,8 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, * @param lhs Left-hand side `column_view` used in the binary operation * @param rhs Right-hand side `scalar` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column from the binary operation */ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, @@ -566,8 +566,8 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, * @param lhs Left-hand side `column_view` used in the binary operation * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column 
from the binary operation */ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index cf3a6025847..26a0f26b59c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -68,9 +68,10 @@ std::unique_ptr string_null_min_max( * * @param lhs The left operand string scalar * @param rhs The right operand string column + * @param op The binary operator * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( @@ -94,9 +95,10 @@ std::unique_ptr binary_operation( * * @param lhs The left operand string column * @param rhs The right operand string scalar + * @param op The binary operator * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( @@ -119,9 +121,10 @@ std::unique_ptr binary_operation( * * @param lhs The left operand string column * @param rhs The right operand string column + * @param op The binary operator enum * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index df1ce6c0e67..21a27ff8c3d 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -14,13 +14,12 @@ * limitations under the License. */ -#include #include #include -#include +#include #include #include -#include +#include #include #include @@ -181,23 +180,24 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto scatter_map = rmm::device_uvector{static_cast(size), stream}; - auto const scatter_map_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(size_type{size}), - scatter_map.begin(), - is_left); + auto gather_map = rmm::device_uvector{static_cast(size), stream}; + auto const gather_map_end = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(size_type{size}), + gather_map.begin(), + is_left); + + gather_map.resize(thrust::distance(gather_map.begin(), gather_map_end), stream); auto const scatter_src_lhs = cudf::detail::gather(table_view{std::vector{lhs}}, - scatter_map.begin(), - scatter_map_end, + gather_map, out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, stream); auto result = cudf::detail::scatter( table_view{std::vector{scatter_src_lhs->get_column(0).view()}}, - scatter_map.begin(), - scatter_map_end, + gather_map, table_view{std::vector{rhs}}, false, stream, @@ -227,8 +227,12 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, static_cast(scatter_map_size), scatter_map.begin()}; - auto result = cudf::scatter( - scatter_source, scatter_map_column_view, table_view{std::vector{rhs}}, false, mr); + auto result = 
cudf::detail::scatter(scatter_source, + scatter_map_column_view, + table_view{std::vector{rhs}}, + false, + stream, + mr); return std::move(result->release()[0]); } diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index 181752d18e8..5c66f67ff0f 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -54,6 +54,21 @@ std::unique_ptr
gather(table_view const& source_table, return gather(source_table, map_begin, map_end, bounds_policy, stream, mr); } +std::unique_ptr
gather(table_view const& source_table, + device_span const gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(gather_map.size() <= std::numeric_limits::max(), + "invalid gather map size"); + auto map_col = column_view(data_type{type_to_id()}, + static_cast(gather_map.size()), + gather_map.data()); + return gather(source_table, map_col, bounds_policy, neg_indices, stream, mr); +} + } // namespace detail std::unique_ptr
gather(table_view const& source_table, diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 3312316f548..211bc0e1ebe 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -16,8 +16,6 @@ #include #include #include -#include -#include #include #include #include @@ -305,6 +303,21 @@ std::unique_ptr
scatter(table_view const& source, return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr); } +std::unique_ptr
scatter(table_view const& source, + device_span const scatter_map, + table_view const& target, + bool check_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(scatter_map.size() <= std::numeric_limits::max(), + "invalid scatter map size"); + auto map_col = column_view(data_type{type_to_id()}, + static_cast(scatter_map.size()), + scatter_map.data()); + return scatter(source, map_col, target, check_bounds, stream, mr); +} + std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 16b86177b8e..7e2a82a683c 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -49,8 +49,8 @@ namespace { * and returns true if that key is to be used in the output dictionary. * @param dictionary_column The column to use for creating the new dictionary. * @param keys_to_keep_fn Called to determine which keys in `dictionary_column` to keep. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. */ template std::unique_ptr remove_keys_fn( diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 37118779248..11c81ee434b 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -44,8 +44,8 @@ namespace { * * @param input lhs for `copy_if_else` * @param replacement_iter rhs for `copy_if_else` - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Always returns column of type INT32 (size_type) */ template diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b64199d7f0c..b8150f7fd14 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -26,13 +26,14 @@ #include #include #include -#include #include #include +#include #include #include #include #include +#include #include #include #include @@ -166,7 +167,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - size_type const map_size; Map const& map; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; @@ -179,7 +179,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - size_type map_size, Map const& map, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, @@ -188,7 +187,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), - map_size(map_size), map(map), row_bitmask(row_bitmask), stream(stream), @@ -202,9 +200,9 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final { auto s = sparse_results->get_result(col, agg); auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map.begin(), - gather_map.begin() + map_size, + gather_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::move(dense_result_table->release()[0]); @@ -374,7 +372,7 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - size_type map_size, + // size_type 
map_size, Map const& map, bool keys_have_nulls, null_policy include_null_keys, @@ -393,7 +391,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, map_size, map, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -595,15 +593,18 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, &sparse_results, cache, gather_map, - gather_map.size(), *map, keys_have_nulls, include_null_keys, stream, mr); - return cudf::detail::gather( - keys, gather_map.begin(), gather_map.end(), out_of_bounds_policy::DONT_CHECK, stream, mr); + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); } } // namespace diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index a30d4639af8..43a1674d97f 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index e7dc57f6c93..f2c57abf54e 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -20,12 +20,13 @@ #include #include #include -#include +#include #include #include #include #include +#include #include #include @@ -113,10 +114,11 @@ std::unique_ptr group_nth_element(column_view const& values, return (bitmask_iterator[i] && intra_group_index[i] == nth); }); } + auto output_table = cudf::detail::gather(table_view{{values}}, - nth_index.begin(), - nth_index.end(), + nth_index, out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); if (!output_table->get_column(0).has_nulls()) output_table->get_column(0).set_null_mask({}, 0); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index cb01ee8e053..f5060a6ed4e 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -45,8 +45,8 @@ namespace detail { * @param values Grouped values to get sum of * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value 
belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_sum(column_view const& values, size_type num_groups, @@ -68,8 +68,8 @@ std::unique_ptr group_sum(column_view const& values, * @param values Grouped values to get product of * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_product(column_view const& values, size_type num_groups, @@ -91,8 +91,8 @@ std::unique_ptr group_product(column_view const& values, * @param values Grouped values to get minimum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_min(column_view const& values, size_type num_groups, @@ -114,8 +114,8 @@ std::unique_ptr group_min(column_view const& values, * @param values Grouped values to get maximum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_max(column_view const& values, size_type num_groups, @@ -138,8 +138,8 @@ std::unique_ptr group_max(column_view const& values, * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, @@ -163,8 +163,8 @@ std::unique_ptr group_argmax(column_view const& values, * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, @@ -188,8 +188,8 @@ std::unique_ptr group_argmin(column_view const& values, * @param values Grouped values to get valid count of * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups ( unique values in @p group_labels ) - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_count_valid(column_view const& values, cudf::device_span group_labels, @@ -209,8 +209,8 @@ std::unique_ptr group_count_valid(column_view const& values, * * @param group_offsets Offsets of groups' starting points within @p values * @param num_groups Number of groups ( unique values in @p group_labels ) - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_count_all(cudf::device_span group_offsets, size_type num_groups, @@ -232,8 +232,8 @@ std::unique_ptr group_count_all(cudf::device_span group * @param values Grouped values to compute M2 values * @param group_means Pre-computed groupwise MEAN * @param group_labels ID of group corresponding value in @p values belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_m2(column_view const& values, column_view const& group_means, @@ -260,8 +260,8 @@ std::unique_ptr group_m2(column_view const& values, * @param group_labels ID of group corresponding value in @p values belongs to * @param ddof Delta degrees of freedom. The divisor used in calculation of * `var` is `N - ddof`, where `N` is the group size. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_var(column_view const& values, column_view const& group_means, @@ -289,8 +289,8 @@ std::unique_ptr group_var(column_view const& values, * @param group_offsets Offsets of groups' starting points within @p values * @param quantiles List of quantiles q where q lies in [0,1] * @param interp Method to use when desired value lies between data points - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_quantiles(column_view const& values, column_view const& group_sizes, @@ -322,8 +322,8 @@ std::unique_ptr group_quantiles(column_view const& values, * @param null_handling Exclude nulls while counting if null_policy::EXCLUDE, * Include nulls if null_policy::INCLUDE. * Nulls are treated equal. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_nunique(column_view const& values, cudf::device_span group_labels, @@ -355,8 +355,8 @@ std::unique_ptr group_nunique(column_view const& values, * @param n nth element to choose from each group of @p values * @param null_handling Exclude nulls while counting if null_policy::EXCLUDE, * Include nulls if null_policy::INCLUDE. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_nth_element(column_view const& values, column_view const& group_sizes, @@ -433,8 +433,8 @@ std::unique_ptr group_merge_lists(column_view const& values, * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. * @param group_offsets Offsets of groups' starting points within @p values. * @param num_groups Number of groups. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 86590fc1734..cb954eb7ce5 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -14,8 +14,9 @@ * limitations under the License. 
*/ #include -#include +#include #include +#include #include #include @@ -68,9 +69,9 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val } auto output = cudf::detail::gather(cudf::table_view({grouped_value}), - gather_map.begin(), - gather_map.end(), + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 5b4252a9063..0738e4c5730 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -299,6 +299,8 @@ __global__ void generate_cluster_limits_kernel(int delta_, nearest_w_index = last_inserted_index + 1; auto [r, i, adjusted] = cumulative_weight(nearest_w_index); adjusted_next_limit = max(next_limit, adjusted); + (void)r; + (void)i; } cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; last_inserted_index = nearest_w_index; @@ -469,6 +471,7 @@ std::unique_ptr compute_tdigests(int delta, group_cumulative_weight] __device__(size_type value_index) -> size_type { auto [group_index, relative_value_index, cumulative_weight] = group_cumulative_weight(value_index); + (void)relative_value_index; // compute start of cluster weight limits for this group double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; @@ -532,10 +535,12 @@ struct get_scalar_minmax { __device__ thrust::tuple operator()(size_type group_index) { - // note: .element() is taking care of fixed-point conversions for us. - return {static_cast(col.element(group_offsets[group_index])), - static_cast( - col.element(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))}; + auto const valid_count = group_valid_counts[group_index]; + return valid_count > 0 + ? 
thrust::make_tuple( + static_cast(col.element(group_offsets[group_index])), + static_cast(col.element(group_offsets[group_index] + valid_count - 1))) + : thrust::make_tuple(0.0, 0.0); } }; @@ -601,15 +606,9 @@ struct typed_group_tdigest { template < typename T, + typename... Args, typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(column_view const& col, - cudf::device_span group_offsets, - cudf::device_span group_labels, - cudf::device_span group_valid_counts, - size_type num_groups, - int delta, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr operator()(Args&&...) { CUDF_FAIL("Non-numeric type in group_tdigest"); } diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 4d3736a41f0..3b4549c135f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -299,7 +299,8 @@ __inline__ __device__ T decode_value(char const* begin, return cudf::io::parse_numeric(begin, end, opts); } -template +template () and !cudf::is_duration()>* = nullptr> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) @@ -307,81 +308,22 @@ __inline__ __device__ T decode_value(char const* begin, return cudf::io::parse_numeric(begin, end, opts); } -template <> -__inline__ __device__ cudf::timestamp_D decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}}; -} - -template <> -__inline__ __device__ cudf::timestamp_s decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_s{cudf::duration_s{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_s{cudf::duration_s{milli / 1000}}; - } -} - -template <> -__inline__ __device__ cudf::timestamp_ms decode_value(char const* begin, - char const* 
end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_ms{cudf::duration_ms{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_ms{cudf::duration_ms{milli}}; - } -} - -template <> -__inline__ __device__ cudf::timestamp_us decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_us{cudf::duration_us{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_us{cudf::duration_us{milli * 1000}}; - } + return to_timestamp(begin, end, opts.dayfirst); } -template <> -__inline__ __device__ cudf::timestamp_ns decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_ns{cudf::duration_ns{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_ns{cudf::duration_ns{milli * 1000000}}; - } + return to_duration(begin, end); } -#ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char* begin, const char* end, parse_options_view const& opts) \ - { \ - return Type{to_time_delta(begin, end)}; \ - } -#endif -DURATION_DECODE_VALUE(duration_D) -DURATION_DECODE_VALUE(duration_s) -DURATION_DECODE_VALUE(duration_ms) -DURATION_DECODE_VALUE(duration_us) -DURATION_DECODE_VALUE(duration_ns) - // The purpose of this is merely to allow compilation ONLY // TODO : make this work for csv template <> diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index 
3e785259476..18758bd5309 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -18,8 +18,10 @@ #include -#include +#include + #include +#include namespace cudf { namespace io { @@ -49,101 +51,24 @@ __inline__ __device__ T to_non_negative_integer(char const* begin, char const* e return value; } -// User-defined literals to clarify numbers and units for time calculation -__inline__ __device__ constexpr uint32_t operator"" _days(unsigned long long int days) -{ - return days; -} -__inline__ __device__ constexpr uint32_t operator"" _erasInDays(unsigned long long int eras) -{ - return eras * 146097_days; // multiply by days within an era (400 year span) -} -__inline__ __device__ constexpr uint32_t operator"" _years(unsigned long long int years) -{ - return years; -} -__inline__ __device__ constexpr uint32_t operator"" _erasInYears(unsigned long long int eras) -{ - return (eras * 1_erasInDays) / 365_days; -} - -/** - * @brief Computes the number of days since "March 1, 0000", given a date. - * - * This function takes year, month, and day and returns the number of days since the baseline which - * is taken as 0000-03-01. This value is chosen as the origin for ease of calculation (now February - * becomes the last month). - * - * @return days since March 1, 0000 - */ -__inline__ __device__ constexpr int32_t days_since_baseline(int year, int month, int day) -{ - // More details of this formula are located in cuDF datetime_ops - // In brief, the calculation is split over several components: - // era: a 400 year range, where the date cycle repeats exactly - // yoe: year within the 400 range of an era - // doy: day within the 364 range of a year - // doe: exact day within the whole era - // The months are shifted so that March is the starting month and February - // (possible leap day in it) is the last month for the linear calculation - year -= (month <= 2) ? 1 : 0; - - const int32_t era = (year >= 0 ? 
year : year - 399_years) / 1_erasInYears; - const int32_t yoe = year - era * 1_erasInYears; - const int32_t doy = (153_days * (month + (month > 2 ? -3 : 9)) + 2) / 5 + day - 1; - const int32_t doe = (yoe * 365_days) + (yoe / 4_years) - (yoe / 100_years) + doy; - - return (era * 1_erasInDays) + doe; -} - -/** - * @brief Computes the number of days since epoch, given a date. - * - * This function takes year, month, and day and returns the number of days since epoch (1970-01-01). - * - * @return days since epoch - */ -__inline__ __device__ constexpr int32_t days_since_epoch(int year, int month, int day) -{ - // Shift the start date to epoch to match unix time - static_assert(static_cast(days_since_baseline(1970, 1, 1)) == 719468_days, - "Baseline to epoch returns incorrect number of days"); - - return days_since_baseline(year, month, day) - days_since_baseline(1970, 1, 1); -} - /** - * @brief Computes the number of seconds since epoch, given a date and time. + * @brief Extracts the Day, Month, and Year from a string. * - * This function takes year, month, day, hour, minute and second and returns - * the number of seconds since epoch (1970-01-01), + * This function takes a string and produces a `year_month_day` representation. + * Acceptable formats are a combination of `YYYY`, `M`, `MM`, `D` and `DD` with + * `/` or `-` as separators. Data with only year and month (no day) is also valid. 
* - * @return seconds since epoch + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param dayfirst Flag indicating that first field is the day + * @return Extracted year, month and day in `cuda::std::chrono::year_month_day` format */ -__inline__ __device__ constexpr int64_t seconds_since_epoch( - int year, int month, int day, int hour, int minute, int second) +__inline__ __device__ cuda::std::chrono::year_month_day extract_date(char const* begin, + char const* end, + bool dayfirst) { - // Leverage the function to find the days since epoch - const int64_t days = days_since_epoch(year, month, day); + using namespace cuda::std::chrono; - // Return sum total seconds from each time portion - return (days * 24 * 60 * 60) + (hour * 60 * 60) + (minute * 60) + second; -} - -/** - * @brief Extracts the Day, Month, and Year from a string. - * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[in] dayfirst Flag indicating that first field is the day - * @param[out] year - * @param[out] month - * @param[out] day - * @return true if successful, false otherwise - */ -__inline__ __device__ bool extract_date( - char const* begin, char const* end, bool dayfirst, int* year, int* month, int* day) -{ char sep = '/'; auto sep_pos = thrust::find(thrust::seq, begin, end, sep); @@ -153,11 +78,13 @@ __inline__ __device__ bool extract_date( sep_pos = thrust::find(thrust::seq, begin, end, sep); } - if (sep_pos == end) return false; + year y; + month m; + day d; //--- is year the first filed? 
if ((sep_pos - begin) == 4) { - *year = to_non_negative_integer(begin, sep_pos); + y = year{to_non_negative_integer(begin, sep_pos)}; // year is signed // Month auto s2 = sep_pos + 1; @@ -165,72 +92,70 @@ __inline__ __device__ bool extract_date( if (sep_pos == end) { //--- Data is just Year and Month - no day - *month = to_non_negative_integer(s2, end); - *day = 1; + m = month{to_non_negative_integer(s2, end)}; // month and day are unsigned + d = day{1}; } else { - *month = to_non_negative_integer(s2, sep_pos); - *day = to_non_negative_integer((sep_pos + 1), end); + m = month{to_non_negative_integer(s2, sep_pos)}; + d = day{to_non_negative_integer((sep_pos + 1), end)}; } } else { //--- if the dayfirst flag is set, then restricts the format options if (dayfirst) { - *day = to_non_negative_integer(begin, sep_pos); + d = day{to_non_negative_integer(begin, sep_pos)}; auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); - *month = to_non_negative_integer(s2, sep_pos); - *year = to_non_negative_integer((sep_pos + 1), end); + m = month{to_non_negative_integer(s2, sep_pos)}; + y = year{to_non_negative_integer((sep_pos + 1), end)}; } else { - *month = to_non_negative_integer(begin, sep_pos); + m = month{to_non_negative_integer(begin, sep_pos)}; auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); if (sep_pos == end) { //--- Data is just Year and Month - no day - *year = to_non_negative_integer(s2, end); - *day = 1; + y = year{to_non_negative_integer(s2, end)}; + d = day{1}; } else { - *day = to_non_negative_integer(s2, sep_pos); - *year = to_non_negative_integer((sep_pos + 1), end); + d = day{to_non_negative_integer(s2, sep_pos)}; + y = year{to_non_negative_integer((sep_pos + 1), end)}; } } } - return true; + return year_month_day{y, m, d}; } /** * @brief Parses a string to extract the hour, minute, second and millisecond time field - * values. + * values of a day. 
* * Incoming format is expected to be `HH:MM:SS.MS`, with the latter second and millisecond fields * optional. Each time field can be a single, double, or triple (in the case of milliseconds) * digits. 12-hr and 24-hr time format is detected via the absence or presence of AM/PM characters * at the end. * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[out] hour The hour value - * @param[out] minute The minute value - * @param[out] second The second value (0 if not present) - * @param[out] millisecond The millisecond (0 if not present) + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @return Extracted hours, minutes, seconds and milliseconds of `chrono::hh_mm_ss` type with a + * precision of milliseconds */ -__inline__ __device__ void extract_time( - char const* begin, char const* end, int* hour, int* minute, int* second, int* millisecond) +__inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_day( + char const* begin, char const* end) { constexpr char sep = ':'; // Adjust for AM/PM and any whitespace before - int hour_adjust = 0; - auto last = end - 1; + duration_h d_h{0}; + auto last = end - 1; if (*last == 'M' || *last == 'm') { - if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; } + if (*(last - 1) == 'P' || *(last - 1) == 'p') { d_h = duration_h{12}; } last = last - 2; while (*last == ' ') { --last; @@ -240,94 +165,89 @@ __inline__ __device__ void extract_time( // Find hour-minute separator const auto hm_sep = thrust::find(thrust::seq, begin, end, sep); - *hour = to_non_negative_integer(begin, hm_sep) + hour_adjust; + // Extract hours + d_h += cudf::duration_h{to_non_negative_integer(begin, hm_sep)}; + + duration_m d_m{0}; + duration_s d_s{0}; + duration_ms d_ms{0}; // Find minute-second separator (if present) const auto ms_sep = thrust::find(thrust::seq, hm_sep + 1, 
end, sep); if (ms_sep == end) { - *minute = to_non_negative_integer(hm_sep + 1, end); - *second = 0; - *millisecond = 0; + d_m = duration_m{to_non_negative_integer(hm_sep + 1, end)}; } else { - *minute = to_non_negative_integer(hm_sep + 1, ms_sep); + d_m = duration_m{to_non_negative_integer(hm_sep + 1, ms_sep)}; // Find second-millisecond separator (if present) const auto sms_sep = thrust::find(thrust::seq, ms_sep + 1, end, '.'); if (sms_sep == end) { - *second = to_non_negative_integer(ms_sep + 1, end); - *millisecond = 0; + d_s = duration_s{to_non_negative_integer(ms_sep + 1, end)}; } else { - *second = to_non_negative_integer(ms_sep + 1, sms_sep); - *millisecond = to_non_negative_integer(sms_sep + 1, end); + d_s = duration_s{to_non_negative_integer(ms_sep + 1, sms_sep)}; + d_ms = duration_ms{to_non_negative_integer(sms_sep + 1, end)}; } } + return cuda::std::chrono::hh_mm_ss{d_h + d_m + d_s + d_ms}; } /** - * @brief Parses a date string into a `date32`, days since epoch. - * - * This function takes a string and produces a `date32` representation. - * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. - * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[in] dayfirst Flag to indicate that day is the first field - `DD/MM/YYYY` - * @return Number of days since epoch + * @brief Checks whether `c` is decimal digit */ -__inline__ __device__ int32_t to_date(char const* begin, char const* end, bool dayfirst) -{ - int day, month, year; - - return extract_date(begin, end, dayfirst, &year, &month, &day) - ? days_since_epoch(year, month, day) - : -1; -} +constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** - * @brief Parses a datetime string and computes the number of milliseconds since epoch. + * @brief Parses a datetime string and computes the corresponding timestamp. * - * This function takes a string and produces a `date32` representation. 
- * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. + * Acceptable date formats are a combination of `YYYY`, `M`, `MM`, `D` and `DD` with `/` or `-` as + * separators. Input with only year and month (no day) is also valid. Character `T` or blank space + * is expected to be the separator between date and time of day. Optional time of day information + * like hours, minutes, seconds and milliseconds are expected to be `HH:MM:SS.MS`. Each time field + * can be a single, double, or triple (in the case of milliseconds) digits. 12-hr and 24-hr time + * format is detected via the absence or presence of AM/PM characters at the end. * + * @tparam timestamp_type Type of output timestamp * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string * @param dayfirst Flag to indicate day/month or month/day order - * @return Milliseconds since epoch + * @return Timestamp converted to `timestamp_type` */ -__inline__ __device__ int64_t to_date_time(char const* begin, char const* end, bool dayfirst) +template +__inline__ __device__ timestamp_type to_timestamp(char const* begin, char const* end, bool dayfirst) { - int day, month, year; - int hour, minute, second, millisecond = 0; - int64_t answer = -1; + using duration_type = typename timestamp_type::duration; + + auto sep_pos = end; // Find end of the date portion - // TODO: Refactor all the date/time parsing to remove multiple passes over each character because - // of find() then convert(); that can also avoid the ugliness below. 
- auto sep_pos = thrust::find(thrust::seq, begin, end, 'T'); - if (sep_pos == end) { - // Attempt to locate the position between date and time, ignore premature space separators - // around the day/month/year portions - int count = 0; - for (auto i = begin; i < end; ++i) { - if (count == 3 && *i == ' ') { - sep_pos = i; - break; - } else if ((*i == '/' || *i == '-') || (count == 2 && *i != ' ')) { - count++; - } + int count = 0; + bool digits_only = true; + for (auto i = begin; i < end; ++i) { + digits_only = digits_only and is_digit(*i); + if (*i == 'T') { + sep_pos = i; + break; + } else if (count == 3 && *i == ' ') { + sep_pos = i; + break; + } else if ((*i == '/' || *i == '-') || (count == 2 && *i != ' ')) { + count++; } } - // There is only date if there's no separator, otherwise it's malformed + // Exit if the input string is digit-only + if (digits_only) { + return timestamp_type{ + duration_type{to_non_negative_integer(begin, end)}}; + } + + auto ymd = extract_date(begin, sep_pos, dayfirst); + timestamp_type answer{cuda::std::chrono::sys_days{ymd}}; + + // Extract time only if separator is present if (sep_pos != end) { - if (extract_date(begin, sep_pos, dayfirst, &year, &month, &day)) { - extract_time(sep_pos + 1, end, &hour, &minute, &second, &millisecond); - answer = seconds_since_epoch(year, month, day, hour, minute, second) * 1000 + millisecond; - } - } else { - if (extract_date(begin, end, dayfirst, &year, &month, &day)) { - answer = seconds_since_epoch(year, month, day, 0, 0, 0) * 1000; - } + auto t = extract_time_of_day(sep_pos + 1, end); + answer += cuda::std::chrono::duration_cast(t.to_duration()); } return answer; @@ -382,58 +302,65 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e } /** - * @brief Parses the input string into a duration of the given type. + * @brief Parses the input string into a duration of `duration_type`. 
* + * The expected format can be one of the following: `DD days`, `DD days +HH:MM:SS.NS`, `DD days + * HH:MM::SS.NS`, `HH:MM::SS.NS` and digits-only string. Note `DD` and optional `NS` field can + * contain arbitrary number of digits while `HH`, `MM` and `SS` can be single or double digits. + * + * @tparam duration_type Type of the parsed duration * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string - * @return The parsed duration + * @return The parsed duration in `duration_type` */ -template -__inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) +template +__inline__ __device__ duration_type to_duration(char const* begin, char const* end) { + using cuda::std::chrono::duration_cast; + // %d days [+]%H:%M:%S.n => %d days, %d days [+]%H:%M:%S, %H:%M:%S.n, %H:%M:%S, %value. constexpr char sep = ':'; - int32_t days{0}; - int8_t hour{0}; // single pass to parse days, hour, minute, seconds, nanosecond auto cur = begin; auto const value = parse_integer(&cur, end); cur = skip_spaces(cur, end); - if (std::is_same_v || cur >= end) { // %value - return value; + if (std::is_same_v || cur >= end) { + return duration_type{static_cast(value)}; } + // " days [+]" auto const after_days_sep = skip_if_starts_with(cur, end, "days"); auto const has_days_seperator = (after_days_sep != cur); cur = skip_spaces(after_days_sep, end); cur += (*cur == '+'); + + duration_D d_d{0}; + duration_h d_h{0}; if (has_days_seperator) { - days = value; - hour = parse_integer(&cur, end); + d_d = duration_D{value}; + d_h = duration_h{parse_integer(&cur, end)}; } else { - hour = value; + d_h = duration_h{value}; } - auto const minute = parse_optional_integer(&cur, end, sep); - auto const second = parse_optional_integer(&cur, end, sep); - - int nanosecond = 0; - if (std::is_same_v) { - return ((days * 24L + hour) * 60L + minute) * 60L + second; - } else if (*cur == '.') { //.n - auto const start_subsecond = ++cur; - 
nanosecond = parse_integer(&cur, end); - int8_t const num_digits = min(9L, cur - start_subsecond); - constexpr int64_t powers_of_ten[] = { - 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; - nanosecond *= powers_of_ten[9 - num_digits]; - } + duration_m d_m{parse_optional_integer(&cur, end, sep)}; + duration_s d_s{parse_optional_integer(&cur, end, sep)}; + + // Convert all durations to the given type + auto output_d = duration_cast(d_d + d_h + d_m + d_s); + + if constexpr (std::is_same_v) { return output_d; } + + auto const d_ns = (*cur != '.') ? duration_ns{0} : [&]() { + auto const start_subsecond = ++cur; + auto const unscaled_subseconds = parse_integer(&cur, end); + auto const scale = min(9L, cur - start_subsecond) - 9; + auto const rescaled = numeric::decimal64{unscaled_subseconds, numeric::scale_type{scale}}; + return duration_ns{rescaled.value()}; + }(); - return cuda::std::chrono::duration_cast( - cudf::duration_s{((days * 24L + hour) * 60L + minute) * 60L + second}) - .count() + - cuda::std::chrono::duration_cast(cudf::duration_ns{nanosecond}).count(); + return output_d + duration_cast(d_ns); } } // namespace io diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index de75ea6a51d..9a0c701ea49 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -135,9 +135,9 @@ struct column_to_strings_fn { // instead of column-wise; might be faster // // Note: Cannot pass `stream` to detail:: version of calls below, because they are - // not exposed in header (see, for example, detail::concatenate(tbl_view, separator, na_rep, mr, - // stream) is declared and defined in combine.cu); Possible solution: declare `extern`, or just - // declare a prototype inside `namespace cudf::strings::detail`; + // not exposed in header (see, for example, detail::concatenate(tbl_view, separator, na_rep, + // stream, mr) is declared and defined in combine.cu); Possible solution: declare `extern`, or + 
// just declare a prototype inside `namespace cudf::strings::detail`; // bools: // diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index ba6bc30e0d4..673d9054631 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -125,7 +124,8 @@ __inline__ __device__ T decode_value(const char* begin, * * @return The parsed numeric value */ -template +template () and !cudf::is_duration()>* = nullptr> __inline__ __device__ T decode_value(const char* begin, const char* end, parse_options_view const& opts) @@ -133,110 +133,22 @@ __inline__ __device__ T decode_value(const char* begin, return cudf::io::parse_numeric(begin, end, opts); } -/** - * @brief Decodes a timestamp_D - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_D - */ -template <> -__inline__ __device__ cudf::timestamp_D decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - return cudf::timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}}; -} - -/** - * @brief Decodes a timestamp_s - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_s - */ -template <> -__inline__ __device__ cudf::timestamp_s decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_s{cudf::duration_s{milli / 1000}}; -} - -/** - * @brief Decodes a timestamp_ms - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_ms - */ -template <> -__inline__ __device__ 
cudf::timestamp_ms decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_ms{cudf::duration_ms{milli}}; -} - -/** - * @brief Decodes a timestamp_us - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_us - */ -template <> -__inline__ __device__ cudf::timestamp_us decode_value(const char* begin, - const char* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_us{cudf::duration_us{milli * 1000}}; + return to_timestamp(begin, end, opts.dayfirst); } -/** - * @brief Decodes a timestamp_ns - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_ns - */ -template <> -__inline__ __device__ cudf::timestamp_ns decode_value(const char* begin, - const char* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_ns{cudf::duration_ns{milli * 1000000}}; + return to_duration(begin, end); } -#ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char* begin, const char* end, parse_options_view const&) \ - { \ - return Type{to_time_delta(begin, end)}; \ - } -#endif -DURATION_DECODE_VALUE(duration_D) -DURATION_DECODE_VALUE(duration_s) -DURATION_DECODE_VALUE(duration_ms) -DURATION_DECODE_VALUE(duration_us) 
-DURATION_DECODE_VALUE(duration_ns) - // The purpose of these is merely to allow compilation ONLY template <> __inline__ __device__ cudf::string_view decode_value(const char*, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b15c5a0941d..d05bec92166 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -24,6 +24,7 @@ #include "timezone.cuh" #include +#include #include "orc.h" #include @@ -91,20 +92,6 @@ constexpr type_id to_type_id(const orc::SchemaType& schema, return type_id::EMPTY; } -/** - * @brief Function that translates cuDF time unit to ORC clock frequency - */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) -{ - switch (timestamp_type_id) { - case type_id::TIMESTAMP_SECONDS: return 1; - case type_id::TIMESTAMP_MILLISECONDS: return 1000; - case type_id::TIMESTAMP_MICROSECONDS: return 1000000; - case type_id::TIMESTAMP_NANOSECONDS: return 1000000000; - default: return 0; - } -} - constexpr std::pair get_index_type_and_pos( const orc::StreamKind kind, uint32_t skip_count, bool non_child) { @@ -822,8 +809,8 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks } } - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); if (is_mask_updated) { // Update chunks with pointers to column data which might have been changed. 
@@ -892,8 +879,8 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); // Update chunks with pointers to column data std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f566a4b53b5..c932cda80d5 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -17,6 +17,7 @@ #include #include #include + #include "orc_common.h" #include "orc_gpu.h" @@ -1744,9 +1745,10 @@ __global__ void __launch_bounds__(block_size) break; case DATE: if (s->chunk.dtype_len == 8) { - // Convert from days to milliseconds by multiplying by 24*3600*1000 + cudf::duration_D days{s->vals.i32[t + vals_skipped]}; + // Convert from days to milliseconds static_cast(data_out)[row] = - 86400000ll * (int64_t)s->vals.i32[t + vals_skipped]; + cuda::std::chrono::duration_cast(days).count(); } else { static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; } @@ -1787,13 +1789,17 @@ __global__ void __launch_bounds__(block_size) seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); } if (seconds < 0 && nanos != 0) { seconds -= 1; } - if (s->chunk.ts_clock_rate) + if (s->chunk.ts_clock_rate) { + duration_ns d_ns{nanos}; + d_ns += duration_s{seconds}; static_cast(data_out)[row] = - seconds * s->chunk.ts_clock_rate + - (nanos + (499999999 / s->chunk.ts_clock_rate)) / - (1000000000 / s->chunk.ts_clock_rate); // Output to desired clock rate - else - static_cast(data_out)[row] = seconds * 1000000000 + nanos; + d_ns.count() * s->chunk.ts_clock_rate / + duration_ns::period::den; // Output to desired clock rate + } else { + cudf::duration_s d{seconds}; + static_cast(data_out)[row] = + cuda::std::chrono::duration_cast(d).count() + nanos; + } break; } } diff --git a/cpp/src/io/orc/stripe_enc.cu 
b/cpp/src/io/orc/stripe_enc.cu index cc7e22f2042..c8ed0e36966 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -614,12 +615,6 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, } } -/** - * @brief Timestamp scale table (powers of 10) - */ -static const __device__ __constant__ int32_t kTimeScale[10] = { - 1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; - template static __device__ void encode_null_mask(orcenc_state_s* s, bitmask_type const* pushdown_mask, @@ -808,7 +803,7 @@ __global__ void __launch_bounds__(block_size) case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { int64_t ts = column.element(row); - int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; + int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); // There is a bug in the ORC spec such that for negative timestamps, it is understood @@ -822,7 +817,7 @@ __global__ void __launch_bounds__(block_size) if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits uint32_t zeroes = 0; - nanos *= kTimeScale[9 - min(s->chunk.scale, 9)]; + nanos *= powers_of_ten[min(s->chunk.scale, 9)]; if (!(nanos % 100)) { nanos /= 100; zeroes = 1; diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index f5bda3401c0..77fde0d1e75 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -315,14 +315,6 @@ dst_transition_s posix_parser::parse_transition() return {0, 0, 0, 0, time}; } -/** - * @brief Checks if a given year is a leap year. - */ -static bool is_leap_year(uint32_t year) -{ - return ((year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0))); -} - /** * @brief Returns the number of days in a month. 
*/ @@ -346,10 +338,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) { auto day = trans.day; + auto const is_leap = cuda::std::chrono::year{year}.is_leap(); + if (trans.type == 'M') { - auto const is_leap = is_leap_year(year); - auto const month = std::min(std::max(trans.month, 1), 12); - auto week = std::min(std::max(trans.week, 1), 52); + auto const month = std::min(std::max(trans.month, 1), 12); + auto week = std::min(std::max(trans.week, 1), 52); // Year-to-year day adjustment auto const adjusted_month = (month + 9) % 12 + 1; @@ -372,10 +365,10 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) } } else if (trans.type == 'J') { // Account for 29th of February on leap years - day += (day > 31 + 29 && is_leap_year(year)); + day += (day > 31 + 29 && is_leap); } - return trans.time + day * day_seconds; + return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } timezone_table build_timezone_transition_table(std::string const& timezone_name, @@ -445,7 +438,7 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, // Add entries to fill the transition cycle int64_t year_timestamp = 0; - for (uint32_t year = 1970; year < 1970 + cycle_years; ++year) { + for (int32_t year = 1970; year < 1970 + cycle_years; ++year) { auto const dst_start_time = get_transition_time(dst_start, year); auto const dst_end_time = get_transition_time(dst_end, year); @@ -461,7 +454,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, std::swap(offsets.rbegin()[0], offsets.rbegin()[1]); } - year_timestamp += (365 + is_leap_year(year)) * day_seconds; + year_timestamp += cuda::std::chrono::duration_cast( + duration_D{365 + cuda::std::chrono::year{year}.is_leap()}) + .count(); } rmm::device_uvector d_ttimes{ttimes.size(), stream}; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index f4a2ef4fc49..e091efef072 100644 --- 
a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -36,11 +37,11 @@ struct timezone_table_view { cudf::device_span offsets; }; -static constexpr int64_t day_seconds = 24 * 60 * 60; // Cycle in which the time offsets repeat -static constexpr uint32_t cycle_years = 400; +static constexpr int32_t cycle_years = 400; // Number of seconds in 400 years -static constexpr int64_t cycle_seconds = (365 * 400 + (100 - 3)) * day_seconds; +static constexpr int64_t cycle_seconds = + cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}).count(); // Two entries per year, over the length of the cycle static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f8158eaa6e9..ebc655578f7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -627,6 +627,8 @@ inline __device__ void gpuStoreOutput(uint2* dst, */ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { + using cuda::std::chrono::duration_cast; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -646,7 +648,7 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src ofs <<= 3; // bytes -> bits if (dict_pos + 4 < dict_size) { uint3 v; - int64_t nanos, secs, days; + int64_t nanos, days; v.x = *reinterpret_cast(src8 + dict_pos + 0); v.y = *reinterpret_cast(src8 + dict_pos + 4); v.z = *reinterpret_cast(src8 + dict_pos + 8); @@ -661,13 +663,15 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src nanos |= v.x; // Convert from Julian day at noon to UTC seconds days = static_cast(v.z); - secs = (days - 2440588) * - (24 * 60 * 60); // TBD: Should be noon instead of midnight, but this matches pyarrow - if (s->col.ts_clock_rate) - ts = (secs * s->col.ts_clock_rate) + - nanos / (1000000000 / s->col.ts_clock_rate); // Output to desired clock rate - else - ts = (secs * 1000000000) + nanos; + cudf::duration_D d{ + days - 2440588}; // TBD: Should be noon instead of midnight, but this matches pyarrow + if (s->col.ts_clock_rate) { + int64_t secs = duration_cast(d).count() + + duration_cast(cudf::duration_ns{nanos}).count(); + ts = secs * s->col.ts_clock_rate; // Output to desired clock rate + } else { + ts = duration_cast(d).count() + nanos; + } } else { ts = 0; } @@ -999,11 +1003,14 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, case INT64: if (s->col.ts_clock_rate) { int32_t units = 0; - if (s->col.converted_type == TIME_MICROS || s->col.converted_type == TIMESTAMP_MICROS) - units = 1000000; + if (s->col.converted_type == TIME_MICROS || s->col.converted_type == TIMESTAMP_MICROS) { + units = cudf::timestamp_us::period::den; + } + else if (s->col.converted_type == TIME_MILLIS || - 
s->col.converted_type == TIMESTAMP_MILLIS) - units = 1000; + s->col.converted_type == TIMESTAMP_MILLIS) { + units = cudf::timestamp_ms::period::den; + } if (units && units != s->col.ts_clock_rate) s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) : (s->col.ts_clock_rate / units); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5b1d8c846bf..e79a19fc2e9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -712,6 +712,13 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, } } +/** + * @brief Determines the difference between the Proleptic Gregorian Calendar epoch (1970-01-01 + * 00:00:00 UTC) and the Julian date epoch (-4713-11-24 12:00:00 UTC). + * + * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period + * of hours. + */ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; @@ -720,22 +727,21 @@ constexpr auto julian_calendar_epoch_diff() } /** - * @brief Converts a sys_time into a pair with nanoseconds since midnight and number of - * Julian days. Does not deal with time zones. Used by INT96 code. + * @brief Converts a timestamp_ns into a pair with nanoseconds since midnight and number of Julian + * days. Does not deal with time zones. Used by INT96 code. * * @param ns number of nanoseconds since epoch * @return std::pair where nanoseconds is the number of nanoseconds * elapsed in the day and days is the number of days from Julian epoch. 
*/ -static __device__ std::pair -convert_nanoseconds(cuda::std::chrono::sys_time const ns) +static __device__ std::pair convert_nanoseconds(timestamp_ns const ns) { using namespace cuda::std::chrono; auto const nanosecond_ticks = ns.time_since_epoch(); auto const gregorian_days = floor(nanosecond_ticks); auto const julian_days = gregorian_days + ceil(julian_calendar_epoch_diff()); - auto const last_day_ticks = nanosecond_ticks - duration_cast(gregorian_days); + auto const last_day_ticks = nanosecond_ticks - gregorian_days; return {last_day_ticks, julian_days}; } @@ -1038,19 +1044,17 @@ __global__ void __launch_bounds__(128, 8) } auto const ret = convert_nanoseconds([&]() { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { case type_id::TIMESTAMP_SECONDS: case type_id::TIMESTAMP_MILLISECONDS: { - return sys_time{milliseconds{v}}; + return timestamp_ns{duration_ms{v}}; } break; case type_id::TIMESTAMP_MICROSECONDS: case type_id::TIMESTAMP_NANOSECONDS: { - return sys_time{microseconds{v}}; + return timestamp_ns{duration_us{v}}; } break; } - return sys_time{microseconds{0}}; + return timestamp_ns{duration_ns{0}}; }()); // the 12 bytes of fixed length data. 
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a8dfabd9514..06a696d6751 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -22,6 +22,7 @@ #include "reader_impl.hpp" #include +#include #include #include @@ -180,24 +181,6 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } -/** - * @brief Function that translates cuDF time unit to Parquet clock frequency - */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) -{ - switch (timestamp_type_id) { - case type_id::DURATION_SECONDS: return 1; - case type_id::DURATION_MILLISECONDS: return 1000; - case type_id::DURATION_MICROSECONDS: return 1000000; - case type_id::DURATION_NANOSECONDS: return 1000000000; - case type_id::TIMESTAMP_SECONDS: return 1; - case type_id::TIMESTAMP_MILLISECONDS: return 1000; - case type_id::TIMESTAMP_MICROSECONDS: return 1000000; - case type_id::TIMESTAMP_NANOSECONDS: return 1000000000; - default: return 0; - } -} - /** * @brief Function that returns the required the number of bits to store a value */ @@ -207,6 +190,11 @@ T required_bits(uint32_t max_level) return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); } +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. 
+ */ std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, parquet::Type physical, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 99b6410c44d..e71cd063e70 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -85,8 +85,8 @@ class writer::impl { * @param filepath Filepath if storing dataset to a file * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr sink, chunked_parquet_writer_options const& options, diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh new file mode 100644 index 00000000000..687766c1bcc --- /dev/null +++ b/cpp/src/io/utilities/time_utils.cuh @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace io { + +/** + * @brief Lookup table to compute power of ten + */ +static const __device__ __constant__ int32_t powers_of_ten[10] = { + 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; + +struct get_period { + template + constexpr int32_t operator()() + { + if constexpr (is_chrono()) { return T::period::den; } + CUDF_FAIL("Invalid, non chrono type"); + } +}; + +/** + * @brief Function that translates cuDF time unit to clock frequency + */ +constexpr int32_t to_clockrate(type_id timestamp_type_id) +{ + return timestamp_type_id == type_id::EMPTY + ? 0 + : type_dispatcher(data_type{timestamp_type_id}, get_period{}); +} + +} // namespace io +} // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 636729a735e..07ad2e052f1 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -17,11 +17,12 @@ #include #include +#include #include -#include -#include +#include #include +#include #include #include diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index dd21a22803b..dfb1af3cef1 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -16,8 +16,6 @@ #pragma once #include -#include -#include #include #include #include diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 740431b8563..db79075d864 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4b84d80f6a0..4bef312b396 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -202,9 +202,9 @@ std::unique_ptr gather_list_entries(column_view const& 
input, }); auto result = cudf::detail::gather(table_view{{entry_col}}, - gather_map.begin(), - gather_map.end(), + gather_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::move(result->release()[0]); diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index ff4649f4945..d4a3d5555a6 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -13,8 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include #include -#include +#include +#include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index da20cabdd8f..77d41c5ddc9 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -21,6 +21,7 @@ #include #include + #include namespace cudf { @@ -29,6 +30,7 @@ namespace detail { std::unique_ptr segmented_gather(lists_column_view const& value_column, lists_column_view const& gather_map, + out_of_bounds_policy bounds_policy, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -38,27 +40,38 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, CUDF_EXPECTS(value_column.size() == gather_map.size(), "Gather map and list column should be same size"); - auto gather_map_sliced_child = gather_map.get_sliced_child(stream); - auto const gather_map_size = gather_map_sliced_child.size(); - auto gather_index_begin = gather_map.offsets().begin() + 1 + gather_map.offset(); - auto gather_index_end = gather_index_begin + gather_map.size(); - auto value_offsets = value_column.offsets().begin() + value_column.offset(); - auto map_begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child); + auto const gather_map_sliced_child = gather_map.get_sliced_child(stream); + auto const gather_map_size = 
gather_map_sliced_child.size(); + auto const gather_index_begin = gather_map.offsets_begin() + 1; + auto const gather_index_end = gather_map.offsets_end(); + auto const value_offsets = value_column.offsets_begin(); + auto const map_begin = + cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child); + auto const out_of_bounds = [] __device__(auto const index, auto const list_size) { + return index >= list_size || (index < 0 && -index > list_size); + }; // Calculate Flattened gather indices (value_offset[row]+sub_index - auto transformer = [value_offsets, map_begin, gather_index_begin, gather_index_end] __device__( - size_type index) -> size_type { + auto transformer = [value_offsets, + map_begin, + gather_index_begin, + gather_index_end, + bounds_policy, + out_of_bounds] __device__(size_type index) -> size_type { // Get each row's offset. (Each row is a list). auto offset_idx = thrust::upper_bound( thrust::seq, gather_index_begin, gather_index_end, gather_index_begin[-1] + index) - gather_index_begin; // Get each sub_index in list in each row of gather_map. - auto sub_index = map_begin[index]; - auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx]; - auto wrapped_sub_index = (sub_index % list_size + list_size) % list_size; + auto sub_index = map_begin[index]; + auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx]; + auto wrapped_sub_index = sub_index < 0 ? sub_index + list_size : sub_index; + auto constexpr null_idx = cuda::std::numeric_limits::max(); // Add sub_index to value_column offsets, to get gather indices of child of value_column - return value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0]; + return (bounds_policy == out_of_bounds_policy::NULLIFY && out_of_bounds(sub_index, list_size)) + ? 
null_idx + : value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0]; }; auto child_gather_index_begin = cudf::detail::make_counting_transform_iterator(0, transformer); @@ -66,7 +79,7 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child_table = cudf::detail::gather(table_view({value_column.get_sliced_child(stream)}), child_gather_index_begin, child_gather_index_begin + gather_map_size, - out_of_bounds_policy::DONT_CHECK, + bounds_policy, stream, mr); auto child = std::move(child_table->release().front()); @@ -94,9 +107,11 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::unique_ptr segmented_gather(lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy, rmm::mr::device_memory_resource* mr) { - return detail::segmented_gather(source_column, gather_map_list, rmm::cuda_stream_default, mr); + return detail::segmented_gather( + source_column, gather_map_list, bounds_policy, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index e53ae4ff0c1..c547ca14f2d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -18,13 +18,14 @@ #include #include -#include +#include #include #include #include #include #include #include +#include #include #include @@ -33,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -376,10 +378,10 @@ struct column_row_comparator_dispatch { } template ()>* = nullptr> - bool operator()(size_type i, size_type j) const + bool operator()(size_type, size_type) const { CUDF_FAIL( - "`column_row_comparator_dispatch` cannot operate on types that are not equally comparable."); + "column_row_comparator_dispatch cannot operate on types that are not equally comparable."); } }; @@ -543,13 +545,17 @@ std::vector> get_unique_entries_and_list_offsets( all_lists_entries.has_nulls(), 
stream); + auto gather_map = column_view(data_type{type_to_id()}, + static_cast(thrust::distance(output_begin, output_end)), + unique_indices.data()); + // Collect unique entries and entry list offsets. // The new null_count and bitmask of the unique entries will also be generated // by the gather function. return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - output_begin, - output_end, + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr) ->release(); diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 94b86b670b1..c8ef4912392 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -16,10 +16,11 @@ #include #include #include -#include +#include #include #include +#include #include @@ -97,9 +98,9 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, // call gather on the child column auto result = cudf::detail::gather(table_view({child_column}), - d_gather_map, - d_gather_map + gather_map->size(), + gather_map->view(), out_of_bounds_policy::NULLIFY, // nullify-out-of-bounds + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr) ->release(); diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 67d71b7a39a..e8c56cdafd8 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 80beb6e715c..98ab713bfa2 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -68,8 +68,8 @@ namespace { * @Param[in] input The input table to be round-robin partitioned * @Param[in] num_partitions Number of partitions for the table * @Param[in] start_partition Index of the 1st partition - * @Param[in] mr Device memory resource used to allocate the 
returned table's device memory * @Param[in] stream CUDA stream used for device memory operations and kernel launches. + * @Param[in] mr Device memory resource used to allocate the returned table's device memory * * @Returns A std::pair consisting of a unique_ptr to the partitioned table and the partition * offsets for each partition within the table diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 25bf4a436ad..073b318b879 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -16,12 +16,13 @@ #include +#include #include -#include #include #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include +#include + +#include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index bf515342afb..59a614664c9 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -57,15 +57,14 @@ struct minmax_pair { * * @tparam Op Binary operator functor * @tparam InputIterator Input iterator Type + * @tparam OutputType Output scalar type * @param d_in input iterator * @param num_items number of items to reduce * @param binary_op binary operator used to reduce - * @param mr Device resource used for result allocation * @param stream CUDA stream to run kernels on. 
* @return rmm::device_scalar */ -template ::type> rmm::device_scalar reduce_device(InputIterator d_in, @@ -155,16 +154,19 @@ struct minmax_functor { if (col.has_nulls()) { auto pair_to_minmax = thrust::make_transform_iterator( make_pair_iterator(*device_col), create_minmax_with_nulls{}); - return reduce_device(pair_to_minmax, col.size(), minmax_binary_op{}, stream); + return reduce_device(pair_to_minmax, col.size(), minmax_binary_op{}, stream); } else { auto col_to_minmax = thrust::make_transform_iterator(device_col->begin(), create_minmax{}); - return reduce_device(col_to_minmax, col.size(), minmax_binary_op{}, stream); + return reduce_device(col_to_minmax, col.size(), minmax_binary_op{}, stream); } } /** * @brief Functor to copy a minmax_pair result to individual scalar instances. + * + * @tparam T type of the data + * @tparam ResultType result type to assign min, max to minmax_pair */ template > struct assign_min_max { @@ -246,6 +248,11 @@ struct minmax_functor { } // namespace +/** + * @copydoc cudf::minmax + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ std::pair, std::unique_ptr> minmax( cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -260,9 +267,6 @@ std::pair, std::unique_ptr> minmax( } } // namespace detail -/** - * @copydoc cudf::minmax - */ std::pair, std::unique_ptr> minmax( const column_view& col, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 699494c49c5..0d3ac2d366f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -67,17 +67,17 @@ struct reduce_dispatch_functor { return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); + auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = - quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); + quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream); return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::QUANTILE: { auto quantile_agg = dynamic_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); + auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = quantile(col, @@ -85,8 +85,7 @@ struct reduce_dispatch_functor { quantile_agg->_interpolation, valid_sorted_indices, true, - stream, - mr); + stream); return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::NUNIQUE: { diff --git a/cpp/src/reductions/simple.cuh 
b/cpp/src/reductions/simple.cuh index e0dc219b767..c8345a30f79 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -41,8 +41,8 @@ namespace simple { * @tparam Op the operator of cudf::reduction::op:: * @param col Input column of data to reduce - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -76,10 +76,10 @@ std::unique_ptr simple_reduction(column_view const& col, * * @tparam DecimalXX The `decimal32` or `decimal64` type * @tparam Op The operator of cudf::reduction::op:: + * * @param col Input column of data to reduce - - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -124,10 +124,10 @@ std::unique_ptr fixed_point_reduction(column_view const& col, * @tparam ElementType The key type of the input dictionary column. * @tparam ResultType The output data-type for the resulting scalar * @tparam Op The operator of cudf::reduction::op:: - + * * @param col Input dictionary column of data to reduce - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -376,8 +376,8 @@ struct element_type_dispatcher { * @tparam ElementType The input column type or key type. 
* @param col Input column (must be numeric) * @param output_type Requested type of the scalar result - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory */ template ()>* = nullptr> diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 4fa42021bd2..2145dcc6b91 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -387,9 +387,9 @@ std::unique_ptr replace_nulls_policy_impl(cudf::column_view const& } auto output = cudf::detail::gather(cudf::table_view({input}), - gather_map.begin(), - gather_map.end(), + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 30ff7b0549e..4cc8a84c868 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -15,11 +15,13 @@ */ #include -#include +#include #include #include +#include #include #include +#include #include #include diff --git a/cpp/src/rolling/lead_lag_nested_detail.cuh b/cpp/src/rolling/lead_lag_nested_detail.cuh index 4cff3053aa2..bde7101b9a9 100644 --- a/cpp/src/rolling/lead_lag_nested_detail.cuh +++ b/cpp/src/rolling/lead_lag_nested_detail.cuh @@ -18,11 +18,18 @@ #include #include +#include #include #include -#include -#include +#include +#include #include + +#include + +#include +#include + #include namespace cudf::detail { @@ -151,13 +158,12 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, }); } - auto output_with_nulls = - cudf::detail::gather(table_view{std::vector{input}}, - gather_map_column->view().template begin(), - gather_map_column->view().end(), - 
out_of_bounds_policy::NULLIFY, - stream, - mr); + auto output_with_nulls = cudf::detail::gather(table_view{std::vector{input}}, + gather_map_column->view(), + out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); if (default_outputs.is_empty()) { return std::move(output_with_nulls->release()[0]); } @@ -172,22 +178,22 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, scatter_map.begin(), is_null_index_predicate(input.size(), gather_map.begin())); + scatter_map.resize(thrust::distance(scatter_map.begin(), scatter_map_end), stream); // Bail early, if all LEAD/LAG computations succeeded. No defaults need be substituted. if (scatter_map.is_empty()) { return std::move(output_with_nulls->release()[0]); } // Gather only those default values that are to be substituted. auto gathered_defaults = cudf::detail::gather(table_view{std::vector{default_outputs}}, - scatter_map.begin(), - scatter_map_end, + scatter_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream); // Scatter defaults into locations where LEAD/LAG computed nulls. auto scattered_results = cudf::detail::scatter( table_view{std::vector{gathered_defaults->release()[0]->view()}}, - scatter_map.begin(), - scatter_map_end, + scatter_map, table_view{std::vector{output_with_nulls->release()[0]->view()}}, false, stream, diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 57d5d6afc75..8e40b7ce7e8 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -47,8 +47,8 @@ namespace { * @param start First character position to start the search. * @param stop Last character position (exclusive) to end the search. * @param pfn Functor used for locating `target` in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New integer column with character position values. */ template @@ -172,8 +172,8 @@ namespace { * @param strings Column of strings to check for target. * @param target UTF-8 encoded string to check in strings column. * @param pfn Returns bool value if target is found in the given string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New BOOL column. */ template @@ -233,8 +233,8 @@ std::unique_ptr contains_fn(strings_column_view const& strings, * @param strings Column of strings to check for `targets[i]`. * @param targets Column of strings to be checked in `strings[i]``. * @param pfn Returns bool value if target is found in the given string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New BOOL column. */ template diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 72359ada8c9..82759a6c73f 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -192,8 +192,8 @@ struct substring_from_fn { * @param null_count Number of nulls for the output column. * @param starts Start positions index iterator. * @param stops Stop positions index iterator. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_column, size_type null_count, diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index d01d0a8cbbc..c316b2c6f4e 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -163,8 +163,8 @@ struct device_cast { * @tparam T Type of the `fixed_point` column_view (`decimal32` or `decimal64`) * @param input Input `column_view` * @param scale `scale` of the returned `column` - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @return std::unique_ptr Returned column with new @p scale */ diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 088b0b747fb..d3475cbbed2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -266,6 +266,7 @@ ConfigureTest(ITERATOR_TEST iterator/scalar_iterator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu + iterator/indexalator_test.cu ) ################################################################################################### diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index b02d0ad387d..528986e2a8d 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include template @@ -31,7 +32,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(SegmentedGatherTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(SegmentedGatherTest, FixedWidthTypesNotBool); class SegmentedGatherTestList : public cudf::test::BaseFixture { }; @@ -42,6 +43,11 @@ class SegmentedGatherTestList : public cudf::test::BaseFixture { template using LCW = 
cudf::test::lists_column_wrapper; using cudf::lists_column_view; +using cudf::lists::detail::segmented_gather; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; +auto constexpr NULLIFY = cudf::out_of_bounds_policy::NULLIFY; TYPED_TEST(SegmentedGatherTest, Gather) { @@ -49,13 +55,23 @@ TYPED_TEST(SegmentedGatherTest, Gather) // List LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; - LCW gather_map{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}}; - LCW expected{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}}; - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + { + // Straight-line case. + auto const gather_map = LCW{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}}; + auto const expected = LCW{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + { + // Nullify out-of-bounds values. 
+ auto const gather_map = LCW{{3, 2, 4, 0}, {0}, {0, -3}, {0, 2, 1}}; + auto const expected = LCW{{{4, 3, 2, 1}, null_at(2)}, {5}, {{6, 7}, null_at(1)}, {8, 10, 9}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } } TYPED_TEST(SegmentedGatherTest, GatherNothing) @@ -65,41 +81,31 @@ TYPED_TEST(SegmentedGatherTest, GatherNothing) // List { - LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; - LCW gather_map{LCW{}, LCW{}, LCW{}, LCW{}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{LCW{}, LCW{}, LCW{}, LCW{}}; + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; + auto const gather_map = LCW{LCW{}, LCW{}, LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{LCW{}, LCW{}, LCW{}, LCW{}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } // List> { - LCW list{{{1, 2, 3, 4}, {5}}, {{6, 7}}, {{}, {8, 9, 10}}}; - LCW gather_map{LCW{}, LCW{}, LCW{}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const list = LCW{{{1, 2, 3, 4}, {5}}, {{6, 7}}, {{}, {8, 9, 10}}}; + auto const gather_map = LCW{LCW{}, LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); // hack to get column of empty list of list - LCW expected_dummy{{{1, 2, 3, 4}, {5}}, LCW{}, LCW{}, LCW{}}; - auto expected = cudf::split(expected_dummy, {1})[1]; + auto const expected_dummy = LCW{{{1, 2, 3, 4}, {5}}, LCW{}, LCW{}, LCW{}}; + auto const expected = cudf::split(expected_dummy, {1})[1]; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } - // List>> { - LCW list{{{{1, 2, 3, 4}, {5}}}, {{{6, 7}, {8, 9, 10}}}}; - LCW gather_map{LCW{}, LCW{}}; - - auto results = - 
cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected_dummy{{{{1, 2, 3, 4}}}, // hack to get column of empty list of list of list - LCW{}, - LCW{}}; - auto expected = cudf::split(expected_dummy, {1})[1]; + auto const list = LCW{{{{1, 2, 3, 4}, {5}}}, {{{6, 7}, {8, 9, 10}}}}; + auto const gather_map = LCW{LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // hack to get column of empty list of list of list + auto const expected_dummy = LCW{{{{1, 2, 3, 4}}}, LCW{}, LCW{}}; + auto const expected = cudf::split(expected_dummy, {1})[1]; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // the result should preserve the full List>> hierarchy @@ -120,18 +126,29 @@ TYPED_TEST(SegmentedGatherTest, GatherNulls) { using T = TypeParam; - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); // List - LCW list{{{1, 2, 3, 4}, valids}, {5}, {{6, 7}, valids}, {{8, 9, 10}, valids}}; - LCW gather_map{{0, 1}, LCW{}, {1}, {2, 1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const list = LCW{{{1, 2, 3, 4}, valids}, {5}, {{6, 7}, valids}, {{8, 9, 10}, valids}}; - LCW expected{{{1, 2}, valids}, LCW{}, {{7}, valids + 1}, {{10, 9, 8}, valids}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + { + // Test gathering on lists that contain nulls. 
+ auto const gather_map = LCW{{0, 1}, LCW{}, {1}, {2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = + LCW{{{1, 2}, valids}, LCW{}, {{7}, valids + 1}, {{10, 9, 8}, valids}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } + { + // Test gathering on lists that contain nulls, with out-of-bounds indices. + auto const gather_map = LCW{{10, -10}, LCW{}, {1}, {2, -10, 0}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = + LCW{{{0, 0}, nulls_at({0, 1})}, LCW{}, {{7}, valids + 1}, {{10, 0, 8}, null_at(1)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } } TYPED_TEST(SegmentedGatherTest, GatherNested) @@ -140,39 +157,76 @@ TYPED_TEST(SegmentedGatherTest, GatherNested) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; - LCW gather_map{{0, 2, -2}, {1}, {1, 0, -1, 5}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; + auto const gather_map = LCW{{0, -2, -2}, {1}, {1, 0, -1, -5}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{2, 3}, {2, 3}, {2, 3}}, + {{9, 10, 11}}, + {{17, 18}, {15, 16}, {-17, -18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } - LCW expected{ - {{2, 3}, {2, 3}, {2, 3}}, {{9, 10, 11}}, {{17, 18}, {15, 16}, {-17, -18}, {15, 16}}}; + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; + auto const gather_map = LCW{{0, 2, -2}, {1}, {1, 0, -1, -6}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{{2, 3}, LCW{}, {2, 3}}, null_at(1)}, + {{9, 10, 11}}, + {{{17, 18}, {15, 16}, {-17, -18}, LCW{}}, null_at(3)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } // List>> { - LCW list{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, - {{LCW{0}}}, - {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, - {{0, 1, 3}, {5}}, - {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, - {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; - LCW gather_map{{1}, LCW{}, {0}, {1}, {0, -1, 1}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // clang-format off + auto const list = LCW{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; + auto const gather_map = LCW{{1}, LCW{}, {0}, {1}, {0, -1, 1}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + LCW{}, + {{LCW{0}}}, + {{{0, 1, 3}, {5}}}, + {{{10, 20}}, {{40, 50}, {60, 70, 80}}, {LCW{30}}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } - LCW expected{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - LCW{}, - {{LCW{0}}}, - {{{0, 1, 3}, {5}}}, - {{{10, 20}}, {{40, 50}, {60, 70, 80}}, {LCW{30}}}}; + // List>>, with out-of-bounds gather indices. 
+ { + auto const list = LCW{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; + auto const gather_map = LCW{{1}, LCW{}, {0}, {1}, {0, -1, 3, -4}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = + LCW{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + LCW{}, + {{LCW{0}}}, + {{{0, 1, 3}, {5}}}, + {{{{10, 20}}, {{40, 50}, {60, 70, 80}}, LCW{}, LCW{}}, nulls_at({2, 3})}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -182,19 +236,32 @@ TYPED_TEST(SegmentedGatherTest, GatherOutOfOrder) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; - LCW gather_map{{1, 0}, {1, 2, 0}, {5, 4, 3, 2, 1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{4, 5}, {2, 3}}, - {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{1, 0}, {1, 2, 0}, {4, 3, 2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, + {{17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{1, 0}, {3, -1, -4}, {5, 4, 3, 2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{LCW{}, {12, 13, 14}, LCW{}}, nulls_at({0, 2})}, + {{LCW{}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}, null_at(0)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -204,19 +271,32 @@ TYPED_TEST(SegmentedGatherTest, GatherNegatives) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; - LCW gather_map{{-1, 0}, {-2, -1, 0}, {-5, -4, -3, -2, -1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{4, 5}, {2, 3}}, - {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; - + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{-1, 0}, {-2, -1, 0}, {-5, -4, -3, -2, -1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{-1, 0}, {-2, -1, -4}, {-6, -4, -3, -2, -1, 0}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{{9, 10, 11}, {12, 13, 14}, LCW{}}, null_at(2)}, + {{LCW{}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}, null_at(0)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -224,51 +304,43 @@ TYPED_TEST(SegmentedGatherTest, GatherNestedNulls) { using T = TypeParam; - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); // List> { - LCW list{{{{2, 3}, valids}, {4, 5}}, - {{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, valids}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}, - {{{{25, 26}, valids}, {27, 28}, {{29, 30}, valids}, {31, 32}, {33, 34}}, valids}}; - - LCW gather_map{{0, 1}, {0, 2}, LCW{}, {0, 1, 4}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - auto trues = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - LCW expected{{{{2, 3}, valids}, {4, 5}}, - {{{6, 7, 8}, {12, 13, 14}}, trues}, - LCW{}, - {{{{25, 26}, valids}, {27, 28}, {33, 34}}, valids}}; - + // clang-format off + auto const list = LCW{{{{2, 3}, valids}, {4, 5}}, + {{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, valids}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}, + {{{{25, 26}, valids}, {27, 28}, {{29, 30}, valids}, {31, 32}, {33, 34}}, valids}}; + auto const gather_map = LCW{{0, 1}, {0, 2}, LCW{}, {0, 1, 4}}; + auto const results = segmented_gather(lists_column_view{list}, 
lists_column_view{gather_map}); + auto const expected = LCW{{{{2, 3}, valids}, {4, 5}}, + {{{6, 7, 8}, {12, 13, 14}}, no_nulls()}, + LCW{}, + {{{{25, 26}, valids}, {27, 28}, {33, 34}}, valids}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } // List>>> { - LCW list{{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - {{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, - {{LCW{0}}}, - {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, - {{0, 1, 3}, {5}}, - {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, - {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; - - LCW gather_map{{1, 2, 4}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, - {{LCW{0}}}, - {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; - + // clang-format off + auto const list = LCW{{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; + auto const gather_map = LCW{{1, 2, 4}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, + {{LCW{0}}}, + {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -276,15 +348,11 @@ TYPED_TEST(SegmentedGatherTest, GatherNestedWithEmpties) { using T = TypeParam; - LCW list{{{2, 3}, LCW{}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, {LCW{}}}; - LCW gather_map{LCW{0}, LCW{0}, LCW{0}}; - - auto results = - 
cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - // skip one null, gather one null. - LCW expected{{{2, 3}}, {{6, 7, 8}}, {LCW{}}}; - + auto const list = LCW{{{2, 3}, LCW{}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, {LCW{}}}; + auto const gather_map = LCW{LCW{0}, LCW{0}, LCW{0}}; + auto results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = + LCW{{{2, 3}}, {{6, 7, 8}}, {LCW{}}}; // skip one null, gather one null. CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } @@ -292,7 +360,7 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) { using T = TypeParam; { - LCW a{ + auto const a = LCW{ {{1, 1, 1}, {2, 2}, {3, 3}}, {{4, 4, 4}, {5, 5}, {6, 6}}, {{7, 7, 7}, {8, 8}, {9, 9}}, @@ -302,23 +370,27 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) {{50, 50, 50, 50}, {6, 13}}, {{70, 70, 70, 70}, {80}}, }; - auto split_a = cudf::split(a, {3}); - - auto result0 = cudf::lists::detail::segmented_gather( - lists_column_view{split_a[0]}, lists_column_view{LCW{{1, 2}, {0, 2}, {0, 1}}}); - LCW expected0{ - {{2, 2}, {3, 3}}, - {{4, 4, 4}, {6, 6}}, - {{7, 7, 7}, {8, 8}}, - }; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected0, result0->view()); - - auto result1 = cudf::lists::detail::segmented_gather( - lists_column_view{split_a[1]}, - lists_column_view{LCW{{0, 1}, LCW{}, LCW{}, {0, 1}, LCW{}}}); - LCW expected1{ - {{10, 10, 10}, {11, 11}}, LCW{}, LCW{}, {{50, 50, 50, 50}, {6, 13}}, LCW{}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); + auto const split_a = cudf::split(a, {3}); + + { + auto const gather_map = lists_column_view{LCW{{1, 2}, {0, 2}, {0, 1}}}; + auto const result = segmented_gather(lists_column_view{split_a[0]}, gather_map); + auto const expected = LCW{ + {{2, 2}, {3, 3}}, + {{4, 4, 4}, {6, 6}}, + {{7, 7, 7}, {8, 8}}, + }; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const gather_map = + lists_column_view{LCW{{0, 1}, LCW{}, 
LCW{}, {0, 1}, LCW{}}}; + auto const result = segmented_gather(lists_column_view{split_a[1]}, gather_map); + auto const expected = + LCW{{{10, 10, 10}, {11, 11}}, LCW{}, LCW{}, {{50, 50, 50, 50}, {6, 13}}, LCW{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } } auto valids = @@ -404,13 +476,24 @@ TEST_F(SegmentedGatherTestString, StringGather) { using T = cudf::string_view; // List - LCW list{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; - LCW gather_map{{0, 1, 3, 2}, {1, 0, 3, 2}, LCW{}}; - LCW expected{{"a", "b", "d", "c"}, {"22", "1", "4", "333"}, LCW{}}; + { + auto const list = LCW{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; + auto const gather_map = LCW{{0, 1, 3, 2}, {1, 0, 3, 2}, LCW{}}; + auto const expected = LCW{{"a", "b", "d", "c"}, {"22", "1", "4", "333"}, LCW{}}; + auto const result = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } - auto result = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + // List, with out-of-order gather indices. 
+ { + auto const list = LCW{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; + auto const gather_map = LCW{{0, 1, 3, 4}, {1, -5, 3, 2}, LCW{}}; + auto const expected = LCW{{{"a", "b", "d", "c"}, cudf::test::iterators::null_at(3)}, + {{"22", "1", "4", "333"}, cudf::test::iterators::null_at(1)}, + LCW{}}; + auto result = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } } using SegmentedGatherTestFloat = SegmentedGatherTest; @@ -419,28 +502,51 @@ TEST_F(SegmentedGatherTestFloat, GatherMapSliced) using T = float; // List - LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; - LCW gather_map{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}, {0}, {1}}; - // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 - LCW expected{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}, {11}, {14}}; + { + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; + auto const gather_map = LCW{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}, {0}, {1}}; + // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 + auto const expected = LCW{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}, {11}, {14}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const sliced = cudf::split(list, {1, 4}); + auto const split_m = cudf::split(gather_map, {1, 4}); + auto const split_e = cudf::split(expected, {1, 4}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto sliced = cudf::split(list, {1, 4}); - auto split_m = cudf::split(gather_map, {1, 4}); - auto split_e = cudf::split(expected, {1, 4}); - - auto result0 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[0]}, - lists_column_view{split_m[0]}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); - auto result1 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[1]}, - lists_column_view{split_m[1]}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], result1->view()); - auto result2 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[2]}, - lists_column_view{split_m[2]}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + auto result0 = segmented_gather(lists_column_view{sliced[0]}, lists_column_view{split_m[0]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); + auto result1 = segmented_gather(lists_column_view{sliced[1]}, lists_column_view{split_m[1]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], result1->view()); + auto result2 = segmented_gather(lists_column_view{sliced[2]}, lists_column_view{split_m[2]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + } + + // List, with out-of-bounds gather indices. + { + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; + auto const gather_map = LCW{{3, -5, 1, 0}, {0}, {0, 1}, {0, 2, 3}, {0}, {1}}; + // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 + auto const expected = + LCW{{{4, 0, 2, 1}, null_at(1)}, {5}, {6, 7}, {{8, 10, 9}, null_at(2)}, {11}, {14}}; + auto results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto const sliced = cudf::split(list, {1, 4}); + auto const split_m = cudf::split(gather_map, {1, 4}); + auto const split_e = cudf::split(expected, {1, 4}); + + auto const result0 = + segmented_gather(lists_column_view{sliced[0]}, lists_column_view{split_m[0]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); + auto const result1 = + segmented_gather(lists_column_view{sliced[1]}, lists_column_view{split_m[1]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], 
result1->view()); + auto const result2 = + segmented_gather(lists_column_view{sliced[2]}, lists_column_view{split_m[2]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + } } TEST_F(SegmentedGatherTestFloat, Fails) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index e83592a028a..121bd1e2c99 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -365,6 +365,7 @@ TEST_F(JsonReaderTest, Durations) std::ofstream outfile(filepath, std::ofstream::out); outfile << "[-2]\n[-1]\n[0]\n"; outfile << "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n"; + outfile << "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n"; outfile << "[-2147483648]\n[2147483647]\n"; } @@ -388,6 +389,9 @@ TEST_F(JsonReaderTest, Durations) 1L * 60 * 60 * 24 * 1000000000L, (23 * 60 + 1) * 60 * 1000000000L, 123L, + 123000L, + 123000L, + 100000001L, -2147483648L, 2147483647L}, validity}); diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu new file mode 100644 index 00000000000..d5379b6dd30 --- /dev/null +++ b/cpp/tests/iterator/indexalator_test.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +#include + +#include +#include + +#include + +using TestingTypes = cudf::test::IntegralTypesNotBool; + +template +struct IndexalatorTest : public IteratorTest { +}; + +TYPED_TEST_CASE(IndexalatorTest, TestingTypes); + +TYPED_TEST(IndexalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(IndexalatorTest, pair_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -120, 115}); + auto validity = std::vector({0, 1, 1, 1, 1, 1, 0, 1, 1}); + + auto d_col = cudf::test::fixed_width_column_wrapper( + host_values.begin(), host_values.end(), validity.begin()); + + auto expected_values = + thrust::host_vector>(host_values.size()); + std::transform(host_values.begin(), + host_values.end(), + validity.begin(), + expected_values.begin(), + [](T v, bool b) { return thrust::make_pair(static_cast(v), b); }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_pair_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(IndexalatorTest, optional_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -104, 103, 64, -13, -20, 45}); + auto validity = std::vector({0, 1, 1, 1, 1, 1, 0, 1, 1}); + + auto d_col = cudf::test::fixed_width_column_wrapper( + host_values.begin(), host_values.end(), validity.begin()); + + auto expected_values = 
thrust::host_vector>(host_values.size()); + + std::transform(host_values.begin(), + host_values.end(), + validity.begin(), + expected_values.begin(), + [](T v, bool b) { + return (b) ? thrust::make_optional(static_cast(v)) + : thrust::nullopt; + }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index 960608d8f3c..0bf1d11bff4 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -17,4 +17,5 @@ This page provides a list of all publicly accessible modules, methods and classe general_utilities window io + subword_tokenize diff --git a/docs/cudf/source/api_docs/subword_tokenize.rst b/docs/cudf/source/api_docs/subword_tokenize.rst new file mode 100644 index 00000000000..e8737a9ee0a --- /dev/null +++ b/docs/cudf/source/api_docs/subword_tokenize.rst @@ -0,0 +1,12 @@ +================ +SubwordTokenizer +================ +.. currentmodule:: cudf.core.subword_tokenizer + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + SubwordTokenizer diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index dde97b65215..e4fb71033af 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -557,9 +557,15 @@ protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { boolean neededCleanup = false; if (data != null || valid != null || offsets != null) { try { - ColumnVector.closeBuffers(data); - ColumnVector.closeBuffers(offsets); - ColumnVector.closeBuffers(valid); + if (data != null) { + data.close(); + } + if (offsets != null) { + offsets.close(); + } + if (valid != null) { + valid.close(); + } } finally { // Always mark the resource as freed even if an exception is thrown. // We cannot know how far it progressed before the exception, and diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd index 5a9228608e5..5e7380c1d4e 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd @@ -17,7 +17,8 @@ cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ cdef unique_ptr[column] from_timestamps( column_view input_col, - string format) except + + string format, + column_view input_strings_names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70bdb6e2e60..2ae3c53cb1b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -160,6 +160,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # Access the Parquet user_data json to find the index index_col = None + is_range_index = False cdef map[string, string] user_data = c_out_table.metadata.user_data json_str = user_data[b'pandas'].decode('utf-8') meta = None @@ -171,7 +172,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, index_col[0]['kind'] == 'range': is_range_index = True else: - is_range_index = False index_col_names = OrderedDict() for idx_col in index_col: for c in meta['columns']: diff --git 
a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index c6307d6cdb9..ab53a242db2 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -100,11 +100,8 @@ def scan(scan_op, Column incol, inclusive, **kwargs): cdef unique_ptr[column] c_result cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs) - cdef scan_type c_inclusive - if inclusive is True: - c_inclusive = scan_type.INCLUSIVE - elif inclusive is False: - c_inclusive = scan_type.EXCLUSIVE + cdef scan_type c_inclusive = \ + scan_type.INCLUSIVE if inclusive else scan_type.EXCLUSIVE with nogil: c_result = move(cpp_scan( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 74490d6bb19..f9e98efbbd9 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -509,7 +509,8 @@ def from_booleans(Column input_col): def int2timestamp( Column input_col, - format): + str format, + Column names): """ Converting/Casting input date-time column to string column with specified format @@ -517,6 +518,9 @@ def int2timestamp( Parameters ---------- input_col : input column of type timestamp in integer format + format : The string specifying output format + names : The string names to use for weekdays ("%a", "%A") and + months ("%b", "%B") Returns ------- @@ -525,12 +529,15 @@ def int2timestamp( """ cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") + cdef column_view input_strings_names = names.view() + cdef unique_ptr[column] c_result with nogil: c_result = move( cpp_from_timestamps( input_column_view, - c_timestamp_format)) + c_timestamp_format, + input_strings_names)) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 810cdd51df5..dc08f767c26 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -234,7 +234,7 
@@ cdef data_from_unique_ptr( cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - cdef int i + cdef size_t i columns = [Column.from_unique_ptr(move(dereference(it+i))) for i in range(c_columns.size())] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b2f3274faab..72f902889f0 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -59,19 +59,6 @@ def get_loc(self, key, method=None, tolerance=None): def __getitem__(self, key): raise NotImplementedError() - def serialize(self): - header = {} - header["index_column"] = {} - # store metadata values of index separately - # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._values.serialize() - - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) - header["frame_count"] = len(frames) - return header, frames - def __contains__(self, item): return item in self._values @@ -122,13 +109,10 @@ def get_level_values(self, level): @classmethod def deserialize(cls, header, frames): - h = header["index_column"] - idx_typ = pickle.loads(header["type-serialized"]) - name = pickle.loads(header["name"]) - - col_typ = pickle.loads(h["type-serialized"]) - index = col_typ.deserialize(h, frames[: header["frame_count"]]) - return idx_typ(index, name=name) + # Dispatch deserialization to the appropriate index type in case + # deserialization is ever attempted with the base class directly. + idx_type = pickle.loads(header["type-serialized"]) + return idx_type.deserialize(header, frames) @property def names(self): @@ -814,22 +798,8 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) + # TODO: This method is deprecated and can be removed. 
def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ return self._values.to_array(fillna=fillna) def to_series(self, index=None, name=None): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c057b729fd1..d2da594fa3b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -426,7 +426,7 @@ def remove_categories( # ensure all the removals are in the current categories # list. If not, raise an error to match Pandas behavior if not removals_mask.all(): - vals = removals[~removals_mask].to_array() + vals = removals[~removals_mask].to_numpy() raise ValueError(f"removals must all be in old categories: {vals}") new_categories = cats[~cats.isin(removals)]._column @@ -1012,11 +1012,11 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self.default_na_value(): + if value == self._default_na_value(): return None return self.categories.element_indexing(value) - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: return -1 def find_and_replace( @@ -1175,7 +1175,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self.default_na_value(): + if fill_value == self._default_na_value(): fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1578,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column.default_na_value(), + 
categorical_column._default_na_value(), categorical_column.size, categorical_column.codes.dtype, ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index de278db919d..b6b4ef65cab 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -130,6 +130,12 @@ def values_host(self) -> "np.ndarray": """ Return a numpy representation of the Column. """ + if len(self) == 0: + return np.array([], dtype=self.dtype) + + if self.has_nulls: + raise ValueError("Column must have no nulls.") + return self.data_array_view.copy_to_host() @property @@ -138,7 +144,7 @@ def values(self) -> "cupy.ndarray": Return a CuPy representation of the Column. """ if len(self) == 0: - return cupy.asarray([], dtype=self.dtype) + return cupy.array([], dtype=self.dtype) if self.has_nulls: raise ValueError("Column must have no nulls.") @@ -319,9 +325,11 @@ def _get_mask_as_column(self) -> ColumnBase: def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() - def default_na_value(self) -> Any: + def _default_na_value(self) -> Any: raise NotImplementedError() + # TODO: This method is decpreated and can be removed when the associated + # Frame methods are removed. def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. @@ -337,10 +345,12 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": output size could be smaller. """ if fillna: - return self.fillna(self.default_na_value()).data_array_view + return self.fillna(self._default_na_value()).data_array_view else: return self.dropna(drop_nan=False).data_array_view + # TODO: This method is decpreated and can be removed when the associated + # Frame methods are removed. def to_array(self, fillna=None) -> np.ndarray: """Get a dense numpy array for the data. 
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 0d4edbf0113..eba6764e83d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -4,7 +4,9 @@ import builtins import datetime as dt +import locale import re +from locale import nl_langinfo from numbers import Number from types import SimpleNamespace from typing import Any, Mapping, Sequence, Union, cast @@ -50,6 +52,56 @@ "datetime64[s]": "%Y-%m-%d %H:%M:%S", } +_DATETIME_SPECIAL_FORMATS = { + "%b", + "%B", + "%A", + "%a", +} + +_DATETIME_NAMES = [ + nl_langinfo(locale.AM_STR), # type: ignore + nl_langinfo(locale.PM_STR), # type: ignore + nl_langinfo(locale.DAY_1), + nl_langinfo(locale.DAY_2), + nl_langinfo(locale.DAY_3), + nl_langinfo(locale.DAY_4), + nl_langinfo(locale.DAY_5), + nl_langinfo(locale.DAY_6), + nl_langinfo(locale.DAY_7), + nl_langinfo(locale.ABDAY_1), + nl_langinfo(locale.ABDAY_2), + nl_langinfo(locale.ABDAY_3), + nl_langinfo(locale.ABDAY_4), + nl_langinfo(locale.ABDAY_5), + nl_langinfo(locale.ABDAY_6), + nl_langinfo(locale.ABDAY_7), + nl_langinfo(locale.MON_1), + nl_langinfo(locale.MON_2), + nl_langinfo(locale.MON_3), + nl_langinfo(locale.MON_4), + nl_langinfo(locale.MON_5), + nl_langinfo(locale.MON_6), + nl_langinfo(locale.MON_7), + nl_langinfo(locale.MON_8), + nl_langinfo(locale.MON_9), + nl_langinfo(locale.MON_10), + nl_langinfo(locale.MON_11), + nl_langinfo(locale.MON_12), + nl_langinfo(locale.ABMON_1), + nl_langinfo(locale.ABMON_2), + nl_langinfo(locale.ABMON_3), + nl_langinfo(locale.ABMON_4), + nl_langinfo(locale.ABMON_5), + nl_langinfo(locale.ABMON_6), + nl_langinfo(locale.ABMON_7), + nl_langinfo(locale.ABMON_8), + nl_langinfo(locale.ABMON_9), + nl_langinfo(locale.ABMON_10), + nl_langinfo(locale.ABMON_11), + nl_langinfo(locale.ABMON_12), +] + class DatetimeColumn(column.ColumnBase): """ @@ -278,17 +330,23 @@ def as_string_column( format = _dtype_to_format_conversion.get( self.dtype.name, 
"%Y-%m-%d %H:%M:%S" ) + if format in _DATETIME_SPECIAL_FORMATS: + names = as_column(_DATETIME_NAMES) + else: + names = cudf.core.column.column_empty( + 0, dtype="object", masked=False + ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ cudf.dtype(self.dtype) - ](self, format) + ](self, format, names) else: return cast( "cudf.core.column.StringColumn", column.column_empty(0, dtype="object", masked=False), ) - def default_na_value(self) -> DatetimeLikeScalar: + def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column """ return np.datetime64("nat", self.time_unit) @@ -433,14 +491,11 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: def _make_copy_with_na_as_null(self): """Return a copy with NaN values replaced with nulls.""" null = column_empty_like(self, masked=True, newsize=1) + na_value = np.datetime64("nat", self.time_unit) out_col = cudf._lib.replace.replace( self, as_column( - Buffer( - np.array([self.default_na_value()], dtype=self.dtype).view( - "|u1" - ) - ), + Buffer(np.array([na_value], dtype=self.dtype).view("|u1")), dtype=self.dtype, ), null, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 736aa5b5a7b..27ff5da5505 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -288,7 +288,7 @@ def _process_values_for_isin( return lhs, rhs - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ dkind = self.dtype.kind diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c59081e4b59..07dc8743c00 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5210,10 +5210,10 @@ def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. 
""" - raise NotImplementedError( - "String Arrays is not yet implemented in cudf" - ) + raise TypeError("String Arrays is not yet implemented in cudf") + # TODO: This method is deprecated and should be removed when the associated + # Frame methods are removed. def to_array(self, fillna: bool = None) -> np.ndarray: """Get a dense numpy array for the data. @@ -5409,7 +5409,7 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: return None def binary_operator( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a3888d30f30..c7b13903751 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -304,7 +304,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ return np.timedelta64("nat", self.time_unit) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bdbd94ef754..0287e83a99a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -356,7 +356,7 @@ def _init_from_series_list(self, data, columns, index): # Setting `final_columns` to self._index so # that the resulting `transpose` will be have # columns set to `final_columns` - self._index = final_columns + self._index = as_index(final_columns) transpose = self.T else: @@ -529,38 +529,28 @@ def _constructor_expanddim(self): ) def serialize(self): - header = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) + header, frames = super().serialize() + header["index"], index_frames = self._index.serialize() header["index_frame_count"] = len(index_frames) - frames.extend(index_frames) - - # Use the column directly to 
avoid duplicating the index - # need to pickle column names to handle numpy integer columns - header["columns"], column_frames = column.serialize_columns( - self._columns - ) - frames.extend(column_frames) + # For backwards compatibility with older versions of cuDF, index + # columns are placed before data columns. + frames = index_frames + frames - header["column_names"] = pickle.dumps(tuple(self._data.names)) return header, frames @classmethod def deserialize(cls, header, frames): - # Reconstruct the index - index_frames = frames[: header["index_frame_count"]] + index_nframes = header["index_frame_count"] + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], index_frames) - - # Reconstruct the columns - column_frames = frames[header["index_frame_count"] :] - - column_names = pickle.loads(header["column_names"]) - columns = column.deserialize_columns(header["columns"], column_frames) + index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + obj._index = index - return cls._from_data(dict(zip(column_names, columns)), index=index,) + return obj @property def dtypes(self): @@ -978,36 +968,6 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - @property - def values(self): - """ - Return a CuPy representation of the DataFrame. - - Only the values in the DataFrame will be returned, the axes labels will - be removed. - - Returns - ------- - out: cupy.ndarray - The values of the DataFrame. 
- """ - return cupy.asarray(self.as_gpu_matrix()) - - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU matrix, consider using " - ".as_gpu_matrix()\nTo explicitly construct a host " - "matrix, consider using .as_matrix()" - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow Table via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Table, " - "consider using .to_arrow()" - ) - def _get_numeric_data(self): """ Return a dataframe with only numeric data types """ columns = [ @@ -2750,7 +2710,7 @@ def columns(self, columns): if isinstance( columns, (Series, cudf.Index, cudf.core.column.ColumnBase) ): - columns = pd.Index(columns.to_array(), tupleize_cols=is_multiindex) + columns = pd.Index(columns.to_numpy(), tupleize_cols=is_multiindex) elif not isinstance(columns, pd.Index): columns = pd.Index(columns, tupleize_cols=is_multiindex) @@ -2816,6 +2776,87 @@ def index(self, value): idx = as_index(value) self._index = idx + def _reindex( + self, columns, dtypes=None, deep=False, index=None, inplace=False + ): + """ + Helper for `.reindex` + + Parameters + ---------- + columns : array-like + The list of columns to select from the Frame, + if ``columns`` is a superset of ``Frame.columns`` new + columns are created. + dtypes : dict + Mapping of dtypes for the empty columns being created. + deep : boolean, optional, default False + Whether to make deep copy or shallow copy of the columns. + index : Index or array-like, default None + The ``index`` to be used to reindex the Frame with. + inplace : bool, default False + Whether to perform the operation in place on the data. 
+ + Returns + ------- + DataFrame + """ + if dtypes is None: + dtypes = {} + + df = self + if index is not None: + index = cudf.core.index.as_index(index) + + if isinstance(index, cudf.MultiIndex): + idx_dtype_match = all( + left_dtype == right_dtype + for left_dtype, right_dtype in zip( + (col.dtype for col in df.index._data.columns), + (col.dtype for col in index._data.columns), + ) + ) + else: + idx_dtype_match = df.index.dtype == index.dtype + + if not idx_dtype_match: + columns = ( + columns if columns is not None else list(df._column_names) + ) + df = cudf.DataFrame() + else: + df = cudf.DataFrame(None, index).join( + df, how="left", sort=True + ) + # double-argsort to map back from sorted to unsorted positions + df = df.take(index.argsort(ascending=True).argsort()) + + index = index if index is not None else df.index + names = columns if columns is not None else list(df._data.names) + cols = { + name: ( + df._data[name].copy(deep=deep) + if name in df._data + else column_empty( + dtype=dtypes.get(name, np.float64), + masked=True, + row_count=len(index), + ) + ) + for name in names + } + + result = self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ), + index=index, + ) + + return self._mimic_inplace(result, inplace=inplace) + def reindex( self, labels=None, axis=0, index=None, columns=None, copy=True ): @@ -2866,12 +2907,16 @@ def reindex( if labels is None and index is None and columns is None: return self.copy(deep=copy) - df = self - cols = columns - dtypes = dict(df.dtypes) + dtypes = dict(self.dtypes) idx = labels if index is None and axis in (0, "index") else index - cols = labels if cols is None and axis in (1, "columns") else cols - df = df if cols is None else df[list(set(df.columns) & set(cols))] + cols = ( + labels if columns is None and axis in (1, "columns") else columns + ) + df = ( + self + if cols is None + else 
self[list(set(self._column_names) & set(cols))] + ) result = df._reindex( columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False @@ -2879,36 +2924,6 @@ def reindex( return result - def _set_index( - self, index, to_drop=None, inplace=False, verify_integrity=False, - ): - """Helper for `.set_index` - - Parameters - ---------- - index : Index - The new index to set. - to_drop : list optional, default None - A list of labels indicating columns to drop. - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object). - verify_integrity : boolean, default False - Check for duplicates in the new index. - """ - if not isinstance(index, BaseIndex): - raise ValueError("Parameter index should be type `Index`.") - - df = self if inplace else self.copy(deep=True) - - if verify_integrity and not index.is_unique: - raise ValueError(f"Values in Index are not unique: {index}") - - if to_drop: - df.drop(columns=to_drop, inplace=True) - - df.index = index - return df if not inplace else None - def set_index( self, keys, @@ -3015,7 +3030,7 @@ def set_index( columns_to_add = [] names = [] to_drop = [] - for i, col in enumerate(keys): + for col in keys: # Is column label if is_scalar(col) or isinstance(col, tuple): if col in self.columns: @@ -3073,17 +3088,24 @@ def set_index( elif len(columns_to_add) == 1: idx = cudf.Index(columns_to_add[0], name=names[0]) else: - idf = cudf.DataFrame() - for i, col in enumerate(columns_to_add): - idf[i] = col - idx = cudf.MultiIndex.from_frame(idf, names=names) - - return self._set_index( - index=idx, - to_drop=to_drop, - inplace=inplace, - verify_integrity=verify_integrity, - ) + idx = cudf.MultiIndex._from_data( + {i: col for i, col in enumerate(columns_to_add)} + ) + idx.names = names + + if not isinstance(idx, BaseIndex): + raise ValueError("Parameter index should be type `Index`.") + + df = self if inplace else self.copy(deep=True) + + if verify_integrity and not idx.is_unique: + raise 
ValueError(f"Values in Index are not unique: {idx}") + + if to_drop: + df.drop(columns=to_drop, inplace=True) + + df.index = idx + return df if not inplace else None def reset_index( self, level=None, drop=False, inplace=False, col_level=0, col_fill="" @@ -3146,10 +3168,7 @@ class max_speed "col_fill parameter is not supported yet." ) - if inplace: - result = self - else: - result = self.copy() + result = self if inplace else self.copy() if not drop: if isinstance(self.index, cudf.MultiIndex): @@ -3172,9 +3191,7 @@ class max_speed ): result.insert(0, name, index_column) result.index = RangeIndex(len(self)) - if inplace: - return - else: + if not inplace: return result def take(self, positions, keep_index=True): @@ -3227,12 +3244,12 @@ def insert(self, loc, name, value): name or label of column to be inserted value : Series or array-like """ - num_cols = len(self._data) if name in self._data: raise NameError(f"duplicated column name {name}") + num_cols = len(self._data) if loc < 0: - loc = num_cols + loc + 1 + loc += num_cols + 1 if not (0 <= loc <= num_cols): raise ValueError( @@ -3660,7 +3677,7 @@ def rename( out = DataFrame( index=self.index.replace( to_replace=list(index.keys()), - replacement=list(index.values()), + value=list(index.values()), ) ) else: @@ -3677,21 +3694,11 @@ def rename( return out.copy(deep=copy) def as_gpu_matrix(self, columns=None, order="F"): - """Convert to a matrix in device memory. - - Parameters - ---------- - columns : sequence of str - List of a column names to be extracted. The order is preserved. - If None is specified, all columns are used. - order : 'F' or 'C' - Optional argument to determine whether to return a column major - (Fortran) matrix or a row major (C) matrix. - - Returns - ------- - A (nrow x ncol) numba device ndarray - """ + warnings.warn( + "The as_gpu_matrix method will be removed in a future cuDF " + "release. 
Consider using `to_cupy` instead.", + DeprecationWarning, + ) if columns is None: columns = self._data.names @@ -3735,18 +3742,11 @@ def as_gpu_matrix(self, columns=None, order="F"): return cuda.as_cuda_array(matrix).view(dtype) def as_matrix(self, columns=None): - """Convert to a matrix in host memory. - - Parameters - ---------- - columns : sequence of str - List of a column names to be extracted. The order is preserved. - If None is specified, all columns are used. - - Returns - ------- - A (nrow x ncol) numpy ndarray in "F" order. - """ + warnings.warn( + "The as_matrix method will be removed in a future cuDF " + "release. Consider using `to_numpy` instead.", + DeprecationWarning, + ) return self.as_gpu_matrix(columns=columns).copy_to_host() def one_hot_encoding( @@ -3914,71 +3914,6 @@ def argsort(self, ascending=True, na_position="last"): ) return cudf.Series(inds_col) - @annotate("SORT_INDEX", color="red", domain="cudf_python") - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - ): - """Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. - Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. 
- - Returns - ------- - DataFrame or None - - Examples - -------- - >>> df = cudf.DataFrame( - ... {"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) - >>> df.sort_index(axis=0) - b a - 1 3 2 - 2 1 3 - 3 2 1 - >>> df.sort_index(axis=1) - a b - 1 2 3 - 3 1 2 - 2 3 1 - """ - return super()._sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - ) - def sort_values( self, by, @@ -3990,7 +3925,6 @@ def sort_values( ignore_index=False, ): """ - Sort by the values row-wise. Parameters @@ -4418,7 +4352,7 @@ def merge( sort=False, lsuffix=None, rsuffix=None, - method="hash", + method=None, indicator=False, suffixes=("_x", "_y"), ): @@ -4463,8 +4397,9 @@ def merge( suffixes: Tuple[str, str], defaults to ('_x', '_y') Suffixes applied to overlapping column names on the left and right sides - method : {‘hash’, ‘sort’}, default ‘hash’ - The implementation method to be used for the operation. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. Returns ------- @@ -4526,6 +4461,13 @@ def merge( else: lsuffix, rsuffix = suffixes + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + # Compute merge gdf_result = super()._merge( right, @@ -4536,7 +4478,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -4551,7 +4492,7 @@ def join( lsuffix="", rsuffix="", sort=False, - method="hash", + method=None, ): """Join columns with other DataFrame on index or on a key column. @@ -4565,6 +4506,9 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. 
Returns ------- @@ -4578,6 +4522,13 @@ def join( - *on* is not supported yet due to lack of multi-index support. """ + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + lhs = self rhs = other @@ -4587,7 +4538,6 @@ def join( right_index=True, how=how, suffixes=(lsuffix, rsuffix), - method=method, sort=sort, ) df.index.name = ( @@ -4885,12 +4835,6 @@ def apply( runtime compilation features """ - # libcudacxx tuples are not compatible with nvrtc 11.0 - runtime = cuda.cudadrv.runtime.Runtime() - mjr, mnr = runtime.get_version() - if mjr < 11 or (mjr == 11 and mnr < 1): - raise RuntimeError("DataFrame.apply requires CUDA 11.1+") - for dtype in self.dtypes: if ( isinstance(dtype, cudf.core.dtypes._BaseDtype) @@ -5103,140 +5047,6 @@ def partition_by_hash(self, columns, nparts, keep_index=True): # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method=None, - ): - """ - Replace values given in *to_replace* with *replacement*. - - Parameters - ---------- - to_replace : numeric, str, list-like or dict - Value(s) that will be replaced. - - * numeric or str: - - values equal to *to_replace* will be replaced - with *replacement* - * list of numeric or str: - - If *replacement* is also list-like, - *to_replace* and *replacement* must be of same length. - * dict: - - Dicts can be used to replace different values in different - columns. For example, `{'a': 1, 'z': 2}` specifies that the - value 1 in column `a` and the value 2 in column `z` should be - replaced with replacement*. - - Dicts can be used to specify different replacement values for - different existing values. For example, {'a': 'b', 'y': 'z'} - replaces the value ‘a’ with ‘b’ and ‘y’ with ‘z’. - To use a dict in this way the value parameter should be None. 
- - value : numeric, str, list-like, or dict - Value(s) to replace `to_replace` with. If a dict is provided, then - its keys must match the keys in *to_replace*, and corresponding - values must be compatible (e.g., if they are lists, then they must - match in length). - inplace : bool, default False - If True, in place. - - Raises - ------ - TypeError - - If ``to_replace`` is not a scalar, array-like, dict, or None - - If ``to_replace`` is a dict and value is not a list, dict, - or Series - ValueError - - If a list is passed to ``to_replace`` and ``value`` but they - are not the same length. - - Returns - ------- - result : DataFrame - DataFrame after replacement. - - Examples - -------- - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> df = cudf.DataFrame({'A': [0, 1, 2, 3, 4], - ... 'B': [5, 6, 7, 8, 9], - ... 'C': ['a', 'b', 'c', 'd', 'e']}) - >>> df - A B C - 0 0 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace(0, 5) - A B C - 0 5 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - List-like ``to_replace`` - - >>> df.replace([0, 1, 2, 3], 4) - A B C - 0 4 5 a - 1 4 6 b - 2 4 7 c - 3 4 8 d - 4 4 9 e - >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) - A B C - 0 4 5 a - 1 3 6 b - 2 2 7 c - 3 1 8 d - 4 4 9 e - - dict-like ``to_replace`` - - >>> df.replace({0: 10, 1: 100}) - A B C - 0 10 5 a - 1 100 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace({'A': 0, 'B': 5}, 100) - A B C - 0 100 100 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - """ - if limit is not None: - raise NotImplementedError("limit parameter is not implemented yet") - - if regex: - raise NotImplementedError("regex parameter is not implemented yet") - - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" - ) - - outdf = super().replace(to_replace=to_replace, replacement=value) - - return self._mimic_inplace(outdf, inplace=inplace) - 
def info( self, verbose=None, @@ -5897,9 +5707,9 @@ def to_records(self, index=True): dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: - ret["index"] = self.index.to_array() + ret["index"] = self.index.to_numpy() for col in self._data.names: - ret[col] = self[col].to_array() + ret[col] = self[col].to_numpy() return ret @classmethod diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 28080cbc4c1..e63ba4a4499 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,10 +3,12 @@ from __future__ import annotations import copy +import pickle import warnings from collections import abc from typing import ( Any, + Callable, Dict, MutableMapping, Optional, @@ -24,13 +26,14 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnLike, DataFrameOrSeries +from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_decimal_dtype, is_dict_like, is_integer_dtype, + is_list_like, is_scalar, issubdtype, ) @@ -39,13 +42,16 @@ as_column, build_categorical_column, column_empty, + deserialize_columns, + serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.core.udf.pipeline import compile_or_get from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import find_common_type, is_column_like T = TypeVar("T", bound="Frame") @@ -64,6 +70,21 @@ class Frame(libcudf.table.Table): _data: "ColumnAccessor" + def serialize(self): + header = { + "type-serialized": pickle.dumps(type(self)), + "column_names": pickle.dumps(tuple(self._data.names)), + } + header["columns"], frames = serialize_columns(self._columns) + return header, frames + + @classmethod + def deserialize(cls, header, frames): + 
cls_deserialize = pickle.loads(header["type-serialized"]) + column_names = pickle.loads(header["column_names"]) + columns = deserialize_columns(header["columns"], frames) + return cls_deserialize._from_data(dict(zip(column_names, columns))) + @classmethod def _from_data( cls, @@ -591,6 +612,151 @@ def _empty_like(self, keep_index=True): result._copy_type_metadata(self, include_index=keep_index) return result + @property + def values(self): + """ + Return a CuPy representation of the DataFrame. + + Only the values in the DataFrame will be returned, the axes labels will + be removed. + + Returns + ------- + cupy.ndarray + The values of the DataFrame. + """ + return self.to_cupy() + + @property + def values_host(self): + """ + Return a NumPy representation of the data. + + Only the values in the DataFrame will be returned, the axes labels will + be removed. + + Returns + ------- + numpy.ndarray + A host representation of the underlying data. + """ + return self.to_numpy() + + def __array__(self, dtype=None): + raise TypeError( + "Implicit conversion to a host NumPy array via __array__ is not " + "allowed, To explicitly construct a GPU matrix, consider using " + ".to_cupy()\nTo explicitly construct a host matrix, consider " + "using .to_numpy()." + ) + + def __arrow_array__(self, type=None): + raise TypeError( + "Implicit conversion to a host PyArrow object via __arrow_array__ " + "is not allowed. Consider using .to_arrow()" + ) + + def _to_array( + self, + get_column_values: Callable, + make_empty_matrix: Callable, + dtype: Union[Dtype, None] = None, + na_value=None, + ) -> Union[cupy.ndarray, np.ndarray]: + # Internal function to implement to_cupy and to_numpy, which are nearly + # identical except for the attribute they access to generate values. + + def get_column_values_na(col): + if na_value is not None: + col = col.fillna(na_value) + return get_column_values(col) + + # Early exit for an empty Frame. 
+ ncol = self._num_columns + if ncol == 0: + return make_empty_matrix(shape=(0, 0), dtype=np.dtype("float64")) + + if dtype is None: + dtype = find_common_type( + [col.dtype for col in self._data.values()] + ) + + matrix = make_empty_matrix(shape=(len(self), ncol), dtype=dtype) + for i, col in enumerate(self._data.values()): + # TODO: col.values may fail if there is nullable data or an + # unsupported dtype. We may want to catch and provide a more + # suitable error. + matrix[:, i] = get_column_values_na(col) + return matrix + + def to_cupy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = False, + na_value=None, + ) -> cupy.ndarray: + """Convert the Frame to a CuPy array. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_cupy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, default None + The value to use for missing values. The default value depends on + dtype and the dtypes of the DataFrame columns. + + Returns + ------- + cupy.ndarray + """ + return self._to_array( + (lambda col: col.values.copy()) + if copy + else (lambda col: col.values), + cupy.empty, + dtype, + na_value, + ) + + def to_numpy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> np.ndarray: + """Convert the Frame to a NumPy array. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default True + Whether to ensure that the returned value is not a view on + another array. This parameter must be ``True`` since cuDF must copy + device memory to host to provide a numpy array. + na_value : Any, default None + The value to use for missing values. 
The default value depends on + dtype and the dtypes of the DataFrame columns. + + Returns + ------- + numpy.ndarray + """ + if not copy: + raise ValueError( + "copy=False is not supported because conversion to a numpy " + "array always copies the data." + ) + + return self._to_array( + (lambda col: col.values_host), np.empty, dtype, na_value + ) + def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -1436,10 +1602,29 @@ def _apply(self, func): """ Apply `func` across the rows of the frame. """ - output_dtype, ptx = cudf.core.udf.pipeline.compile_masked_udf( - func, self.dtypes + kernel, retty = compile_or_get(self, func) + + # Mask and data column preallocated + ans_col = cupy.empty(len(self), dtype=retty) + ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") + launch_args = [(ans_col, ans_mask)] + offsets = [] + for col in self._data.values(): + data = col.data + mask = col.mask + if mask is None: + launch_args.append(data) + else: + launch_args.append((data, mask)) + offsets.append(col.offset) + launch_args += offsets + launch_args.append(len(self)) # size + kernel.forall(len(self))(*launch_args) + + result = cudf.Series(ans_col).set_mask( + libcudf.transform.bools_to_mask(ans_mask) ) - result = cudf._lib.transform.masked_udf(self, ptx, output_dtype) + return result def rank( @@ -1599,6 +1784,147 @@ def repeat(self, repeats, axis=None): return self._repeat(repeats) + @annotate("SORT_INDEX", color="red", domain="cudf_python") + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + key=None, + ): + """Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. 
+ level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. + key : callable, optional + If not None, apply the key function to the index values before + sorting. This is similar to the key argument in the builtin + sorted() function, with the notable difference that this key + function should be vectorized. It should expect an Index and return + an Index of the same shape. For MultiIndex inputs, the key is + applied per level. + + Returns + ------- + Frame or None + + Notes + ----- + Difference from pandas: + * Not supporting: kind, sort_remaining=False + + Examples + -------- + **Series** + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> series + 3 a + 2 b + 1 c + 4 d + dtype: object + >>> series.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> series.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object + + **DataFrame** + >>> df = cudf.DataFrame( + ... 
{"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) + >>> df.sort_index(axis=0) + b a + 1 3 2 + 2 1 3 + 3 2 1 + >>> df.sort_index(axis=1) + a b + 1 2 3 + 3 1 2 + 2 3 1 + """ + if kind is not None: + raise NotImplementedError("kind is not yet supported") + + if not sort_remaining: + raise NotImplementedError( + "sort_remaining == False is not yet supported" + ) + + if key is not None: + raise NotImplementedError("key is not yet supported.") + + if axis in (0, "index"): + idx = self.index + if isinstance(idx, cudf.MultiIndex): + if level is None: + midx_data = idx.to_frame(index=False) + else: + # Pandas doesn't handle na_position in case of MultiIndex. + na_position = "first" if ascending is True else "last" + labels = [ + idx._get_level_label(lvl) + for lvl in (level if is_list_like(level) else (level,)) + ] + midx_data = cudf.DataFrame._from_data( + idx._data.select_by_label(labels) + ) + + inds = midx_data.argsort( + ascending=ascending, na_position=na_position + ) + out = self.take(inds) + elif (ascending and idx.is_monotonic_increasing) or ( + not ascending and idx.is_monotonic_decreasing + ): + out = self.copy() + else: + inds = idx.argsort( + ascending=ascending, na_position=na_position + ) + out = self.take(inds) + else: + labels = sorted(self._data.names, reverse=not ascending) + out = self[labels] + + if ignore_index is True: + out = out.reset_index(drop=True) + return self._mimic_inplace(out, inplace=inplace) + def _repeat(self, count): if not is_scalar(count): count = as_column(count) @@ -1639,35 +1965,21 @@ def _shift(self, offset, fill_value=None): zip(self._column_names, data_columns), self._index ) - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU array, consider using " - "cupy.asarray(...)\nTo explicitly construct a " - "host array, consider using .to_array()" - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit 
conversion to a host PyArrow Array via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Array, " - "consider using .to_arrow()" - ) - def round(self, decimals=0, how="half_even"): """ - Round a DataFrame to a variable number of decimal places. + Round to a variable number of decimal places. Parameters ---------- decimals : int, dict, Series - Number of decimal places to round each column to. If an int is - given, round each column to the same number of places. - Otherwise dict and Series round to variable numbers of places. - Column names should be in the keys if `decimals` is a - dict-like, or in the index if `decimals` is a Series. Any - columns not included in `decimals` will be left as is. Elements - of `decimals` which are not columns of the input will be + Number of decimal places to round each column to. This parameter + must be an int for a Series. For a DataFrame, a dict or a Series + are also valid inputs. If an int is given, round each column to the + same number of places. Otherwise dict and Series round to variable + numbers of places. Column names should be in the keys if + `decimals` is a dict-like, or in the index if `decimals` is a + Series. Any columns not included in `decimals` will be left as is. + Elements of `decimals` which are not columns of the input will be ignored. how : str, optional Type of rounding. Can be either "half_even" (default) @@ -1675,12 +1987,23 @@ def round(self, decimals=0, how="half_even"): Returns ------- - DataFrame - A DataFrame with the affected columns rounded to the specified - number of decimal places. + Series or DataFrame + A Series or DataFrame with the affected columns rounded to the + specified number of decimal places. Examples -------- + **Series** + + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + + **DataFrame** + >>> df = cudf.DataFrame( [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... 
columns=['dogs', 'cats'] @@ -1729,34 +2052,24 @@ def round(self, decimals=0, how="half_even"): if isinstance(decimals, cudf.Series): decimals = decimals.to_pandas() - if isinstance(decimals, (dict, pd.Series)): - if ( - isinstance(decimals, pd.Series) - and not decimals.index.is_unique - ): + if isinstance(decimals, pd.Series): + if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") - - cols = { - name: col.round(decimals[name], how=how) - if ( - name in decimals.keys() - and _is_non_decimal_numeric_dtype(col.dtype) - ) - else col.copy(deep=True) - for name, col in self._data.items() - } + decimals = decimals.to_dict() elif isinstance(decimals, int): - cols = { - name: col.round(decimals, how=how) - if _is_non_decimal_numeric_dtype(col.dtype) - else col.copy(deep=True) - for name, col in self._data.items() - } - else: + decimals = {name: decimals for name in self._column_names} + elif not isinstance(decimals, abc.Mapping): raise TypeError( "decimals must be an integer, a dict-like or a Series" ) + cols = { + name: col.round(decimals[name], how=how) + if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) + else col.copy(deep=True) + for name, col in self._data.items() + } + return self.__class__._from_data( data=cudf.core.column_accessor.ColumnAccessor( cols, @@ -2170,8 +2483,219 @@ def drop_duplicates( result._copy_type_metadata(self) return result - def replace(self, to_replace: Any, replacement: Any) -> Frame: - if not (to_replace is None and replacement is None): + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method=None, + ): + """Replace values given in ``to_replace`` with ``value``. + + Parameters + ---------- + to_replace : numeric, str or list-like + Value(s) to replace. 
+ + * numeric or str: + - values equal to ``to_replace`` will be replaced + with ``value`` + * list of numeric or str: + - If ``value`` is also list-like, ``to_replace`` and + ``value`` must be of same length. + * dict: + - Dicts can be used to specify different replacement values + for different existing values. For example, {'a': 'b', + 'y': 'z'} replaces the value ‘a’ with ‘b’ and + ‘y’ with ‘z’. + To use a dict in this way the ``value`` parameter should + be ``None``. + value : scalar, dict, list-like, str, default None + Value to replace any values matching ``to_replace`` with. + inplace : bool, default False + If True, in place. + + See also + -------- + Series.fillna + + Raises + ------ + TypeError + - If ``to_replace`` is not a scalar, array-like, dict, or None + - If ``to_replace`` is a dict and value is not a list, dict, + or Series + ValueError + - If a list is passed to ``to_replace`` and ``value`` but they + are not the same length. + + Returns + ------- + result : Series + Series after replacement. The mask and index are preserved. 
+ + Notes + ----- + Parameters that are currently not supported are: `limit`, `regex`, + `method` + + Examples + -------- + **Series** + + Scalar ``to_replace`` and ``value`` + + >>> import cudf + >>> s = cudf.Series([0, 1, 2, 3, 4]) + >>> s + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + >>> s.replace(0, 5) + 0 5 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + + List-like ``to_replace`` + + >>> s.replace([1, 2], 10) + 0 0 + 1 10 + 2 10 + 3 3 + 4 4 + dtype: int64 + + dict-like ``to_replace`` + + >>> s.replace({1:5, 3:50}) + 0 0 + 1 5 + 2 2 + 3 50 + 4 4 + dtype: int64 + >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) + >>> s + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace({'a': None}) + 0 b + 1 + 2 + 3 b + 4 + dtype: object + + If there is a mimatch in types of the values in + ``to_replace`` & ``value`` with the actual series, then + cudf exhibits different behaviour with respect to pandas + and the pairs are ignored silently: + + >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) + >>> s + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace('a', 1) + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace(['a', 'c'], [1, 2]) + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + + **DataFrame** + + Scalar ``to_replace`` and ``value`` + + >>> import cudf + >>> df = cudf.DataFrame({'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 
'C': ['a', 'b', 'c', 'd', 'e']}) + >>> df + A B C + 0 0 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + List-like ``to_replace`` + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + + dict-like ``to_replace`` + + >>> df.replace({0: 10, 1: 100}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': 0, 'B': 5}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + """ + if limit is not None: + raise NotImplementedError("limit parameter is not implemented yet") + + if regex: + raise NotImplementedError("regex parameter is not implemented yet") + + if method not in ("pad", None): + raise NotImplementedError( + "method parameter is not implemented yet" + ) + + if not (to_replace is None and value is None): copy_data = self._data.copy(deep=False) ( all_na_per_column, @@ -2179,7 +2703,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: replacements_per_column, ) = _get_replacement_values_for_columns( to_replace=to_replace, - value=replacement, + value=value, columns_dtype_map={ col: copy_data._data[col].dtype for col in copy_data._data }, @@ -2193,7 +2717,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: all_na_per_column[name], ) except (KeyError, OverflowError): - # We need to create a deep copy if : + # We need to create a deep copy if: # i. 
`find_and_replace` was not successful or any of # `to_replace_per_column`, `replacements_per_column`, # `all_na_per_column` don't contain the `name` @@ -2206,7 +2730,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: result = self._from_data(copy_data, self._index) - return result + return self._mimic_inplace(result, inplace=inplace) def _copy_type_metadata( self, other: Frame, include_index: bool = True @@ -3159,7 +3683,6 @@ def _merge( right_index=False, how="inner", sort=False, - method="hash", indicator=False, suffixes=("_x", "_y"), ): @@ -3182,7 +3705,6 @@ def _merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -3230,85 +3752,6 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices - def _reindex( - self, columns, dtypes=None, deep=False, index=None, inplace=False - ): - """ - Helper for `.reindex` - - Parameters - ---------- - columns : array-like - The list of columns to select from the Frame, - if ``columns`` is a superset of ``Frame.columns`` new - columns are created. - dtypes : dict - Mapping of dtypes for the empty columns being created. - deep : boolean, optional, default False - Whether to make deep copy or shallow copy of the columns. - index : Index or array-like, default None - The ``index`` to be used to reindex the Frame with. - inplace : bool, default False - Whether to perform the operation in place on the data. 
- - Returns - ------- - DataFrame - """ - if dtypes is None: - dtypes = {} - - df = self - if index is not None: - index = cudf.core.index.as_index(index) - - if isinstance(index, cudf.MultiIndex): - idx_dtype_match = all( - left_dtype == right_dtype - for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), - ) - ) - else: - idx_dtype_match = df.index.dtype == index.dtype - - if not idx_dtype_match: - columns = columns if columns is not None else list(df.columns) - df = cudf.DataFrame() - else: - df = cudf.DataFrame(None, index).join( - df, how="left", sort=True - ) - # double-argsort to map back from sorted to unsorted positions - df = df.take(index.argsort(ascending=True).argsort()) - - index = index if index is not None else df.index - names = columns if columns is not None else list(df.columns) - cols = { - name: ( - df._data[name].copy(deep=deep) - if name in df._data - else column_empty( - dtype=dtypes.get(name, np.float64), - masked=True, - row_count=len(index), - ) - ) - for name in names - } - - result = self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=index, - ) - - return self._mimic_inplace(result, inplace=inplace) - def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -4846,55 +5289,27 @@ def _column(self, value): @property def values(self): - """ - Return a CuPy representation of the data. - - Returns - ------- - out : cupy.ndarray - A device representation of the underlying data. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values - array([ 1, -10, 100, 20]) - >>> type(ser.values) - - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values - array([ 1, -10, 100, 20]) - >>> type(index.values) - - """ return self._column.values @property def values_host(self): - """ - Return a NumPy representation of the data. + return self._column.values_host - Returns - ------- - out : numpy.ndarray - A host representation of the underlying data. + def to_cupy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> cupy.ndarray: + return super().to_cupy(dtype, copy, na_value).flatten() - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values_host - array([ 1, -10, 100, 20]) - >>> type(ser.values_host) - - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values_host - array([ 1, -10, 100, 20]) - >>> type(index.values_host) - - """ - return self._column.values_host + def to_numpy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> np.ndarray: + return super().to_numpy(dtype, copy, na_value).flatten() def tolist(self): @@ -4906,38 +5321,14 @@ def tolist(self): to_list = tolist + # TODO: When this method is removed we can also remove + # ColumnBase.to_gpu_array. def to_gpu_array(self, fillna=None): - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : str or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Returns - ------- - numba.DeviceNDArray - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 20, 30, 40, 50]) - >>> s - 0 10 - 1 20 - 2 30 - 3 40 - 4 50 - dtype: int64 - >>> s.to_gpu_array() - - """ + warnings.warn( + "The to_gpu_array method will be removed in a future cuDF " + "release. 
Consider using `to_cupy` instead.", + DeprecationWarning, + ) return self._column.to_gpu_array(fillna=fillna) @classmethod diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6414d4a7e84..49a5e5e2143 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,6 +4,7 @@ import math import pickle +import warnings from numbers import Number from typing import ( Any, @@ -601,6 +602,23 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + @classmethod + def deserialize(cls, header, frames): + if "index_column" in header: + warnings.warn( + "Index objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["columns"] = [header.pop("index_column")] + header["column_names"] = pickle.dumps( + [pickle.loads(header["name"])] + ) + + return super().deserialize(header, frames) + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed @@ -852,8 +870,8 @@ def get_loc(self, key, method=None, tolerance=None): # Not sorted and not unique. Return a boolean mask mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - mask[cupy.array(true_inds)] = True + true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask[true_inds] = True return mask def __sizeof__(self): @@ -1597,6 +1615,27 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", + ... 
"1999-12-31 18:40:00"]) + >>> gIndex.isocalendar() + year week day + 2020-05-31 08:00:00 2020 22 7 + 1999-12-31 18:40:00 1999 52 5 + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) @@ -2108,7 +2147,9 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) def to_pandas(self): - return pd.Index(self.to_array(), name=self.name, dtype="object") + return pd.Index( + self.to_numpy(na_value=None), name=self.name, dtype="object" + ) def take(self, indices): return self._values[indices] diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 1d1f661779f..cc9c0fb66da 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -125,7 +125,7 @@ def _match_join_keys( else: warnings.warn( f"Can't safely cast column from {rtype} to {ltype}, " - "upcasting to {common_type}." + f"upcasting to {common_type}." 
) return lcol.astype(common_type), rcol.astype(common_type) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 276038146e1..55540d362ac 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -29,7 +29,6 @@ def merge( right_index, how, sort, - method, indicator, suffixes, ): @@ -47,7 +46,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -87,7 +85,6 @@ def __init__( right_index, how, sort, - method, indicator, suffixes, ): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc97c72db88..941783ecee7 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -854,17 +854,23 @@ def take(self, indices): return result def serialize(self): - header = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["names"] = pickle.dumps(self.names) - - header["columns"], frames = column.serialize_columns(self._columns) - + header, frames = super().serialize() + # Overwrite the names in _data with the true names. + header["column_names"] = pickle.dumps(self.names) return header, frames @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) + if "names" in header: + warnings.warn( + "MultiIndex objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. 
Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["column_names"] = header["names"] + column_names = pickle.loads(header["column_names"]) if "source_data" in header: warnings.warn( "MultiIndex objects serialized in cudf version " @@ -874,11 +880,12 @@ def deserialize(cls, header, frames): DeprecationWarning, ) df = cudf.DataFrame.deserialize(header["source_data"], frames) - obj = cls.from_frame(df) - return obj._set_names(names) - columns = column.deserialize_columns(header["columns"], frames) - obj = cls._from_data(dict(zip(range(0, len(names)), columns))) - return obj._set_names(names) + return cls.from_frame(df)._set_names(column_names) + + # Spoof the column names to construct the frame, then set manually. + header["column_names"] = pickle.dumps(range(0, len(column_names))) + obj = super().deserialize(header, frames) + return obj._set_names(column_names) def __getitem__(self, index): if isinstance(index, int): @@ -1633,9 +1640,7 @@ def get_loc(self, key, method=None, tolerance=None): # the range is returned. 
return slice(lower_bound, upper_bound) - true_inds = cupy.array( - sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - ) + true_inds = sort_inds.slice(lower_bound, upper_bound).values true_inds = _maybe_indices_to_slice(true_inds) if isinstance(true_inds, slice): return true_inds diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 594f9fc42d0..590ac077e97 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -331,21 +331,6 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - def serialize(self): - header = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) - header["index"], index_frames = self._index.serialize() - header["index_frame_count"] = len(index_frames) - frames.extend(index_frames) - - header["column"], column_frames = self._column.serialize() - header["column_frame_count"] = len(column_frames) - frames.extend(column_frames) - - header["name"] = pickle.dumps(self.name) - return header, frames - @property def dt(self): """ @@ -374,20 +359,42 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) + def serialize(self): + header, frames = super().serialize() + + header["index"], index_frames = self._index.serialize() + header["index_frame_count"] = len(index_frames) + # For backwards compatibility with older versions of cuDF, index + # columns are placed before data columns. + frames = index_frames + frames + + return header, frames + @classmethod def deserialize(cls, header, frames): + if "column" in header: + warnings.warn( + "Series objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. 
Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["columns"] = [header.pop("column")] + header["column_names"] = pickle.dumps( + [pickle.loads(header["name"])] + ) + index_nframes = header["index_frame_count"] + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) + idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - name = pickle.loads(header["name"]) + obj._index = index - frames = frames[index_nframes:] - - column_nframes = header["column_frame_count"] - col_typ = pickle.loads(header["column"]["type-serialized"]) - column = col_typ.deserialize(header["column"], frames[:column_nframes]) - - return cls._from_data({name: column}, index=index) + return obj def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -738,25 +745,6 @@ def set_index(self, index): index = index if isinstance(index, BaseIndex) else as_index(index) return self._from_data(self._data, index, self.name) - def as_index(self): - """Returns a new Series with a RangeIndex. - - Examples - ---------- - >>> s = cudf.Series([1,2,3], index=['a','b','c']) - >>> s - a 1 - b 2 - c 3 - dtype: int64 - >>> s.as_index() - 0 1 - 1 2 - 2 3 - dtype: int64 - """ - return self.set_index(RangeIndex(len(self))) - def to_frame(self, name=None): """Convert Series into a DataFrame @@ -800,57 +788,8 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) def set_mask(self, mask, null_count=None): - """Create new Series by setting a mask array. - - This will override the existing mask. The returned Series will - reference the same data buffer as this Series. - - Parameters - ---------- - mask : 1D array-like - The null-mask. Valid values are marked as ``1``; otherwise ``0``. 
- The mask bit given the data index ``idx`` is computed as:: - - (mask[idx // 8] >> (idx % 8)) & 1 - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - - Returns - ------- - Series - A new series with the applied mask. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5]) - >>> ref_array = cudf.Series([10, None, 11, None, 16]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - >>> ref_array - 0 10 - 1 - 2 11 - 3 - 4 16 - dtype: int64 - >>> series.set_mask(ref_array._column.mask) - 0 1 - 1 - 2 3 - 3 - 4 5 - dtype: int64 - """ warnings.warn( - "Series.set_mask is deprecated and will be removed " - "in the future.", + "Series.set_mask is deprecated and will be removed in the future.", DeprecationWarning, ) return self._from_data( @@ -2540,43 +2479,13 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) + # TODO: When this method is removed we can also remove ColumnBase.to_array. def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Returns - ------- - numpy.ndarray - A numpy array representation of the elements in the Series. - - Notes - ----- - If ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> array = series.to_array() - >>> array - array([10, 11, 12, 13, 14]) - >>> type(array) - - """ + warnings.warn( + "The to_array method will be removed in a future cuDF " + "release. 
Consider using `to_numpy` instead.", + DeprecationWarning, + ) return self._column.to_array(fillna=fillna) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): @@ -2912,81 +2821,10 @@ def argsort(self, ascending=True, na_position="last"): """ return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - ): - """ - Sort by the index. - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - Axis to direct sorting. This can only be 0 for Series. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. - Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. - - Returns - ------- - Series - The original Series sorted by the labels. 
- - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) - >>> series - 3 a - 2 b - 1 c - 4 d - dtype: object - >>> series.sort_index() - 1 c - 2 b - 3 a - 4 d - dtype: object - - Sort Descending - - >>> series.sort_index(ascending=False) - 4 d - 3 a - 2 b - 1 c - dtype: object - """ - return super()._sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - ) + def sort_index(self, axis=0, *args, **kwargs): + if axis not in (0, "index"): + raise ValueError("Only axis=0 is valid for Series.") + return super().sort_index(axis=axis, *args, **kwargs) def sort_values( self, @@ -3217,170 +3055,14 @@ def _sort(self, ascending=True, na_position="last"): sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method=None, - ): - """ - Replace values given in ``to_replace`` with ``value``. - - Parameters - ---------- - to_replace : numeric, str or list-like - Value(s) to replace. - - * numeric or str: - - values equal to ``to_replace`` will be replaced - with ``value`` - * list of numeric or str: - - If ``value`` is also list-like, ``to_replace`` and - ``value`` must be of same length. - * dict: - - Dicts can be used to specify different replacement values - for different existing values. For example, {'a': 'b', - 'y': 'z'} replaces the value ‘a’ with ‘b’ and - ‘y’ with ‘z’. - To use a dict in this way the ``value`` parameter should - be ``None``. - value : scalar, dict, list-like, str, default None - Value to replace any values matching ``to_replace`` with. - inplace : bool, default False - If True, in place. 
- - See also - -------- - Series.fillna - - Raises - ------ - TypeError - - If ``to_replace`` is not a scalar, array-like, dict, or None - - If ``to_replace`` is a dict and value is not a list, dict, - or Series - ValueError - - If a list is passed to ``to_replace`` and ``value`` but they - are not the same length. - - Returns - ------- - result : Series - Series after replacement. The mask and index are preserved. - - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - - Examples - -------- - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> s = cudf.Series([0, 1, 2, 3, 4]) - >>> s - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - >>> s.replace(0, 5) - 0 5 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - - List-like ``to_replace`` - - >>> s.replace([1, 2], 10) - 0 0 - 1 10 - 2 10 - 3 3 - 4 4 - dtype: int64 - - dict-like ``to_replace`` - - >>> s.replace({1:5, 3:50}) - 0 0 - 1 5 - 2 2 - 3 50 - 4 4 - dtype: int64 - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace({'a': None}) - 0 b - 1 - 2 - 3 b - 4 - dtype: object - - If there is a mimatch in types of the values in - ``to_replace`` & ``value`` with the actual series, then - cudf exhibits different behaviour with respect to pandas - and the pairs are ignored silently: - - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace('a', 1) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace(['a', 'c'], [1, 2]) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - """ - if limit is not None: - raise NotImplementedError("limit parameter is not implemented yet") - - if regex: - raise NotImplementedError("regex parameter is not implemented yet") - - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" - ) - + def replace(self, to_replace=None, value=None, *args, **kwargs): if 
is_dict_like(to_replace) and value is not None: raise ValueError( "Series.replace cannot use dict-like to_replace and non-None " "value" ) - result = super().replace(to_replace=to_replace, replacement=value) - - return self._mimic_inplace(result, inplace=inplace) + return super().replace(to_replace, value, *args, **kwargs) def update(self, other): """ @@ -3487,35 +3169,10 @@ def update(self, other): self.mask(mask, other, inplace=True) def reverse(self): - """ - Reverse the Series - - Returns - ------- - Series - A reversed Series. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - >>> series.reverse() - 5 6 - 4 5 - 3 4 - 2 3 - 1 2 - 0 1 - dtype: int64 - """ + warnings.warn( + "Series.reverse is deprecated and will be removed in the future.", + DeprecationWarning, + ) rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) return self._from_data( {self.name: self._column[rinds]}, self.index._values[rinds] @@ -3684,6 +3341,94 @@ def _return_sentinel_series(): return codes # UDF related + def apply(self, func, convert_dtype=True, args=(), **kwargs): + """ + Apply a scalar function to the values of a Series. + + Similar to `pandas.Series.apply. Applies a user + defined function elementwise over a series. + + Parameters + ---------- + func : function + Scalar Python function to apply. + convert_dtype : bool, default True + In cuDF, this parameter is always True. Because + cuDF does not support arbitrary object dtypes, + the result will always be the common type as determined + by numba based on the function logic and argument types. + See examples for details. + args : tuple + Not supported + **kwargs + Not supported + + Notes + ----- + UDFs are cached in memory to avoid recompilation. The first + call to the UDF will incur compilation overhead. 
+ + Examples + -------- + + Apply a basic function to a series + >>> sr = cudf.Series([1,2,3]) + >>> def f(x): + ... return x + 1 + >>> sr.apply(f) + 0 2 + 1 3 + 2 4 + dtype: int64 + + Apply a basic function to a series with nulls + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... return x + 1 + >>> sr.apply(f) + 0 2 + 1 + 2 4 + dtype: int64 + + Use a function that does something conditionally, + based on if the value is or is not null + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... if x is cudf.NA: + ... return 42 + ... else: + ... return x - 1 + >>> sr.apply(f) + 0 0 + 1 42 + 2 2 + dtype: int64 + + Results will be upcast to the common dtype required + as derived from the UDFs logic. Note that this means + the common type will be returned even if such data + is passed that would not result in any values of that + dtype. + + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... return x + 1.5 + >>> sr.apply(f) + 0 2.5 + 1 + 2 4.5 + dtype: float64 + + + + """ + if args or kwargs: + raise ValueError( + "UDFs using *args or **kwargs are not yet supported." + ) + + return super()._apply(func) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -3895,39 +3640,9 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) def round(self, decimals=0, how="half_even"): - """ - Round each value in a Series to the given number of decimals. - - Parameters - ---------- - decimals : int, default 0 - Number of decimal places to round to. If decimals is negative, - it specifies the number of positions to the left of the decimal - point. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series - Rounded values of the Series. 
- - Examples - -------- - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - """ - return Series( - self._column.round(decimals=decimals, how=how), - name=self.name, - index=self.index, - dtype=self.dtype, - ) + if not isinstance(decimals, int): + raise ValueError("decimals must be an int") + return super().round(decimals, how) def cov(self, other, min_periods=None): """ @@ -4554,7 +4269,7 @@ def _describe_numeric(self): ) data = ( [self.count(), self.mean(), self.std(), self.min()] - + self.quantile(percentiles).to_array(fillna="pandas").tolist() + + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist() + [self.max()] ) data = _format_stats_values(data) @@ -4580,7 +4295,7 @@ def _describe_timedelta(self): ] + self.quantile(percentiles) .astype("str") - .to_array(fillna="pandas") + .to_numpy(na_value=None) .tolist() + [str(pd.Timedelta(self.max()))] ) @@ -4632,7 +4347,7 @@ def _describe_timestamp(self): ] + self.quantile(percentiles) .astype("str") - .to_array(fillna="pandas") + .to_numpy(na_value=None) .tolist() + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))] ) @@ -4946,7 +4661,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=False, suffixes=suffixes, ) @@ -5509,6 +5223,49 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> ser = cudf.Series(pd.date_range(start="2021-07-25", + ... 
end="2021-07-30")) + >>> ser.dt.isocalendar() + year week day + 0 2021 29 7 + 1 2021 30 1 + 2 2021 30 2 + 3 2021 30 3 + 4 2021 30 4 + 5 2021 30 5 + >>> ser.dt.isocalendar().week + 0 29 + 1 30 + 2 30 + 3 30 + 4 30 + 5 30 + Name: week, dtype: object + + >>> serIndex = cudf.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> serIndex.dt.isocalendar() + year week day + 0 2009 53 5 + 1 + >>> serIndex.dt.isocalendar().year + 0 2009 + 1 + Name: year, dtype: object + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + @property def is_month_start(self): """ @@ -5812,9 +5569,8 @@ def strftime(self, date_format, *args, **kwargs): Notes ----- - The following date format identifiers are not yet supported: ``%a``, - ``%A``, ``%w``, ``%b``, ``%B``, ``%U``, ``%W``, ``%c``, ``%x``, - ``%X``, ``%G``, ``%u``, ``%V`` + The following date format identifiers are not yet + supported: ``%c``, ``%x``,``%X`` Examples -------- @@ -5853,19 +5609,9 @@ def strftime(self, date_format, *args, **kwargs): # once https://github.com/rapidsai/cudf/issues/5991 # is implemented not_implemented_formats = { - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", "%c", "%x", "%X", - "%G", - "%u", - "%V", } for d_format in not_implemented_formats: if d_format in date_format: @@ -5875,7 +5621,6 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." 
) - str_col = self.series._column.as_string_column( dtype="str", format=date_format ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index b0fb2fb4274..e17c58d1db7 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,6 +4,7 @@ from typing import Sequence, Union import numpy as np +import pandas as pd from pandas.core.tools.datetimes import _unit_map import cudf @@ -221,8 +222,8 @@ def to_datetime( format=format, ) return as_index(col, name=arg.name) - elif isinstance(arg, cudf.Series): - col = arg._column + elif isinstance(arg, (cudf.Series, pd.Series)): + col = column.as_column(arg) col = _process_col( col=col, unit=unit, @@ -652,3 +653,23 @@ def _isin_datetimelike( res = lhs._obtain_isin_result(rhs) return res + + +def _to_iso_calendar(arg): + formats = ["%G", "%V", "%u"] + if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): + raise AttributeError( + "Can only use .isocalendar accessor with series or index" + ) + if isinstance(arg, cudf.Index): + iso_params = [ + arg._column.as_string_column(arg._values.dtype, fmt) + for fmt in formats + ] + index = arg._column + elif isinstance(arg.series, cudf.Series): + iso_params = [arg.strftime(fmt) for fmt in formats] + index = arg.series.index + + data = dict(zip(["year", "week", "day"], iso_params)) + return cudf.DataFrame(data, index=index, dtype=np.int32) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 776b9e0a47c..8c69b94cc84 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -165,7 +165,9 @@ def to_numeric(arg, errors="raise", downcast=None): if isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col) else: - col = col.fillna(col.default_na_value()) + if col.has_nulls: + # To match pandas, always return a floating type filled with nan. 
+ col = col.astype(float).fillna(np.nan) return col.values diff --git a/python/cudf/cudf/core/udf/classes.py b/python/cudf/cudf/core/udf/api.py similarity index 69% rename from python/cudf/cudf/core/udf/classes.py rename to python/cudf/cudf/core/udf/api.py index fe2fbd9daad..23b4d02c57d 100644 --- a/python/cudf/cudf/core/udf/classes.py +++ b/python/cudf/cudf/core/udf/api.py @@ -14,3 +14,11 @@ class Masked: def __init__(self, value, valid): self.value = value self.valid = valid + + +def pack_return(masked_or_scalar): + # Blank function to give us something for the typing and + # lowering to grab onto. Just a dummy function for us to + # call within kernels that will get replaced later by the + # lowered implementation + pass diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py index 1467a61f215..3986abc2bf0 100644 --- a/python/cudf/cudf/core/udf/lowering.py +++ b/python/cudf/cudf/core/udf/lowering.py @@ -9,11 +9,10 @@ ) from numba.extending import lower_builtin, types +from cudf.core.udf import api +from cudf.core.udf._ops import arith_ops, comparison_ops from cudf.core.udf.typing import MaskedType, NAType -from . import classes -from ._ops import arith_ops, comparison_ops - @cuda_lowering_registry.lower_constant(NAType) def constant_na(context, builder, ty, pyval): @@ -154,9 +153,8 @@ def register_const_op(op): to_lower_op = make_const_op(op) cuda_lower(op, MaskedType, types.Number)(to_lower_op) cuda_lower(op, types.Number, MaskedType)(to_lower_op) - - # to_lower_op_reflected = make_reflected_const_op(op) - # cuda_lower(op, types.Number, MaskedType)(to_lower_op_reflected) + cuda_lower(op, MaskedType, types.Boolean)(to_lower_op) + cuda_lower(op, types.Boolean, MaskedType)(to_lower_op) # register all lowering at init @@ -194,6 +192,24 @@ def masked_scalar_is_null_impl(context, builder, sig, args): return builder.load(result) +# Main kernel always calls `pack_return` on whatever the user defined +# function returned. 
This returns the same data if its already a `Masked` +# else packs it up into a new one that is valid from the get go +@cuda_lower(api.pack_return, MaskedType) +def pack_return_masked_impl(context, builder, sig, args): + return args[0] + + +@cuda_lower(api.pack_return, types.Boolean) +@cuda_lower(api.pack_return, types.Number) +def pack_return_scalar_impl(context, builder, sig, args): + outdata = cgutils.create_struct_proxy(sig.return_type)(context, builder) + outdata.value = args[0] + outdata.valid = context.get_constant(types.boolean, 1) + + return outdata._getvalue() + + @cuda_lower(operator.truth, MaskedType) def masked_scalar_truth_impl(context, builder, sig, args): indata = cgutils.create_struct_proxy(MaskedType(types.boolean))( @@ -253,7 +269,7 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # Masked constructor for use in a kernel for testing -@lower_builtin(classes.Masked, types.Number, types.boolean) +@lower_builtin(api.Masked, types.Number, types.boolean) def masked_constructor(context, builder, sig, args): ty = sig.return_type value, valid = args diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index c7b8be92c00..7f3aa7baa93 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -1,28 +1,40 @@ +import cachetools +import numpy as np +from numba import cuda from numba.np import numpy_support +from numba.types import Tuple, boolean, int64, void from nvtx import annotate +from cudf.core.udf.api import Masked, pack_return from cudf.core.udf.typing import MaskedType from cudf.utils import cudautils +libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) +MASK_BITSIZE = np.dtype("int32").itemsize * 8 +precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) + @annotate("NUMBA JIT", color="green", domain="cudf_python") -def compile_masked_udf(func, dtypes): +def get_udf_return_type(func, dtypes): """ - Generate an inlineable PTX function 
that will be injected into - a variadic kernel inside libcudf - - assume all input types are `MaskedType(input_col.dtype)` and then - compile the requestied PTX function as a function over those types + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that a `MaskedType(dtype)` is passed to the function for each + input dtype. """ to_compiler_sig = tuple( MaskedType(arg) for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes) ) - # Get the inlineable PTX function - ptx, numba_output_type = cudautils.compile_udf(func, to_compiler_sig) - numpy_output_type = numpy_support.as_dtype(numba_output_type.value_type) + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. + ptx, output_type = cudautils.compile_udf(func, to_compiler_sig) + + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type - return numpy_output_type, ptx + return numba_output_type def nulludf(func): @@ -50,3 +62,159 @@ def wrapper(*args): return to_udf_table._apply(func) return wrapper + + +def masked_array_type_from_col(col): + """ + Return a type representing a tuple of arrays, + the first element an array of the numba type + corresponding to `dtype`, and the second an + array of bools representing a mask. 
+ """ + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + if col.mask is None: + return nb_scalar_ty[::1] + else: + return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + + +def construct_signature(df, return_type): + """ + Build the signature of numba types that will be used to + actually JIT the kernel itself later, accounting for types + and offsets + """ + + # Tuple of arrays, first the output data array, then the mask + return_type = Tuple((return_type[::1], boolean[::1])) + offsets = [] + sig = [return_type] + for col in df._data.values(): + sig.append(masked_array_type_from_col(col)) + offsets.append(int64) + + # return_type + data,masks + offsets + size + sig = void(*(sig + offsets + [int64])) + + return sig + + +@cuda.jit(device=True) +def mask_get(mask, pos): + return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 + + +kernel_template = """\ +def _kernel(retval, {input_columns}, {input_offsets}, size): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: +{masked_input_initializers} + ret = {user_udf_call} + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" + +unmasked_input_initializer_template = """\ + d_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], True) +""" + +masked_input_initializer_template = """\ + d_{idx}, m_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) +""" + + +def _define_function(df, scalar_return=False): + # Create argument list for kernel + input_columns = ", ".join([f"input_col_{i}" for i in range(len(df._data))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(df._data))]) + + # Create argument list to pass to device function + args = ", ".join([f"masked_{i}" for i in range(len(df._data))]) + user_udf_call = f"f_({args})" + + # Generate the initializers for each device function argument + initializers = [] + for i, col in enumerate(df._data.values()): + idx = 
str(i) + if col.mask is not None: + template = masked_input_initializer_template + else: + template = unmasked_input_initializer_template + + initializer = template.format(idx=idx) + + initializers.append(initializer) + + masked_input_initializers = "\n".join(initializers) + + # Incorporate all of the above into the kernel code template + d = { + "input_columns": input_columns, + "input_offsets": input_offsets, + "masked_input_initializers": masked_input_initializers, + "user_udf_call": user_udf_call, + } + + return kernel_template.format(**d) + + +@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +def compile_or_get(df, f): + """ + Return a compiled kernel in terms of MaskedTypes that launches a + kernel equivalent of `f` for the dtypes of `df`. The kernel uses + a thread for each row and calls `f` using that rows data / mask + to produce an output value and output valdity for each row. + + If the UDF has already been compiled for this requested dtypes, + a cached version will be returned instead of running compilation. 
+ + """ + + # check to see if we already compiled this function + frame_dtypes = tuple(col.dtype for col in df._data.values()) + cache_key = ( + *cudautils.make_cache_key(f, frame_dtypes), + *(col.mask is None for col in df._data.values()), + ) + if precompiled.get(cache_key) is not None: + kernel, scalar_return_type = precompiled[cache_key] + return kernel, scalar_return_type + + numba_return_type = get_udf_return_type(f, frame_dtypes) + + _is_scalar_return = not isinstance(numba_return_type, MaskedType) + scalar_return_type = ( + numba_return_type + if _is_scalar_return + else numba_return_type.value_type + ) + + sig = construct_signature(df, scalar_return_type) + f_ = cuda.jit(device=True)(f) + + # Dict of 'local' variables into which `_kernel` is defined + local_exec_context = {} + global_exec_context = { + "f_": f_, + "cuda": cuda, + "Masked": Masked, + "mask_get": mask_get, + "pack_return": pack_return, + } + exec( + _define_function(df, scalar_return=_is_scalar_return), + global_exec_context, + local_exec_context, + ) + # The python function definition representing the kernel + _kernel = local_exec_context["_kernel"] + kernel = cuda.jit(sig)(_kernel) + scalar_return_type = numpy_support.as_dtype(scalar_return_type) + precompiled[cache_key] = (kernel, scalar_return_type) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 6e026412f24..042d97db838 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -17,8 +17,8 @@ from numba.cuda.cudadecl import registry as cuda_decl_registry from pandas._libs.missing import NAType as _NAType -from . import classes -from ._ops import arith_ops, comparison_ops +from cudf.core.udf import api +from cudf.core.udf._ops import arith_ops, comparison_ops class MaskedType(types.Type): @@ -101,7 +101,7 @@ def __eq__(self, other): # For typing a Masked constant value defined outside a kernel (e.g. 
captured in # a closure). -@typeof_impl.register(classes.Masked) +@typeof_impl.register(api.Masked) def typeof_masked(val, c): return MaskedType(typeof(val.value)) @@ -110,7 +110,7 @@ def typeof_masked(val, c): # type in a kernel. @cuda_decl_registry.register class MaskedConstructor(ConcreteTemplate): - key = classes.Masked + key = api.Masked cases = [ nb_signature(MaskedType(t), t, types.boolean) @@ -123,20 +123,20 @@ class MaskedConstructor(ConcreteTemplate): make_attribute_wrapper(MaskedType, "valid", "valid") -# Typing for `classes.Masked` +# Typing for `api.Masked` @cuda_decl_registry.register_attr class ClassesTemplate(AttributeTemplate): - key = types.Module(classes) + key = types.Module(api) def resolve_Masked(self, mod): return types.Function(MaskedConstructor) -# Registration of the global is also needed for Numba to type classes.Masked -cuda_decl_registry.register_global(classes, types.Module(classes)) -# For typing bare Masked (as in `from .classes import Masked` +# Registration of the global is also needed for Numba to type api.Masked +cuda_decl_registry.register_global(api, types.Module(api)) +# For typing bare Masked (as in `from .api import Masked` cuda_decl_registry.register_global( - classes.Masked, types.Function(MaskedConstructor) + api.Masked, types.Function(MaskedConstructor) ) @@ -247,10 +247,10 @@ def generic(self, args, kws): # In the case of op(Masked, scalar), we resolve the type between # the Masked value_type and the scalar's type directly if isinstance(args[0], MaskedType) and isinstance( - args[1], types.Number + args[1], (types.Number, types.Boolean) ): to_resolve_types = (args[0].value_type, args[1]) - elif isinstance(args[0], types.Number) and isinstance( + elif isinstance(args[0], (types.Number, types.Boolean)) and isinstance( args[1], MaskedType ): to_resolve_types = (args[1].value_type, args[0]) @@ -287,6 +287,23 @@ def generic(self, args, kws): return nb_signature(types.boolean, MaskedType(types.boolean)) 
+@cuda_decl_registry.register_global(api.pack_return) +class UnpackReturnToMasked(AbstractTemplate): + """ + Turn a returned MaskedType into its value and validity + or turn a scalar into the tuple (scalar, True). + """ + + def generic(self, args, kws): + if isinstance(args[0], MaskedType): + # MaskedType(dtype, valid) -> MaskedType(dtype, valid) + return nb_signature(args[0], args[0]) + elif isinstance(args[0], (types.Number, types.Boolean)): + # scalar_type -> MaskedType(scalar_type, True) + return_type = MaskedType(args[0]) + return nb_signature(return_type, args[0]) + + for op in arith_ops + comparison_ops: # Every op shares the same typing class cuda_decl_registry.register_global(op)(MaskedScalarArithOp) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index d5e9142934b..9562fca7399 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -222,8 +222,8 @@ def assert_column_equal( left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: - msg1 = f"{left.to_array()}" - msg2 = f"{right.to_array()}" + msg1 = f"{left.values_host}" + msg2 = f"{right.values_host}" try: diff = left.apply_boolean_mask(left != right).size diff = diff * 100.0 / left.size diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index fa3c88a3551..925c9ef720c 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -43,7 +43,7 @@ def test_applymap_round(nelem, masked): # Check expect = np.round(data) - got = out.to_array() + got = out.to_numpy() np.testing.assert_array_almost_equal(expect, got) @@ -58,5 +58,5 @@ def test_applymap_change_out_dtype(): # Check expect = np.array(data, dtype=float) - got = out.to_array() + got = out.to_numpy() np.testing.assert_array_equal(expect, got) diff --git a/python/cudf/cudf/tests/test_array_function.py 
b/python/cudf/cudf/tests/test_array_function.py index ecd13b57ca4..29654fb9556 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -34,7 +34,7 @@ def test_array_func_cudf_series(np_ar, func): if np.isscalar(expect): assert_eq(expect, got) else: - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 8cfcf4d2b6d..3fe0321ec54 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -31,7 +31,7 @@ def test_ufunc_cudf_non_nullseries(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( @@ -45,7 +45,7 @@ def test_ufunc_cudf_series_bitwise(func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( @@ -67,16 +67,16 @@ def test_ufunc_cudf_null_series(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) scalar = 0.5 expect = func(x, scalar) got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) expect = func(scalar, x) got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) @pytest.mark.xfail( @@ -93,16 +93,16 @@ def test_ufunc_cudf_null_series_comparison_ops(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_array()) + 
assert_eq(expect, got.fillna(np.nan).to_numpy()) scalar = 0.5 expect = func(x, scalar) got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) expect = func(scalar, x) got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) @pytest.mark.parametrize( @@ -115,7 +115,7 @@ def test_ufunc_cudf_series_cupy_array(np_ar_tup, func): cudf_s = cudf.Series(x) cupy_ar = cp.array(y) got = func(cudf_s, cupy_ar) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index f8063408e28..50fd27f2752 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -80,7 +80,7 @@ def func(index): result = binop(sr.astype("int32"), sr) expect = binop(arr.astype("int32"), arr) - np.testing.assert_almost_equal(result.to_array(), expect, decimal=5) + np.testing.assert_almost_equal(result.to_numpy(), expect, decimal=5) from concurrent.futures import ThreadPoolExecutor @@ -108,7 +108,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(arr, rhs)) + np.testing.assert_almost_equal(result.to_numpy(), binop(arr, rhs)) _bitwise_binops = [operator.and_, operator.or_, operator.xor] @@ -146,7 +146,7 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(arr1, arr2)) + np.testing.assert_almost_equal(result.to_numpy(), binop(arr1, arr2)) _logical_binops = [ @@ -211,9 +211,9 @@ def test_series_compare(cmpop, obj_class, dtype): result2 = Series(result2) result3 = Series(result3) - np.testing.assert_equal(result1.to_array(), cmpop(arr1, arr1)) - 
np.testing.assert_equal(result2.to_array(), cmpop(arr2, arr2)) - np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2)) + np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, arr1)) + np.testing.assert_equal(result2.to_numpy(), cmpop(arr2, arr2)) + np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) def _series_compare_nulls_typegen(): @@ -298,8 +298,8 @@ def test_series_compare_scalar( result1 = Series(result1) result2 = Series(result2) - np.testing.assert_equal(result1.to_array(), cmpop(arr1, rhs)) - np.testing.assert_equal(result2.to_array(), cmpop(rhs, arr1)) + np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, rhs)) + np.testing.assert_equal(result2.to_numpy(), cmpop(rhs, arr1)) _nulls = ["none", "some"] @@ -347,7 +347,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls): )[:nelem] # Fill NA values na_value = -10000 - got = res.fillna(na_value).to_array() + got = res.fillna(na_value).to_numpy() expect = lhs_data + rhs_data if lhs_nulls == "some" or rhs_nulls == "some": expect[~res_mask] = na_value @@ -383,7 +383,7 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(lhs, rhs)) + np.testing.assert_almost_equal(result.to_numpy(), binop(lhs, rhs)) @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @@ -408,7 +408,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): if obj_class == "Index": result = Series(result) - np.testing.assert_array_equal(result.to_array(), cmpop(lhs, rhs)) + np.testing.assert_array_equal(result.to_numpy(), cmpop(lhs, rhs)) _reflected_ops = [ @@ -468,7 +468,7 @@ def test_reflected_ops_scalar(func, dtype, obj_class): ps_result = func(random_series) # verify - np.testing.assert_allclose(ps_result, gs_result.to_array()) + np.testing.assert_allclose(ps_result, gs_result.to_numpy()) _cudf_scalar_reflected_ops = [ @@ -536,7 +536,7 @@ def 
test_reflected_ops_cudf_scalar(funcs, dtype, obj_class): ps_result = cpu_func(random_series) # verify - np.testing.assert_allclose(ps_result, gs_result.to_array()) + np.testing.assert_allclose(ps_result, gs_result.to_numpy()) @pytest.mark.parametrize("binop", _binops) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 8d6c551761d..bc3ae721554 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -36,7 +36,7 @@ def test_categorical_basic(): assert pdsr.cat.ordered == sr.cat.ordered np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_array() + pdsr.cat.codes.values, sr.cat.codes.to_numpy() ) string = str(sr) @@ -48,7 +48,7 @@ def test_categorical_basic(): t a """ assert all(x == y for x, y in zip(string.split(), expect_str.split())) - assert_eq(cat.codes, cudf_cat.codes.to_array()) + assert_eq(cat.codes, cudf_cat.codes.to_numpy()) def test_categorical_integer(): @@ -58,13 +58,13 @@ def test_categorical_integer(): pdsr = pd.Series(cat) sr = cudf.Series(cat) np.testing.assert_array_equal( - cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_array() + cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_numpy() ) assert sr.null_count == 2 np.testing.assert_array_equal( pdsr.cat.codes.values, - sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_array(), + sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(), ) string = str(sr) @@ -90,12 +90,12 @@ def test_categorical_compare_unordered(): out = sr == sr assert out.dtype == np.bool_ assert type(out[0]) == np.bool_ - assert np.all(out.to_array()) + assert np.all(out.to_numpy()) assert np.all(pdsr == pdsr) # test inequality out = sr != sr - assert not np.any(out.to_array()) + assert not np.any(out.to_numpy()) assert not np.any(pdsr != pdsr) assert not pdsr.cat.ordered @@ -126,20 +126,20 @@ def test_categorical_compare_ordered(): out = sr1 == sr1 assert out.dtype 
== np.bool_ assert type(out[0]) == np.bool_ - assert np.all(out.to_array()) + assert np.all(out.to_numpy()) assert np.all(pdsr1 == pdsr1) # test inequality out = sr1 != sr1 - assert not np.any(out.to_array()) + assert not np.any(out.to_numpy()) assert not np.any(pdsr1 != pdsr1) assert pdsr1.cat.ordered assert sr1.cat.ordered # test using ordered operators - np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_array()) - np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_array()) + np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_numpy()) + np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_numpy()) def test_categorical_binary_add(): @@ -198,7 +198,7 @@ def test_categorical_masking(): got_matches = sr == "a" np.testing.assert_array_equal( - expect_matches.values, got_matches.to_array() + expect_matches.values, got_matches.to_numpy() ) # mask series @@ -320,14 +320,14 @@ def test_categorical_empty(): cat = pd.Categorical([]) pdsr = pd.Series(cat) sr = cudf.Series(cat) - np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_array()) + np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_numpy()) # Test attributes assert_eq(pdsr.cat.categories, sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_array() + pdsr.cat.codes.values, sr.cat.codes.to_numpy() ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index b82f736fe89..d2c7c073aa1 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -102,7 +102,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): sliced_series.reset_index(drop=True), ) else: - assert_eq(np.asarray(pd_series[sl]), sliced_series.to_array()) + assert_eq(np.asarray(pd_series[sl]), sliced_series.to_numpy()) @pytest.mark.parametrize("offset", [0, 1, 15]) @@ -161,11 +161,17 @@ def test_as_column_scalar_with_nan(nan_as_null): size = 10 
scalar = np.nan - expected = cudf.Series([np.nan] * size, nan_as_null=nan_as_null).to_array() + expected = ( + cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + .dropna() + .to_numpy() + ) - got = cudf.Series( - as_column(scalar, length=size, nan_as_null=nan_as_null) - ).to_array() + got = ( + cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) + .dropna() + .to_numpy() + ) np.testing.assert_equal(expected, got) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2eb59616253..0b8b6dd565f 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -393,9 +393,9 @@ def test_csv_reader_negative_vals(tmpdir): df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - np.testing.assert_allclose(zero, df["0"].to_array()) - np.testing.assert_allclose(one, df["1"].to_array()) - np.testing.assert_allclose(two, df["2"].to_array()) + np.testing.assert_allclose(zero, df["0"].to_numpy()) + np.testing.assert_allclose(one, df["1"].to_numpy()) + np.testing.assert_allclose(two, df["2"].to_numpy()) def test_csv_reader_strings(tmpdir): @@ -483,7 +483,7 @@ def test_csv_reader_mangle_dupe_cols(tmpdir): cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=False) # check that the dupe columns were removed assert len(cu_df.columns) == 3 - np.testing.assert_array_equal(cu_df["abc"].to_array(), [1]) + np.testing.assert_array_equal(cu_df["abc"].to_numpy(), [1]) def test_csv_reader_float_decimal(tmpdir): @@ -516,11 +516,11 @@ def test_csv_reader_float_decimal(tmpdir): decimal=",", ) - np.testing.assert_allclose(basic_32_ref, df["basic_32"].to_array()) - np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_array()) - np.testing.assert_allclose(round_ref, df["round"].to_array()) - np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_array()) - np.testing.assert_allclose(precision_ref, df["precision"].to_array()) + np.testing.assert_allclose(basic_32_ref, 
df["basic_32"].to_numpy()) + np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_numpy()) + np.testing.assert_allclose(round_ref, df["round"].to_numpy()) + np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_numpy()) + np.testing.assert_allclose(precision_ref, df["precision"].to_numpy()) def test_csv_reader_NaN_values(): @@ -620,12 +620,12 @@ def test_csv_reader_thousands(tmpdir): str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" ) - np.testing.assert_allclose(f32_ref, df["float32"].to_array()) - np.testing.assert_allclose(f64_ref, df["float64"].to_array()) - np.testing.assert_allclose(int32_ref, df["int32"].to_array()) - np.testing.assert_allclose(int64_ref, df["int64"].to_array()) - np.testing.assert_allclose(uint32_ref, df["uint32"].to_array()) - np.testing.assert_allclose(uint64_ref, df["uint64"].to_array()) + np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) + np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) + np.testing.assert_allclose(int32_ref, df["int32"].to_numpy()) + np.testing.assert_allclose(int64_ref, df["int64"].to_numpy()) + np.testing.assert_allclose(uint32_ref, df["uint32"].to_numpy()) + np.testing.assert_allclose(uint64_ref, df["uint64"].to_numpy()) def test_csv_reader_buffer_strings(): @@ -768,10 +768,10 @@ def test_csv_quotednumbers(tmpdir): df2 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) assert len(df2.columns) == 2 - np.testing.assert_allclose(integer_ref, df1["integer"].to_array()) - np.testing.assert_allclose(decimal_ref, df1["decimal"].to_array()) - np.testing.assert_allclose(integer_ref, df2["integer"].to_array()) - np.testing.assert_allclose(decimal_ref, df2["decimal"].to_array()) + np.testing.assert_allclose(integer_ref, df1["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df1["decimal"].to_numpy()) + np.testing.assert_allclose(integer_ref, df2["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) def 
test_csv_reader_nrows(tmpdir): @@ -1033,8 +1033,8 @@ def test_csv_reader_tabs(): "1970-12-12T00:00:00.000000000", "2018-06-15T00:00:00.000000000", ] - np.testing.assert_allclose(floats, df["float_point"].to_array()) - np.testing.assert_allclose(ints, df["integer"].to_array()) + np.testing.assert_allclose(floats, df["float_point"].to_numpy()) + np.testing.assert_allclose(ints, df["integer"].to_numpy()) for row in range(4): assert str(df["date"][row]) == dates[row] @@ -1280,7 +1280,7 @@ def test_csv_reader_bools_false_positives(tmpdir): df = read_csv(StringIO(buffer), header=None, dtype=["int32"]) - np.testing.assert_array_equal(items, df["0"].to_array()) + np.testing.assert_array_equal(items, df["0"].to_numpy()) def test_csv_reader_aligned_byte_range(tmpdir): @@ -1312,7 +1312,7 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) np.testing.assert_array_equal( - pdf["hex_int"], gdf["hex_int"].to_array() + pdf["hex_int"], gdf["hex_int"].to_numpy() ) else: # otherwise, dtype inference returns as object (string) diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index 2604030097b..a00dbbba5f0 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -37,8 +37,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): kwargs=dict(extra1=extra1, extra2=extra2), ) - got_out1 = outdf["out1"].to_array() - got_out2 = outdf["out2"].to_array() + got_out1 = outdf["out1"].to_numpy() + got_out2 = outdf["out2"].to_numpy() np.testing.assert_array_almost_equal(got_out1, expect_out1) np.testing.assert_array_almost_equal(got_out2, expect_out2) @@ -74,8 +74,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - 
np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @@ -112,8 +112,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @@ -158,8 +158,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 376fc3e6b88..5a839507182 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -201,7 +201,7 @@ def test_series_basic(): a1 = np.arange(10, dtype=np.float64) series = cudf.Series(a1) assert len(series) == 10 - np.testing.assert_equal(series.to_array(), np.hstack([a1])) + np.testing.assert_equal(series.to_numpy(), np.hstack([a1])) def test_series_from_cupy_scalars(): @@ -265,13 +265,13 @@ def test_dataframe_basic(): # Populate with cuda memory df["keys"] = np.arange(10, dtype=np.float64) - np.testing.assert_equal(df["keys"].to_array(), np.arange(10)) + np.testing.assert_equal(df["keys"].to_numpy(), 
np.arange(10)) assert len(df) == 10 # Populate with numpy array rnd_vals = np.random.random(10) df["vals"] = rnd_vals - np.testing.assert_equal(df["vals"].to_array(), rnd_vals) + np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals) assert len(df) == 10 assert tuple(df.columns) == ("keys", "vals") @@ -287,11 +287,11 @@ def test_dataframe_basic(): hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) hvals = np.asarray(rnd_vals.tolist() + [321]) - np.testing.assert_equal(df["keys"].to_array(), hkeys) - np.testing.assert_equal(df["vals"].to_array(), hvals) + np.testing.assert_equal(df["keys"].to_numpy(), hkeys) + np.testing.assert_equal(df["vals"].to_numpy(), hvals) # As matrix - mat = df.as_matrix() + mat = df.values_host expect = np.vstack([hkeys, hvals]).T @@ -301,7 +301,7 @@ def test_dataframe_basic(): df_tup = cudf.DataFrame() data = np.arange(10) df_tup[(1, "foobar")] = data - np.testing.assert_equal(data, df_tup[(1, "foobar")].to_array()) + np.testing.assert_equal(data, df_tup[(1, "foobar")].to_numpy()) df = cudf.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) pdf = pd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) @@ -715,7 +715,7 @@ def test_dataframe_astype(nelem): assert df["a"].dtype is np.dtype(np.int32) df["b"] = df["a"].astype(np.float32) assert df["b"].dtype is np.dtype(np.float32) - np.testing.assert_equal(df["a"].to_array(), df["b"].to_array()) + np.testing.assert_equal(df["a"].to_numpy(), df["b"].to_numpy()) def test_astype_dict(): @@ -738,12 +738,12 @@ def test_index_astype(nelem): df.index = df.index.astype(np.float32) assert df.index.dtype is np.dtype(np.float32) df["a"] = df["a"].astype(np.float32) - np.testing.assert_equal(df.index.to_array(), df["a"].to_array()) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) df["b"] = df["a"] df = df.set_index("b") df["a"] = df["a"].astype(np.int16) df.index = df.index.astype(np.int16) - np.testing.assert_equal(df.index.to_array(), 
df["a"].to_array()) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) def test_dataframe_to_string(): @@ -786,9 +786,9 @@ def test_dataframe_to_string(): # check data values = masked.copy() validids = [0, 2, 3, 5] - densearray = masked.to_array() + densearray = masked.dropna().to_numpy() np.testing.assert_equal(data[validids], densearray) - # valid position is corret + # valid position is correct for i in validids: assert data[i] == values[i] @@ -941,12 +941,13 @@ def test_dataframe_dir_and_getattr(): df.not_a_column -@pytest.mark.parametrize("order", ["C", "F"]) -def test_empty_dataframe_as_gpu_matrix(order): +def test_empty_dataframe_to_array(): df = cudf.DataFrame() # Check fully empty dataframe. - mat = df.as_gpu_matrix(order=order).copy_to_host() + mat = df.to_cupy() + assert mat.shape == (0, 0) + mat = df.to_numpy() assert mat.shape == (0, 0) df = cudf.DataFrame() @@ -955,12 +956,11 @@ def test_empty_dataframe_as_gpu_matrix(order): df[k] = np.random.random(nelem) # Check all columns in empty dataframe. 
- mat = df.head(0).as_gpu_matrix(order=order).copy_to_host() + mat = df.head(0).to_cupy() assert mat.shape == (0, 3) -@pytest.mark.parametrize("order", ["C", "F"]) -def test_dataframe_as_gpu_matrix(order): +def test_dataframe_to_cupy(): df = cudf.DataFrame() nelem = 123 @@ -968,20 +968,20 @@ def test_dataframe_as_gpu_matrix(order): df[k] = np.random.random(nelem) # Check all columns - mat = df.as_gpu_matrix(order=order).copy_to_host() + mat = df.to_numpy() assert mat.shape == (nelem, 4) for i, k in enumerate(df.columns): - np.testing.assert_array_equal(df[k].to_array(), mat[:, i]) + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) # Check column subset - mat = df.as_gpu_matrix(order=order, columns=["a", "c"]).copy_to_host() + mat = df[["a", "c"]].to_cupy().get() assert mat.shape == (nelem, 2) for i, k in enumerate("ac"): - np.testing.assert_array_equal(df[k].to_array(), mat[:, i]) + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) -def test_dataframe_as_gpu_matrix_null_values(): +def test_dataframe_to_cupy_null_values(): df = cudf.DataFrame() nelem = 123 @@ -999,14 +999,15 @@ def test_dataframe_as_gpu_matrix_null_values(): refvalues[k] = data # Check null value causes error - with pytest.raises(ValueError) as raises: - df.as_gpu_matrix() - raises.match("column 'a' has null values") + with pytest.raises(ValueError): + df.to_cupy() + with pytest.raises(ValueError): + df.to_numpy() for k in df.columns: df[k] = df[k].fillna(na) - mat = df.as_gpu_matrix().copy_to_host() + mat = df.to_numpy() for i, k in enumerate(df.columns): np.testing.assert_array_equal(refvalues[k], mat[:, i]) @@ -1082,7 +1083,7 @@ def test_dataframe_setitem_index_len1(): gdf["a"] = [1] gdf["b"] = gdf.index._values - np.testing.assert_equal(gdf.b.to_array(), [0]) + np.testing.assert_equal(gdf.b.to_numpy(), [0]) def test_empty_dataframe_setitem_df(): @@ -1098,7 +1099,7 @@ def test_assign(): assert list(gdf.columns) == ["x"] assert list(gdf2.columns) == ["x", "y"] - 
np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4]) + np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @@ -1150,7 +1151,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): for p in got: if len(p): # Take rows of the keycolumns and build a set of the key-values - unique_keys = set(map(tuple, p.as_matrix(columns=keycols))) + unique_keys = set(map(tuple, p[keycols].values_host)) # Ensure that none of the key-values have occurred in other groups assert not (unique_keys & part_unique_keys) part_unique_keys |= unique_keys @@ -1593,7 +1594,7 @@ def test_from_arrow(nelem, data_type): # For some reason PyArrow to_pandas() converts to numpy array and has # better type compatibility - np.testing.assert_array_equal(s.to_pandas(), gs.to_array()) + np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy()) @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @@ -1733,7 +1734,7 @@ def test_from_python_array(data_type): gs = cudf.Series(data) - np.testing.assert_equal(gs.to_array(), np_arr) + np.testing.assert_equal(gs.to_numpy(), np_arr) def test_series_shape(): @@ -2200,11 +2201,11 @@ def test_series_hash_encode(nrows): encoded_series = s.hash_encode(num_features) assert isinstance(encoded_series, cudf.Series) - enc_arr = encoded_series.to_array() + enc_arr = encoded_series.to_numpy() assert np.all(enc_arr >= 0) assert np.max(enc_arr) < num_features - enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_array() + enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_numpy() assert enc_with_name_arr[0] != enc_arr[0] @@ -2391,7 +2392,7 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): else: indices = s.digitize(bins, right) np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_array() + np.digitize(data, bins, right), indices.to_numpy() ) @@ -3228,15 +3229,6 @@ def test_array_ufunc(): assert_eq(np.sqrt(gdf.x), np.sqrt(pdf.x)) 
-@pytest.mark.parametrize("nan_value", [-5, -5.0, 0, 5, 5.0, None, "pandas"]) -def test_series_to_gpu_array(nan_value): - - s = cudf.Series([0, 1, None, 3]) - np.testing.assert_array_equal( - s.to_array(nan_value), s.to_gpu_array(nan_value).copy_to_host() - ) - - def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 @@ -3474,8 +3466,6 @@ def test_dataframe_round(decimals): expected = pdf.round(pdecimals) assert_eq(result, expected) - for c in gdf.columns: - np.array_equal(gdf[c].nullmask.to_array(), result[c].to_array()) @pytest.mark.parametrize( @@ -4061,8 +4051,7 @@ def test_series_values_host_property(data): marks=pytest.mark.xfail(raises=NotImplementedError), ), pytest.param( - ["m", "a", "d", "v"], - marks=pytest.mark.xfail(raises=NotImplementedError), + ["m", "a", "d", "v"], marks=pytest.mark.xfail(raises=TypeError), ), ], ) @@ -5719,7 +5708,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_where.cat.codes, got_where.cat.codes.astype(expect_where.cat.codes.dtype) .fillna(-1) - .to_array(), + .to_numpy(), ) assert_eq(expect_where.cat.categories, got_where.cat.categories) @@ -5727,7 +5716,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_mask.cat.codes, got_mask.cat.codes.astype(expect_mask.cat.codes.dtype) .fillna(-1) - .to_array(), + .to_numpy(), ) assert_eq(expect_mask.cat.categories, got_mask.cat.categories) else: @@ -7275,9 +7264,8 @@ def test_cudf_arrow_array_error(): with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Table via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Table, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. 
Consider using .to_arrow()", ): df.__arrow_array__() @@ -7285,18 +7273,16 @@ def test_cudf_arrow_array_error(): with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Array via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Array, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", ): sr.__arrow_array__() sr = cudf.Series(["a", "b", "c"]) with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Array via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Array, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", ): sr.__arrow_array__() diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 5b258c760b3..1a9098c70db 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -41,7 +41,7 @@ def test_dataframe_deep_copy(copy_parameters): copy_gdf["b"] = [0, 0, 0] pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) gdf_is_equal = np.array_equal( - gdf["b"].to_array(), copy_gdf["b"].to_array() + gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() ) assert pdf_is_equal == copy_parameters["expected_equality"] assert gdf_is_equal == copy_parameters["expected_equality"] @@ -67,7 +67,7 @@ def test_dataframe_deep_copy_and_insert(copy_parameters): copy_gdf["b"] = [0, 0, 0] pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) gdf_is_equal = np.array_equal( - gdf["b"].to_array(), copy_gdf["b"].to_array() + gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() ) assert pdf_is_equal == copy_parameters["expected_equality"] assert gdf_is_equal == copy_parameters["expected_equality"] @@ -160,8 +160,6 @@ def test_kernel_deep_copy(): cdf = gdf.copy(deep=True) sr = 
gdf["b"] - # column.to_gpu_array calls to_dense_buffer which returns a copy - # need to access buffer directly and then call gpu_array add_one[1, len(sr)](sr._column.data_array_view) assert not gdf.to_string().split() == cdf.to_string().split() @@ -173,7 +171,7 @@ def test_kernel_shallow_copy(): gdf = DataFrame.from_pandas(pdf) cdf = gdf.copy(deep=False) sr = gdf["a"] - add_one[1, len(sr)](sr.to_gpu_array()) + add_one[1, len(sr)](sr.to_cupy()) assert_eq(gdf, cdf) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 6e5b3c39dc4..3bbac217283 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -140,24 +140,24 @@ def test_datetime_series_binops_numpy(lhs_dtype, rhs_dtype): gdf_data_2 = Series(pd_data_2).astype(rhs_dtype) np_data_1 = np.array(pd_data_1).astype(lhs_dtype) np_data_2 = np.array(pd_data_2).astype(rhs_dtype) - np.testing.assert_equal(np_data_1, gdf_data_1.to_array()) - np.testing.assert_equal(np_data_2, gdf_data_2.to_array()) + np.testing.assert_equal(np_data_1, gdf_data_1.to_numpy()) + np.testing.assert_equal(np_data_2, gdf_data_2.to_numpy()) np.testing.assert_equal( - np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_array() + np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_numpy() ) np.testing.assert_equal( - np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_array() + np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_numpy() ) np.testing.assert_equal( - np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_array() + np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_numpy() ) np.testing.assert_equal( np.less_equal(np_data_1, np_data_2), - (gdf_data_1 <= gdf_data_2).to_array(), + (gdf_data_1 <= gdf_data_2).to_numpy(), ) np.testing.assert_equal( np.greater_equal(np_data_1, np_data_2), - (gdf_data_1 >= gdf_data_2).to_array(), + (gdf_data_1 >= gdf_data_2).to_numpy(), ) @@ -268,7 +268,7 @@ def 
test_typecast_from_datetime(data, dtype): np_casted = np_data.astype(dtype) gdf_casted = gdf_data.astype(dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [data1(), data2()]) @@ -284,7 +284,7 @@ def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): np_casted = np_data.astype(np.int64).astype(dtype) gdf_casted = gdf_data.astype(np.int64).astype(dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [timeseries_us_data()]) @@ -296,7 +296,7 @@ def test_typecast_to_different_datetime_resolutions(data, dtype): pd_data = pd.Series(data.copy()) np_data = np.array(pd_data).astype(dtype) gdf_series = Series(pd_data).astype(dtype) - np.testing.assert_equal(np_data, gdf_series.to_array()) + np.testing.assert_equal(np_data, gdf_series.to_numpy()) @pytest.mark.parametrize( @@ -331,7 +331,7 @@ def test_typecast_to_datetime(data, from_dtype, to_dtype): np_casted = np_data.astype(to_dtype) gdf_casted = gdf_data.astype(to_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -347,7 +347,7 @@ def test_typecast_to_from_datetime(data, from_dtype, to_dtype): np_casted = np_data.astype(to_dtype).astype(from_dtype) gdf_casted = gdf_data.astype(to_dtype).astype(from_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -361,12 +361,12 @@ def test_typecast_to_from_datetime(data, from_dtype, to_dtype): ) def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype): np_data = data.astype(from_dtype) - gdf_col = Series(np_data)._column + ser = Series(np_data) np_casted = np_data.astype(to_dtype) - 
gdf_casted = gdf_col.astype(to_dtype) + ser_casted = ser.astype(to_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, ser_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -1131,7 +1131,26 @@ def test_datetime_fillna(data, dtype, fill_value): ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) @pytest.mark.parametrize( - "date_format", ["%d - %m", "%y/%H", "%Y", "%I - %M / %S", "%f", "%j", "%p"] + "date_format", + [ + "%d - %m", + "%y/%H", + "%Y", + "%I - %M / %S", + "%f", + "%j", + "%p", + "%w", + "%U", + "%W", + "%G", + "%u", + "%V", + "%b", + "%B", + "%a", + "%A", + ], ) def test_datetime_strftime(data, dtype, date_format): gsr = cudf.Series(data, dtype=dtype) @@ -1143,24 +1162,7 @@ def test_datetime_strftime(data, dtype, date_format): assert_eq(expected, actual) -@pytest.mark.parametrize( - "date_format", - [ - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", - "%c", - "%x", - "%X", - "%G", - "%u", - "%V", - ], -) +@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) def test_datetime_strftime_not_implemented_formats(date_format): gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") @@ -1334,6 +1336,55 @@ def test_quarter(): assert_eq(expect2.values, got2.values, check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), + pd.Series( + pd.date_range(start="2021-07-25", end="2021-07-30"), + index=["a", "b", "c", "d", "e", "f"], + ), + ], +) +def test_isocalendar_series(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.dt.isocalendar() + got = gs.dt.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + pd.DatetimeIndex([], dtype="datetime64[ns]"), + pd.DatetimeIndex([None, None], 
dtype="datetime64[ns]"), + pd.DatetimeIndex( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + ], + dtype="datetime64[ns]", + ), + pd.DatetimeIndex(["2100-03-14 07:30:00"], dtype="datetime64[ns]"), + ], +) +def test_isocalendar_index(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.isocalendar() + got = gs.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + @pytest.mark.parametrize("dtype", DATETIME_TYPES) def test_days_in_months(dtype): nrows = 1000 diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 4b2fca0d12d..4b26e2c13bc 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -126,7 +126,7 @@ def test_to_dlpack_cupy_1d(data_1d): expectation = data_size_expectation_builder(data_1d, False) with expectation: gs = cudf.Series(data_1d, nan_as_null=False) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs._column.to_dlpack() cupy_array = cupy.fromDlpack(dlt) @@ -155,7 +155,7 @@ def test_from_dlpack_cupy_1d(data_1d): dlt = cupy_array.toDlpack() gs = cudf.from_dlpack(dlt) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) assert_eq(cudf_host_array, cupy_host_array) @@ -190,7 +190,7 @@ def test_to_dlpack_cupy_1d_null(data_1d): with expectation: gs = cudf.Series(data_1d) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs._column.to_dlpack() cupy_array = cupy.fromDlpack(dlt) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f464ac1a6c2..bc43c82729b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -24,11 +24,11 @@ def assert_df(g, p): def assert_df2(g, p): assert g.index.dtype == p.index.dtype - np.testing.assert_equal(g.index.to_array(), p.index) + 
np.testing.assert_equal(g.index.to_numpy(), p.index) assert tuple(g.columns) == tuple(p.columns) for k in g.columns: assert g[k].dtype == p[k].dtype - np.testing.assert_equal(g[k].to_array(), p[k]) + np.testing.assert_equal(g[k].to_numpy(), p[k]) # most tests are similar to pandas drop_duplicates diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index e527fd0af17..39fa7b11ce2 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -5,7 +5,7 @@ from numba.cuda import compile_ptx from cudf import NA -from cudf.core.udf.classes import Masked +from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType arith_ops = ( diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 46cbc9d2b52..1f16686a6a6 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -19,7 +19,7 @@ def test_factorize_series_obj(ncats, nelem): df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_array(), sorted(set(arr))) + np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index a088ae9f923..3b3aa72901f 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -53,8 +53,8 @@ def test_gpu_parse_arrow_data_cpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + 
np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) def test_gpu_parse_arrow_data_gpu_schema(): @@ -86,8 +86,8 @@ def test_gpu_parse_arrow_data_gpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema(): @@ -119,8 +119,8 @@ def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) expected_values = """ @@ -288,9 +288,9 @@ def test_gpu_parse_arrow_timestamps(dtype): reader = GpuArrowReader(cpu_schema, gpu_data) assert reader[0].name == "timestamp" timestamp_arr = reader[0].data.copy_to_host() - np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_array()) + np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_numpy()) dct = reader.to_dict() - np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_array()) + np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_numpy()) if __name__ == "__main__": diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b58078818dd..338e10ebe30 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -196,7 +196,7 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): else: # column names don't match - check just the values for gcol, pcol in zip(gdf, pdf): - assert_array_equal(gdf[gcol].to_array(), pdf[pcol].values) + 
assert_array_equal(gdf[gcol].to_numpy(), pdf[pcol].values) def test_groupby_default(pdf, gdf): @@ -244,7 +244,7 @@ def test_groupby_cats(): df["vals"] = np.random.random(len(df)) cats = df["cats"].values_host - vals = df["vals"].to_array() + vals = df["vals"].to_numpy() grouped = df.groupby(["cats"], as_index=False).mean() diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index a3de92ba9e5..4ae86dc1cfc 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -176,7 +176,7 @@ def test_categorical_index(): assert_eq(pdf.index, gdf1.index) assert_eq( pdf.index.codes, - gdf1.index.codes.astype(pdf.index.codes.dtype).to_array(), + gdf1.index.codes.astype(pdf.index.codes.dtype).to_numpy(), ) assert isinstance(gdf2.index, CategoricalIndex) @@ -184,7 +184,7 @@ def test_categorical_index(): assert_eq(pdf.index, gdf2.index) assert_eq( pdf.index.codes, - gdf2.index.codes.astype(pdf.index.codes.dtype).to_array(), + gdf2.index.codes.astype(pdf.index.codes.dtype).to_numpy(), ) @@ -223,7 +223,7 @@ def test_pandas_as_index(): pdf_category_index.codes, gdf_category_index.codes.astype( pdf_category_index.codes.dtype - ).to_array(), + ).to_numpy(), ) @@ -291,7 +291,7 @@ def test_set_index_as_property(): # Check set_index(Series) cdf.index = cdf["b"] - assert_eq(cdf.index._values.to_array(), col2) + assert_eq(cdf.index.to_numpy(), col2) with pytest.raises(ValueError): cdf.index = [list(range(10))] @@ -459,14 +459,14 @@ def test_index_copy_deep(idx, deep): def test_index_isna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.isna().to_array(), pidx.isna()) + assert_eq(gidx.isna().to_numpy(), pidx.isna()) @pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) def test_index_notna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.notna().to_array(), pidx.notna()) + 
assert_eq(gidx.notna().to_numpy(), pidx.notna()) def test_rangeindex_slice_attr_name(): @@ -674,17 +674,13 @@ def test_index_where(data, condition, other, error): got = gs.where(gs_condition, other=gs_other) np.testing.assert_array_equal( expect.codes, - got.codes.astype(expect.codes.dtype).fillna(-1).to_array(), + got.codes.astype(expect.codes.dtype).fillna(-1).to_numpy(), ) assert_eq(expect.categories, got.categories) else: assert_eq( - ps.where(ps_condition, other=ps_other).fillna( - gs._values.default_na_value() - ), - gs.where(gs_condition, other=gs_other) - .to_pandas() - .fillna(gs._values.default_na_value()), + ps.where(ps_condition, other=ps_other), + gs.where(gs_condition, other=gs_other).to_pandas(), ) else: assert_exceptions_equal( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 58d39ff35a6..e550c7c374e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -110,16 +110,16 @@ def test_series_indexing(i1, i2, i3): # Indexing sr1 = series.iloc[i1] assert sr1.null_count == 0 - np.testing.assert_equal(sr1.to_array(), a1[:12]) + np.testing.assert_equal(sr1.to_numpy(), a1[:12]) sr2 = sr1.iloc[i2] assert sr2.null_count == 0 - np.testing.assert_equal(sr2.to_array(), a1[3:12]) + np.testing.assert_equal(sr2.to_numpy(), a1[3:12]) # Index with stride sr3 = sr2.iloc[i3] assert sr3.null_count == 0 - np.testing.assert_equal(sr3.to_array(), a1[3:12:2]) + np.testing.assert_equal(sr3.to_numpy(), a1[3:12:2]) # Integer indexing if isinstance(i1, range): @@ -197,10 +197,10 @@ def test_dataframe_column_name_indexing(): df["a"] = data df[1] = data np.testing.assert_equal( - df["a"].to_array(), np.asarray(range(10), dtype=np.int32) + df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32) ) np.testing.assert_equal( - df[1].to_array(), np.asarray(range(10), dtype=np.int32) + df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) pdf = pd.DataFrame() @@ -245,20 +245,20 @@ def 
test_dataframe_slicing(): first_10 = df[:10] assert len(first_10) == 10 assert tuple(first_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(first_10["a"].to_array(), ha[:10]) - np.testing.assert_equal(first_10["b"].to_array(), hb[:10]) - np.testing.assert_equal(first_10["c"].to_array(), hc[:10]) - np.testing.assert_equal(first_10["d"].to_array(), hd[:10]) + np.testing.assert_equal(first_10["a"].to_numpy(), ha[:10]) + np.testing.assert_equal(first_10["b"].to_numpy(), hb[:10]) + np.testing.assert_equal(first_10["c"].to_numpy(), hc[:10]) + np.testing.assert_equal(first_10["d"].to_numpy(), hd[:10]) del first_10 # Row slice last 10 last_10 = df[-10:] assert len(last_10) == 10 assert tuple(last_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(last_10["a"].to_array(), ha[-10:]) - np.testing.assert_equal(last_10["b"].to_array(), hb[-10:]) - np.testing.assert_equal(last_10["c"].to_array(), hc[-10:]) - np.testing.assert_equal(last_10["d"].to_array(), hd[-10:]) + np.testing.assert_equal(last_10["a"].to_numpy(), ha[-10:]) + np.testing.assert_equal(last_10["b"].to_numpy(), hb[-10:]) + np.testing.assert_equal(last_10["c"].to_numpy(), hc[-10:]) + np.testing.assert_equal(last_10["d"].to_numpy(), hd[-10:]) del last_10 # Row slice [begin:end] @@ -267,10 +267,10 @@ def test_dataframe_slicing(): subrange = df[begin:end] assert len(subrange) == end - begin assert tuple(subrange.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(subrange["a"].to_array(), ha[begin:end]) - np.testing.assert_equal(subrange["b"].to_array(), hb[begin:end]) - np.testing.assert_equal(subrange["c"].to_array(), hc[begin:end]) - np.testing.assert_equal(subrange["d"].to_array(), hd[begin:end]) + np.testing.assert_equal(subrange["a"].to_numpy(), ha[begin:end]) + np.testing.assert_equal(subrange["b"].to_numpy(), hb[begin:end]) + np.testing.assert_equal(subrange["c"].to_numpy(), hc[begin:end]) + np.testing.assert_equal(subrange["d"].to_numpy(), hd[begin:end]) del subrange @@ -527,7 
+527,7 @@ def test_series_loc_categorical(): # order of categories changes, so we can only # compare values: assert_eq( - ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_array() + ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy() ) assert_eq( @@ -594,9 +594,9 @@ def test_series_iloc(nelem): np.testing.assert_allclose(gs.iloc[nelem - 1], ps.iloc[nelem - 1]) # positive tests for slice - np.testing.assert_allclose(gs.iloc[-1:1].to_array(), ps.iloc[-1:1]) + np.testing.assert_allclose(gs.iloc[-1:1].to_numpy(), ps.iloc[-1:1]) np.testing.assert_allclose( - gs.iloc[nelem - 1 : -1].to_array(), ps.iloc[nelem - 1 : -1] + gs.iloc[nelem - 1 : -1].to_numpy(), ps.iloc[nelem - 1 : -1] ) np.testing.assert_allclose( gs.iloc[0 : nelem - 1].to_pandas(), ps.iloc[0 : nelem - 1] @@ -697,8 +697,8 @@ def test_dataframe_iloc_index_error(): pdf["b"] = hb def assert_col(g, p): - np.testing.assert_equal(g["a"].to_array(), p["a"]) - np.testing.assert_equal(g["b"].to_array(), p["b"]) + np.testing.assert_equal(g["a"].to_numpy(), p["a"]) + np.testing.assert_equal(g["b"].to_numpy(), p["b"]) assert_col(gdf.iloc[nelem * 2], pdf.iloc[nelem * 2]) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b18cce60bfd..775b866f5ce 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -21,47 +21,30 @@ def make_params(): np.random.seed(0) hows = _JOIN_TYPES - methods = "hash,sort".split(",") # Test specific cases (1) aa = [0, 0, 4, 5, 5] bb = [0, 0, 2, 3, 5] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test specific cases (2) aa = [0, 0, 1, 2, 3] bb = [0, 1, 2, 2, 3] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + 
yield (aa, bb, how) # Test large random integer inputs aa = np.random.randint(0, 50, 100) bb = np.random.randint(0, 50, 100) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test floating point inputs aa = np.random.random(50) bb = np.random.random(50) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) def pd_odd_joins(left, right, join_type): @@ -102,8 +85,8 @@ def assert_join_results_equal(expect, got, how, **kwargs): raise ValueError(f"Not a join result: {type(expect).__name__}") -@pytest.mark.parametrize("aa,bb,how,method", make_params()) -def test_dataframe_join_how(aa, bb, how, method): +@pytest.mark.parametrize("aa,bb,how", make_params()) +def test_dataframe_join_how(aa, bb, how): df = cudf.DataFrame() df["a"] = aa df["b"] = bb @@ -122,7 +105,7 @@ def work_pandas(df, how): def work_gdf(df): df1 = df.set_index("a") df2 = df.set_index("b") - joined = df1.join(df2, how=how, sort=True, method=method) + joined = df1.join(df2, how=how, sort=True) return joined expect = work_pandas(df.to_pandas(), how) @@ -136,8 +119,7 @@ def work_gdf(df): assert got.index.name is None assert list(expect.columns) == list(got.columns) - # test disabled until libgdf sort join gets updated with new api - if method == "hash": + if how in {"left", "inner", "right", "leftanti", "leftsemi"}: assert_eq(sorted(expect.index.values), sorted(got.index.values)) if how != "outer": # Newly introduced ambiguous ValueError thrown when @@ -161,9 +143,9 @@ def work_gdf(df): def _check_series(expect, got): magic = 0xDEADBEAF - direct_equal = np.all(expect.values == got.to_array()) + direct_equal = np.all(expect.values == got.to_numpy()) nanfilled_equal = np.all( - expect.fillna(magic).values == 
got.fillna(magic).to_array() + expect.fillna(magic).values == got.fillna(magic).to_numpy() ) msg = "direct_equal={}, nanfilled_equal={}".format( direct_equal, nanfilled_equal @@ -221,8 +203,8 @@ def test_dataframe_join_cats(): assert list(got.columns) == ["b", "c"] assert len(got) > 0 assert set(got.index.to_pandas()) & set("abc") - assert set(got["b"].to_array()) & set(bb) - assert set(got["c"].to_array()) & set(cc) + assert set(got["b"].to_numpy()) & set(bb) + assert set(got["c"].to_numpy()) & set(cc) def test_dataframe_join_combine_cats(): diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 8c06dbea03f..3391c9c22ee 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -190,7 +190,7 @@ def test_json_lines_basic(json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @pytest.mark.filterwarnings("ignore:Using CPU") @@ -209,7 +209,7 @@ def test_json_lines_multiple(tmpdir, json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @pytest.mark.parametrize("engine", ["auto", "cudf"]) @@ -240,7 +240,7 @@ def test_json_read_directory(tmpdir, json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) def 
test_json_lines_byte_range(json_input): @@ -340,9 +340,9 @@ def test_json_bool_values(): # types should be ['bool', 'int64'] np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_array()) + np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy()) # boolean values should be converted to 0/1 - np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_array()) + np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"]) np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -364,15 +364,9 @@ def test_json_null_literal(buffer): # second column contains only empty fields, type should be set to int8 np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) np.testing.assert_array_equal( - df["0"].to_array(fillna=np.nan), [1.0, np.nan] - ) - np.testing.assert_array_equal( - df["1"].to_array(fillna=np.nan), - [ - df["1"]._column.default_na_value(), - df["1"]._column.default_na_value(), - ], + df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] ) + np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) def test_json_bad_protocol_string(): diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py index bac324d9c1c..106179e2b47 100644 --- a/python/cudf/cudf/tests/test_label_encode.py +++ b/python/cudf/cudf/tests/test_label_encode.py @@ -44,7 +44,7 @@ def test_label_encode(nelem, dtype): # label encode series ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_array() + arr = ncol.to_numpy() # verify labels of new column for i in range(arr.size): @@ -75,7 +75,7 @@ def test_label_encode_drop_one(): # label encode series ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_array() + arr = ncol.to_numpy() # verify labels of new column @@ -110,7 +110,7 @@ def test_label_encode_float_output(): na_sentinel=np.nan, ) - got = 
df2["cats_labels"].to_array(fillna="pandas") + got = df2["cats_labels"].to_numpy(na_value=np.nan) handcoded = np.array([encoder.get(v, np.nan) for v in arr]) np.testing.assert_equal(got, handcoded) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 0a3ead6cf31..0292d47f31a 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -18,7 +18,7 @@ def test_onehot_simple(): df["vals"] = np.arange(10, dtype=np.int32) # One Hot (Series) for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))): - arr = col.to_array() + arr = col.to_numpy() # Verify 1 in the right position np.testing.assert_equal(arr[i], 1) # Every other slots are 0s @@ -31,7 +31,7 @@ def test_onehot_simple(): assert df2.columns[0] == "vals" for i in range(1, len(df2.columns)): assert df2.columns[i] == "vals_%s" % (i - 1) - got = df2.as_matrix(columns=df2.columns[1:]) + got = df2[df2.columns[1:]].values_host expect = np.identity(got.shape[0]) np.testing.assert_equal(got, expect) @@ -45,7 +45,7 @@ def test_onehot_random(): df2 = df.one_hot_encoding( column="src", prefix="out_", cats=tuple(range(10, 17)) ) - mat = df2.as_matrix(columns=df2.columns[1:]) + mat = df2[df2.columns[1:]].values_host for val in range(low, high): colidx = val - low @@ -73,11 +73,11 @@ def test_onehot_masked(): ) assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4") - np.testing.assert_array_equal((out["a_0"] == 1).to_array(), arr == 0) - np.testing.assert_array_equal((out["a_1"] == 1).to_array(), arr == 1) - np.testing.assert_array_equal((out["a_2"] == 1).to_array(), arr == 2) - np.testing.assert_array_equal((out["a_3"] == 1).to_array(), arr == 3) - np.testing.assert_array_equal((out["a_4"] == 1).to_array(), arr == 4) + np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0) + np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1) + np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), 
arr == 2) + np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr == 3) + np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4) def test_onehot_generic_index(): @@ -91,10 +91,10 @@ def test_onehot_generic_index(): "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32 ) assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"} - np.testing.assert_array_equal(values == 0, out.fo_0.to_array()) - np.testing.assert_array_equal(values == 1, out.fo_1.to_array()) - np.testing.assert_array_equal(values == 2, out.fo_2.to_array()) - np.testing.assert_array_equal(values == 3, out.fo_3.to_array()) + np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy()) + np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy()) + np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy()) + np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index c90d6f23c2d..78cf5b998e8 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -52,11 +52,11 @@ def test_from_pandas_ex1(): df = DataFrame.from_pandas(pdf) assert tuple(df.columns) == tuple(pdf.columns) - assert np.all(df["a"].to_array() == pdf["a"]) - matches = df["b"].to_array(fillna="pandas") == pdf["b"] + assert np.all(df["a"].to_numpy() == pdf["a"]) + matches = df["b"].to_numpy(na_value=np.nan) == pdf["b"] # the 3d element is False due to (nan == nan) == False assert np.all(matches == [True, True, False, True]) - assert np.isnan(df["b"].to_array(fillna="pandas")[2]) + assert np.isnan(df["b"].to_numpy(na_value=np.nan)[2]) assert np.isnan(pdf["b"][2]) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 2e8de9b5d50..9a02d5145bb 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -84,8 +84,8 @@ def 
test_query_ref_env(data, fn): df2 = df.query(query_expr) # check assert len(df2) == np.count_nonzero(expect_mask) - np.testing.assert_array_almost_equal(df2["a"].to_array(), aa[expect_mask]) - np.testing.assert_array_almost_equal(df2["b"].to_array(), bb[expect_mask]) + np.testing.assert_array_almost_equal(df2["a"].to_numpy(), aa[expect_mask]) + np.testing.assert_array_almost_equal(df2["b"].to_numpy(), bb[expect_mask]) def test_query_env_changing(): @@ -95,11 +95,11 @@ def test_query_env_changing(): # first attempt c = 10 got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) # change env c = 50 got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) def test_query_local_dict(): @@ -108,7 +108,7 @@ def test_query_local_dict(): expr = "a < @val" got = df.query(expr, local_dict={"val": 10}) - np.testing.assert_array_equal(aa[aa < 10], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < 10], got["a"].to_numpy()) # test for datetime df = DataFrame() @@ -118,7 +118,7 @@ def test_query_local_dict(): expr = "datetimes==@search_date" got = df.query(expr, local_dict={"search_date": search_date}) - np.testing.assert_array_equal(data[1], got["datetimes"].to_array()) + np.testing.assert_array_equal(data[1], got["datetimes"].to_numpy()) def test_query_splitted_combine(): diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 43d477190ae..f47e87374dc 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -68,7 +68,7 @@ def test_series_replace(): a2 = np.array([5, 1, 2, 3, 4]) sr1 = cudf.Series(a1) sr2 = sr1.replace(0, 5) - assert_eq(a2, sr2.to_array()) + assert_eq(a2, sr2.to_numpy()) # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") @@ -85,35 +85,35 @@ def 
test_series_replace(): # List input a6 = np.array([5, 6, 2, 3, 4]) sr6 = sr1.replace([0, 1], [5, 6]) - assert_eq(a6, sr6.to_array()) + assert_eq(a6, sr6.to_numpy()) with pytest.raises(TypeError): sr1.replace([0, 1], [5.5, 6.5]) # Series input a8 = np.array([5, 5, 5, 3, 4]) - sr8 = sr1.replace(sr1[:3].to_array(), 5) - assert_eq(a8, sr8.to_array()) + sr8 = sr1.replace(sr1[:3].to_numpy(), 5) + assert_eq(a8, sr8.to_numpy()) # large input containing null sr9 = cudf.Series(list(range(400)) + [None]) sr10 = sr9.replace([22, 323, 27, 0], None) assert sr10.null_count == 5 - assert len(sr10.to_array()) == (401 - 5) + assert len(sr10.dropna().to_numpy()) == (401 - 5) sr11 = sr9.replace([22, 323, 27, 0], -1) assert sr11.null_count == 1 - assert len(sr11.to_array()) == (401 - 1) + assert len(sr11.dropna().to_numpy()) == (401 - 1) # large input not containing nulls sr9 = sr9.fillna(-11) sr12 = sr9.replace([22, 323, 27, 0], None) assert sr12.null_count == 4 - assert len(sr12.to_array()) == (401 - 4) + assert len(sr12.dropna().to_numpy()) == (401 - 4) sr13 = sr9.replace([22, 323, 27, 0], -1) assert sr13.null_count == 0 - assert len(sr13.to_array()) == 401 + assert len(sr13.to_numpy()) == 401 def test_series_replace_with_nulls(): @@ -123,12 +123,12 @@ def test_series_replace_with_nulls(): a2 = np.array([-10, 1, 2, 3, 4]) sr1 = cudf.Series(a1) sr2 = sr1.replace(0, None).fillna(-10) - assert_eq(a2, sr2.to_array()) + assert_eq(a2, sr2.to_numpy()) # List input a6 = np.array([-10, 6, 2, 3, 4]) sr6 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a6, sr6.to_array()) + assert_eq(a6, sr6.to_numpy()) sr1 = cudf.Series([0, 1, 2, 3, 4, None]) with pytest.raises(TypeError): @@ -137,11 +137,11 @@ def test_series_replace_with_nulls(): # Series input a8 = np.array([-10, -10, -10, 3, 4, -10]) sr8 = sr1.replace(cudf.Series([-10] * 3, index=sr1[:3]), None).fillna(-10) - assert_eq(a8, sr8.to_array()) + assert_eq(a8, sr8.to_numpy()) a9 = np.array([-10, 6, 2, 3, 4, -10]) sr9 = sr1.replace([0, 
1], [None, 6]).fillna(-10) - assert_eq(a9, sr9.to_array()) + assert_eq(a9, sr9.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 838ea91c7d3..736bcf131cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -40,14 +40,6 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace("NaN", "") psrepr = psrepr.replace("NaT", "") psrepr = psrepr.replace("None", "") - if ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - psrepr = psrepr.replace( - str(sr._column.default_na_value()) + "\n", "\n" - ) if "UInt" in psrepr: psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index 0ef7b89a606..741a9f45d09 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -38,7 +38,7 @@ def test_cumsum(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cumsum().to_array(), ps.cumsum(), decimal=decimal + gs.cumsum().to_numpy(), ps.cumsum(), decimal=decimal ) # dataframe series (named series) @@ -47,7 +47,7 @@ def test_cumsum(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cumsum().to_array(), pdf.a.cumsum(), decimal=decimal + gdf.a.cumsum().to_numpy(), pdf.a.cumsum(), decimal=decimal ) @@ -96,7 +96,7 @@ def test_cummin(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cummin().to_array(), ps.cummin(), decimal=decimal + gs.cummin().to_numpy(), ps.cummin(), decimal=decimal ) # dataframe series (named series) @@ -105,7 +105,7 @@ def test_cummin(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cummin().to_array(), pdf.a.cummin(), decimal=decimal + gdf.a.cummin().to_numpy(), 
pdf.a.cummin(), decimal=decimal ) @@ -153,7 +153,7 @@ def test_cummax(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cummax().to_array(), ps.cummax(), decimal=decimal + gs.cummax().to_numpy(), ps.cummax(), decimal=decimal ) # dataframe series (named series) @@ -162,7 +162,7 @@ def test_cummax(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cummax().to_array(), pdf.a.cummax(), decimal=decimal + gdf.a.cummax().to_numpy(), pdf.a.cummax(), decimal=decimal ) @@ -210,7 +210,7 @@ def test_cumprod(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cumprod().to_array(), ps.cumprod(), decimal=decimal + gs.cumprod().to_numpy(), ps.cumprod(), decimal=decimal ) # dataframe series (named series) @@ -219,7 +219,7 @@ def test_cumprod(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cumprod().to_array(), pdf.a.cumprod(), decimal=decimal + gdf.a.cumprod().to_numpy(), pdf.a.cumprod(), decimal=decimal ) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index c16c6486cd4..cd029d02d79 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -26,13 +26,13 @@ def test_searchsorted(side, obj_class, vals_class): # Reference object can be Series, Index, or Column if obj_class == "index": - sr = cudf.Series.as_index(sr) + sr.reset_index(drop=True) elif obj_class == "column": sr = sr._column # Values can be Series or Index if vals_class == "index": - vals = cudf.Series.as_index(vals) + vals.reset_index(drop=True) psr = sr.to_pandas() pvals = vals.to_pandas() diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 11deb6c0842..ca179703864 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -285,25 +285,25 
@@ def test_series_append_existing_buffers(): a2 = cudf.Series(np.arange(5)) gs = gs.append(a2) assert len(gs) == 15 - np.testing.assert_equal(gs.to_array(), np.hstack([a1, a2.to_array()])) + np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) # Ensure appending to previous buffer a3 = cudf.Series(np.arange(3)) gs = gs.append(a3) assert len(gs) == 18 - a4 = np.hstack([a1, a2.to_array(), a3.to_array()]) - np.testing.assert_equal(gs.to_array(), a4) + a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) + np.testing.assert_equal(gs.to_numpy(), a4) # Appending different dtype a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) gs = a5.append(a6) np.testing.assert_equal( - gs.to_array(), np.hstack([a5.to_array(), a6.to_array()]) + gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) ) gs = cudf.Series(a6).append(a5) np.testing.assert_equal( - gs.to_array(), np.hstack([a6.to_array(), a5.to_array()]) + gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) ) @@ -694,7 +694,6 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - np.array_equal(ser.nullmask.to_array(), result.to_array()) def test_series_round_half_up(): @@ -958,14 +957,8 @@ def test_series_update(data, other): ps = gs.to_pandas() - gs_column_before = gs._column - gs.update(g_other) - gs_column_after = gs._column - - assert_eq(gs_column_before.to_array(), gs_column_after.to_array()) - ps.update(p_other) - + gs.update(g_other) assert_eq(gs, ps) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index fed391ac6be..3a42411c839 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -94,10 +94,10 @@ def test_series_argsort(nelem, dtype, asc): res = sr.argsort(ascending=asc) if asc: - expected = np.argsort(sr.to_array(), kind="mergesort") + expected = np.argsort(sr.to_numpy(), kind="mergesort") 
else: - expected = np.argsort(sr.to_array() * -1, kind="mergesort") - np.testing.assert_array_equal(expected, res.to_array()) + expected = np.argsort(sr.to_numpy() * -1, kind="mergesort") + np.testing.assert_array_equal(expected, res.to_numpy()) @pytest.mark.parametrize( @@ -165,8 +165,8 @@ def test_dataframe_nlargest(nelem, n): # Check inds = np.argsort(aa) - assert_eq(res["a"].to_array(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_array(), bb[inds][-n:][::-1]) + assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) + assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) assert_eq(res.index.values, inds[-n:][::-1]) @@ -180,8 +180,8 @@ def test_dataframe_nsmallest(nelem, n): # Check inds = np.argsort(-aa) - assert_eq(res["a"].to_array(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_array(), bb[inds][-n:][::-1]) + assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) + assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) assert_eq(res.index.values, inds[-n:][::-1]) diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index e10ad8e5306..f7cf597afd2 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -46,7 +46,7 @@ def test_fillna(): masked_col = gar[8] sr = Series(data=masked_col.data) dense = sr.nans_to_nulls().fillna(123) - np.testing.assert_equal(123, dense.to_array()) + np.testing.assert_equal(123, dense.to_numpy()) assert len(dense) == len(sr) assert dense.null_count == 0 @@ -58,8 +58,8 @@ def test_to_dense_array(): sr = Series.from_masked_array(data=data, mask=mask, null_count=3) assert sr.has_nulls assert sr.null_count != len(sr) - filled = sr.to_array(fillna="pandas") - dense = sr.to_array() + filled = sr.to_numpy(na_value=np.nan) + dense = sr.dropna().to_numpy() assert dense.size < filled.size assert filled.size == len(sr) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 759a6a95798..ebe78d56c3f 100644 --- 
a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -84,7 +84,7 @@ def test_series_unique(): arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) - assert set(arr[mask]) == set(sr.unique().to_array()) + assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -239,12 +239,12 @@ def test_kurtosis(data, null_flag): pdata.iloc[[0, 2]] = None got = data.kurtosis() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() expected = pdata.kurtosis() np.testing.assert_array_almost_equal(got, expected) got = data.kurt() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() expected = pdata.kurt() np.testing.assert_array_almost_equal(got, expected) @@ -281,7 +281,7 @@ def test_skew(data, null_flag): got = data.skew() expected = pdata.skew() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) with pytest.raises(NotImplementedError): @@ -338,7 +338,7 @@ def test_series_pct_change(data, periods, fill_method): got = cs.pct_change(periods=periods, fill_method=fill_method) expected = ps.pct_change(periods=periods, fill_method=fill_method) np.testing.assert_array_almost_equal( - got.to_array(fillna="pandas"), expected + got.to_numpy(na_value=np.nan), expected ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a4ceed258db..b254a6ba02c 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -82,7 +82,7 @@ def test_string_export(ps_gs): assert_eq(expect, got) expect = np.array(ps) - got = gs.to_array() + got = gs.to_numpy() assert_eq(expect, got) expect = pa.Array.from_pandas(ps) diff --git a/python/cudf/cudf/tests/test_timedelta.py 
b/python/cudf/cudf/tests/test_timedelta.py index 773bec56634..36a49aa4b33 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -167,17 +167,15 @@ def test_timedelta_from_pandas(data, dtype): ], ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("fillna", [None, "pandas"]) -def test_timedelta_series_to_array(data, dtype, fillna): +def test_timedelta_series_to_numpy(data, dtype): gsr = cudf.Series(data, dtype=dtype) expected = np.array( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) - if fillna is None: - expected = expected[~np.isnan(expected)] + expected = expected[~np.isnan(expected)] - actual = gsr.to_array(fillna=fillna) + actual = gsr.dropna().to_numpy() np.testing.assert_array_equal(expected, actual) diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 0c246554082..021c4052759 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -32,4 +32,4 @@ def test_applymap_python_lambda(dtype, udf, testfunc): out_ser = lhs_ser.applymap(udf) result = testfunc(lhs_arr) - np.testing.assert_almost_equal(result, out_ser.to_array()) + np.testing.assert_almost_equal(result, out_ser.to_numpy()) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index f73f1526c7f..c0018dae47d 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -2,7 +2,6 @@ import pandas as pd import pytest -from numba import cuda import cudf from cudf.core.udf.pipeline import nulludf @@ -15,12 +14,7 @@ operator.truediv, operator.floordiv, operator.mod, - pytest.param( - operator.pow, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/8470" - ), - ), + operator.pow, ] comparison_ops = [ @@ -34,13 +28,6 @@ def run_masked_udf_test(func_pdf, func_gdf, data, 
**kwargs): - - # Skip testing CUDA 11.0 - runtime = cuda.cudadrv.runtime.Runtime() - mjr, mnr = runtime.get_version() - if mjr < 11 or (mjr == 11 and mnr < 1): - pytest.skip("Skip testing for CUDA 11.0") - gdf = data pdf = data.to_pandas(nullable=True) @@ -53,6 +40,15 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs): assert_eq(expect, obtain, **kwargs) +def run_masked_udf_series(func_psr, func_gsr, data, **kwargs): + gsr = data + psr = data.to_pandas(nullable=True) + + expect = psr.apply(func_psr) + obtain = gsr.apply(func_gsr) + assert_eq(expect, obtain, **kwargs) + + @pytest.mark.parametrize("op", arith_ops) def test_arith_masked_vs_masked(op): # This test should test all the typing @@ -91,8 +87,9 @@ def func_gdf(x, y): @pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5]) -def test_arith_masked_vs_constant(op, constant): +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) +def test_arith_masked_vs_constant(op, constant, data): def func_pdf(x): return op(x, constant) @@ -100,15 +97,28 @@ def func_pdf(x): def func_gdf(x): return op(x, constant) - # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": [1, 2, None]}) + gdf = cudf.DataFrame({"data": data}) + if constant is False and op in { + operator.mod, + operator.pow, + operator.truediv, + operator.floordiv, + }: + # The following tests cases yield undefined behavior: + # - truediv(x, False) because its dividing by zero + # - floordiv(x, False) because its dividing by zero + # - mod(x, False) because its mod by zero, + # - pow(x, False) because we have an NA in the series and pandas + # insists that (NA**0 == 1) where we do not + pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5]) -def test_arith_masked_vs_constant_reflected(op, constant): 
+@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) +def test_arith_masked_vs_constant_reflected(op, constant, data): def func_pdf(x): return op(constant, x) @@ -117,13 +127,20 @@ def func_gdf(x): return op(constant, x) # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": [1, 2, None]}) - + gdf = cudf.DataFrame({"data": data}) + + if constant == 1 and op is operator.pow: + # The following tests cases yield differing results from pandas: + # - 1**NA + # - True**NA + # both due to pandas insisting that this is equal to 1. + pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_null(op): +@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) +def test_arith_masked_vs_null(op, data): def func_pdf(x): return op(x, pd.NA) @@ -131,7 +148,11 @@ def func_pdf(x): def func_gdf(x): return op(x, cudf.NA) - gdf = cudf.DataFrame({"data": [1, None, 3]}) + gdf = cudf.DataFrame({"data": data}) + + if 1 in gdf["data"] and op is operator.pow: + # In pandas, 1**NA == 1. 
+ pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @@ -255,6 +276,18 @@ def func_gdf(x): run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) +def test_apply_return_literal_only(): + def func_pdf(x): + return 5 + + @nulludf + def func_gdf(x): + return 5 + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + + def test_apply_everything(): def func_pdf(w, x, y, z): if x is pd.NA: @@ -290,3 +323,110 @@ def func_gdf(w, x, y, z): } ) run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + + +### + + +@pytest.mark.parametrize( + "data", [cudf.Series([1, 2, 3]), cudf.Series([1, cudf.NA, 3])] +) +def test_series_apply_basic(data): + def func(x): + return x + 1 + + run_masked_udf_series(func, func, data, check_dtype=False) + + +def test_series_apply_null_conditional(): + def func_pdf(x): + if x is pd.NA: + return 42 + else: + return x - 1 + + def func_gdf(x): + if x is cudf.NA: + return 42 + else: + return x - 1 + + data = cudf.Series([1, cudf.NA, 3]) + + run_masked_udf_series(func_pdf, func_gdf, data) + + +### + + +@pytest.mark.parametrize("op", arith_ops) +def test_series_arith_masked_vs_masked(op): + def func(x): + return op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", comparison_ops) +def test_series_compare_masked_vs_masked(op): + """ + In the series case, only one other MaskedType to compare with + - itself + """ + + def func(x): + return op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops) +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant(op, constant): + def func(x): + return op(x, constant) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + if constant is cudf.NA and op 
is operator.pow: + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + with pytest.xfail(): + run_masked_udf_series(func, func, data, check_dtype=False) + return + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops) +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant_reflected(op, constant): + def func(x): + return op(constant, x) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + if constant is not cudf.NA and constant == 1 and op is operator.pow: + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + with pytest.xfail(): + run_masked_udf_series(func, func, data, check_dtype=False) + return + run_masked_udf_series(func, func, data, check_dtype=False) + + +def test_series_masked_is_null_conditional(): + def func_psr(x): + if x is pd.NA: + return 42 + else: + return x + + def func_gsr(x): + if x is cudf.NA: + return 42 + else: + return x + + data = cudf.Series([1, cudf.NA, 3, cudf.NA]) + + run_masked_udf_series(func_psr, func_gsr, data, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 25ebe6fa710..22c78b5f933 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -19,16 +19,16 @@ def test_series_abs(dtype): arr = (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - np.testing.assert_equal(sr.abs().to_array(), np.abs(arr)) - np.testing.assert_equal(abs(sr).to_array(), abs(arr)) + np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) + np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): arr = (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - np.testing.assert_equal((~sr).to_array(), np.invert(arr)) - np.testing.assert_equal((~sr).to_array(), ~arr) + np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) + 
np.testing.assert_equal((~sr).to_numpy(), ~arr) @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES + ["bool"]) @@ -41,28 +41,28 @@ def test_series_not(dtype): arr = arr * (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - result = cudf.logical_not(sr).to_array() + result = cudf.logical_not(sr).to_numpy() expect = np.logical_not(arr) np.testing.assert_equal(result, expect) - np.testing.assert_equal((~sr).to_array(), ~arr) + np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal((-sr).to_array(), -arr) + np.testing.assert_equal((-sr).to_numpy(), -arr) def test_series_ceil(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.ceil().to_array(), np.ceil(arr)) + np.testing.assert_equal(sr.ceil().to_numpy(), np.ceil(arr)) def test_series_floor(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.floor().to_array(), np.floor(arr)) + np.testing.assert_equal(sr.floor().to_numpy(), np.floor(arr)) @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128]) @@ -77,7 +77,7 @@ def test_validity_ceil(nelem): res = sr.ceil() na_value = -100000 - got = res.fillna(na_value).to_array() + got = res.fillna(na_value).to_numpy() res_mask = np.asarray(bitmask, dtype=np.bool_)[: data.size] expect = np.ceil(data) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 727bbb1c345..7b7fe674210 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -210,6 +210,16 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset): _udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) +def make_cache_key(udf, sig): + codebytes = udf.__code__.co_code + if udf.__closure__ is not None: + cvars = tuple([x.cell_contents for x in udf.__closure__]) + cvarbytes = dumps(cvars) + else: + cvarbytes = b"" + return codebytes, cvarbytes, sig + + def 
compile_udf(udf, type_signature): """Compile ``udf`` with `numba` @@ -244,14 +254,7 @@ def compile_udf(udf, type_signature): # Check if we've already compiled a similar (but possibly distinct) # function before - codebytes = udf.__code__.co_code - if udf.__closure__ is not None: - cvars = tuple([x.cell_contents for x in udf.__closure__]) - cvarbytes = dumps(cvars) - else: - cvarbytes = b"" - - key = (type_signature, codebytes, cvarbytes) + key = make_cache_key(udf, type_signature) res = _udf_code_cache.get(key) if res: return res diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index bdaf5e144a5..c5620bed078 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -499,6 +499,34 @@ def find_common_type(dtypes): if len(dtypes) == 0: return None + # Early exit for categoricals since they're not hashable and therefore + # can't be put in a set. + if any(cudf.api.types.is_categorical_dtype(dtype) for dtype in dtypes): + if all( + ( + cudf.api.types.is_categorical_dtype(dtype) + and (not dtype.ordered if hasattr(dtype, "ordered") else True) + ) + for dtype in dtypes + ): + if len(set(dtype._categories.dtype for dtype in dtypes)) == 1: + return cudf.CategoricalDtype( + cudf.core.column.concat_columns( + [dtype._categories for dtype in dtypes] + ).unique() + ) + else: + raise ValueError( + "Only unordered categories of the same underlying type " + "may be coerced to a common type." + ) + else: + # TODO: Should this be an error case (mixing categorical with other + # dtypes) or should this return object? Unclear if we have enough + # information to decide right now, may have to come back to this as + # usage of find_common_type increases. 
+ return cudf.dtype("O") + # Aggregate same types dtypes = set(dtypes) diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index f0e2b2f8bcf..d8dce276820 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -36,6 +36,6 @@ sphinx sphinx-copybutton sphinx-markdown-tables sphinxcontrib-websupport -transformers +transformers<=4.10.3 typing_extensions wheel diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index df7e78354cd..c11d108360d 100644 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -36,6 +36,6 @@ sphinx sphinx-copybutton sphinx-markdown-tables sphinxcontrib-websupport -transformers +transformers<=4.10.3 typing_extensions wheel diff --git a/python/cudf/setup.py b/python/cudf/setup.py index cb8ca9158e1..c081a719808 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -52,7 +52,7 @@ "python-snappy>=0.6.0", "pyorc", "msgpack", - "transformers", + "transformers<=4.10.3", ] } diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index e604e5511da..d8037cadd7c 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -420,7 +420,7 @@ def struct(self): class Index(Series, dd.core.Index): - _partition_type = cudf.Index + _partition_type = cudf.Index # type: ignore def _naive_var(ddf, meta, skipna, ddof, split_every, out): diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 7e2c3a4f36c..0cf9d835523 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
import math from operator import getitem +from typing import Set import numpy as np import pandas as pd @@ -378,7 +379,7 @@ def _is_supported(arg, supported: set): """ if isinstance(arg, (list, dict)): if isinstance(arg, dict): - _global_set = set() + _global_set: Set[str] = set() for col in arg: if isinstance(arg[col], list): _global_set = _global_set.union(set(arg[col])) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 850cc0843cc..f5224681a1e 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -5,6 +5,7 @@ from io import BufferedWriter, BytesIO, IOBase import numpy as np +import pyarrow as pa from pyarrow import dataset as pa_ds, parquet as pq from dask import dataframe as dd @@ -45,7 +46,6 @@ def read_metadata(*args, **kwargs): and strings_to_cats ): new_meta._data[col] = new_meta._data[col].astype("int32") - return (new_meta, stats, parts, index) @classmethod @@ -332,12 +332,13 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. 
if schema: for col_name, col in df._data.items(): - if col_name in schema.names and isinstance( - col, cudf.core.column.StringColumn + typ = schema.field(col_name).type + if ( + col_name in schema.names + and not isinstance(typ, (pa.ListType, pa.StructType)) + and isinstance(col, cudf.core.column.StringColumn) ): - df._data[col_name] = col.astype( - cudf_dtype_from_pa_type(schema.field(col_name).type) - ) + df._data[col_name] = col.astype(cudf_dtype_from_pa_type(typ)) def read_parquet( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d512418ed45..4c263ca2e53 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -491,6 +491,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): @pytest.mark.parametrize( "data", [ + ["dog", "cat", "fish"], [[0], [1, 2], [3]], [None, [1, 2], [3]], [{"f1": 1}, {"f1": 0, "f2": "dog"}, {"f2": "cat"}], @@ -502,5 +503,7 @@ def test_cudf_dtypes_from_pandas(tmpdir, data): fn = str(tmpdir.join("test.parquet")) dfp = pd.DataFrame({"data": data}) dfp.to_parquet(fn, engine="pyarrow", index=True) - ddf2 = dask_cudf.read_parquet(fn) + # Use `split_row_groups=True` to avoid "fast path" where + # schema is not passed through in older Dask versions + ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True) dd.assert_eq(cudf.from_pandas(dfp), ddf2)