diff --git a/CHANGELOG.md b/CHANGELOG.md index de00213a6f6..b46ac22d767 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 21.12.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. + # cuDF 21.10.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index e73153ce0c3..c2544ff7ffe 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -36,6 +36,9 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" # like `/tmp` is. export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" +# Dask & Distributed git tag +export DASK_DISTRIBUTED_GIT_TAG='2021.09.1' + function remove_libcudf_kernel_cache_dir { EXITCODE=$? logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" @@ -75,10 +78,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # conda install "your-pkg=1.0.0" # Install the master version of dask, distributed, and streamz -logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +logger "pip install git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps" +pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps +logger "pip install git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps logger "pip install 
git+https://github.com/python-streamz/streamz.git@master --upgrade --no-deps" pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8e5b4d80115..7c5b9d836dd 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` +# Dask & Distributed git tag +export DASK_DISTRIBUTED_GIT_TAG='2021.09.1' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -101,8 +104,8 @@ function install_dask { # Install the main version of dask, distributed, and streamz gpuci_logger "Install the main version of dask, distributed, and streamz" set -x - pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps # Need to uninstall streamz that is already in the env. 
pip uninstall -y streamz pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index bbe1ae70499..d5251b18582 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.10.* + - rmm=21.12.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 @@ -39,8 +39,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -58,7 +58,7 @@ dependencies: - transformers - pydata-sphinx-theme - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.09.1 + - git+https://github.com/dask/distributed.git@2021.09.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index ed4c3ee2efc..7ab2cd60ce3 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.10.* + - rmm=21.12.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 @@ -39,8 +39,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -58,7 +58,7 @@ dependencies: - transformers - pydata-sphinx-theme - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - 
git+https://github.com/dask/dask.git@2021.09.1 + - git+https://github.com/dask/distributed.git@2021.09.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index d0965e97567..db8aa8e6c85 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -31,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 1b2c4efd610..45d96a2de85 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -26,13 +26,13 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 run: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 test: # [linux64] requires: # [linux64] diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index 3fc1182b33b..f56610bea86 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -8,6 +8,15 @@ function logger() { echo -e "\n>>>> $@\n" } +# Importing cudf on arm64 CPU only nodes is currently not working due to a +# difference in reported gpu devices between arm64 and amd64 +ARCH=$(arch) + +if [ "${ARCH}" = "aarch64" ]; then + logger "Skipping tests on arm64" + exit 0 +fi + # Install the latest version of dask and distributed logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0f05dcb4bb3..fd687de6698 
100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -93,6 +93,7 @@ test: - test -f $PREFIX/include/cudf/detail/sequence.hpp - test -f $PREFIX/include/cudf/detail/sorting.hpp - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp + - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp @@ -238,6 +239,7 @@ test: - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp + - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c72c258fd18..982fee640d9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUDF) -project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. 
@@ -236,8 +236,9 @@ add_library(cudf src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu src/groupby/sort/group_rank_scan.cu - src/groupby/sort/group_sum_scan.cu src/groupby/sort/group_replace_nulls.cu + src/groupby/sort/group_sum_scan.cu + src/groupby/sort/group_tdigest.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu @@ -318,6 +319,7 @@ add_library(cudf src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu + src/quantiles/tdigest/tdigest.cu src/quantiles/quantile.cu src/quantiles/quantiles.cu src/reductions/all.cu @@ -565,6 +567,7 @@ add_library(cudftestutil STATIC tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu tests/utilities/table_utilities.cu + tests/io/metadata_utilities.cpp tests/strings/utilities.cu) set_target_properties(cudftestutil diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index cade101cbfd..16d50fd3388 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvcomp VERSION) GLOBAL_TARGETS nvcomp::nvcomp CPM_ARGS GITHUB_REPOSITORY NVIDIA/nvcomp - GIT_TAG 4f4e5713e69473be6e0c8ae483a932f666ae3c2f + GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d OPTIONS "BUILD_STATIC ON" "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 72524996a69..1141f20e3b1 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.10.00 +PROJECT_NUMBER = 21.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). 
If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index aef477c91e1..4175b34ff40 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -6,7 +6,7 @@ set(CPM_DOWNLOAD_VERSION v0.32.2) file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-21.10) +set(CUDF_TAG branch-21.12) CPMFindPackage(NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index c302895880d..fb6401a3cc1 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -87,7 +87,9 @@ class aggregation { CUDA, ///< CUDA UDF based reduction MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2 ///< merge partial values of M2 aggregation + MERGE_M2, ///< merge partial values of M2 aggregation + TDIGEST, ///< create a tdigest from a set of input values + MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together }; aggregation() = delete; @@ -493,5 +495,80 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a TDIGEST aggregation + * + * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input 
values. + * The input aggregation values are expected to be fixed-width numeric types. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. `max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A TDIGEST aggregation object. + */ +template +std::unique_ptr make_tdigest_aggregation(int max_centroids = 1000); + +/** + * @brief Factory to create a MERGE_TDIGEST aggregation + * + * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation` + * or `make_merge_tdigest_aggregation` to produce a new a tdigest + * (https://arxiv.org/pdf/1902.04023.pdf) column. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. 
The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. `max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A MERGE_TDIGEST aggregation object. + */ +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids = 1000); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4cf902ef562..05d1bf3e595 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class tdigest_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_tdigest_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class tdigest_aggregation const& agg); + virtual void visit(class merge_tdigest_aggregation const& agg); }; /** @@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const 
override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying TDIGEST aggregation + */ +class tdigest_aggregation final : public groupby_aggregation { + public: + explicit tdigest_aggregation(int max_centroids_) + : aggregation{TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation + */ +class merge_tdigest_aggregation final : public groupby_aggregation { + public: + explicit merge_tdigest_aggregation(int max_centroids_) + : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. 
* @@ -954,14 +1008,16 @@ template struct target_type_impl< Source, k, - std::enable_if_t() && !is_chrono() && (k == aggregation::MEAN)>> { + std::enable_if_t() && not is_chrono() && + not is_fixed_point() && (k == aggregation::MEAN)>> { using type = double; }; template -struct target_type_impl() && (k == aggregation::MEAN)>> { +struct target_type_impl< + Source, + k, + std::enable_if_t<(is_chrono() or is_fixed_point()) && (k == aggregation::MEAN)>> { using type = Source; }; @@ -1118,6 +1174,24 @@ struct target_type_impl { using type = struct_view; }; +// Always use numeric types for TDIGEST +template +struct target_type_impl() || is_fixed_point())>> { + using type = struct_view; +}; + +// TDIGEST_MERGE. The root column type for a tdigest column is a list_view. Strictly +// speaking, this check is not sufficient to guarantee we are actually being given a +// real tdigest column, but we will do further verification inside the aggregation code. +template +struct target_type_impl>> { + using type = struct_view; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1222,6 +1296,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::TDIGEST: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_TDIGEST: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index fb5cfad6186..9f06661c8d1 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -75,6 +75,15 @@ std::vector slice(column_view const& input, std::vector const& indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc 
cudf::slice(table_view const&,std::vector const&) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, * rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index a779c3defbb..ec83e348e33 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -145,5 +145,22 @@ struct row_lexicographic_tagged_comparator { order const* _column_order{}; }; +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 5fb2ce4cbe6..7a76f9cab88 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -22,7 +22,8 @@ namespace cudf { namespace detail { -/** @copydoc cudf::quantile() +/** + * @copydoc cudf::quantile() * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -35,7 +36,8 @@ std::unique_ptr quantile( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @copydoc cudf::quantiles() +/** + * @copydoc cudf::quantiles() * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -49,5 +51,17 @@ std::unique_ptr quantiles( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::percentile_approx(column_view const&, column_view const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr percentile_approx( + column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 3127a5f89f1..b5dfb34c043 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -32,7 +32,7 @@ namespace detail { * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -44,7 +44,7 @@ std::unique_ptr sorted_order( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -90,5 +90,17 @@ std::unique_ptr
segmented_sort_by_key( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::sort + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
sort( + table_view const& values, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp new file mode 100644 index 00000000000..94c22911c1e --- /dev/null +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace detail { + +namespace tdigest { + +// mean and weight column indices within tdigest inner struct columns +constexpr size_type mean_column_index = 0; +constexpr size_type weight_column_index = 1; + +// min and max column indices within tdigest outer struct columns +constexpr size_type centroid_column_index = 0; +constexpr size_type min_column_index = 1; +constexpr size_type max_column_index = 2; + +/** + * @brief Verifies that the input column is a valid tdigest column. + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. 
they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param col Column to be checkeed + * + * @throws cudf::logic error if the column is not a valid tdigest column. + */ +void check_is_valid_tdigest_column(column_view const& col); + +/** + * @brief Create an empty tdigest column. + * + * An empty tdigest column contains a single row of length 0 + * + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @returns An empty tdigest column. + */ +std::unique_ptr make_empty_tdigest_column( + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace tdigest +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 4ae09b516a4..17d8e5eb7dd 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -389,7 +389,7 @@ class orc_writer_options { // Set of columns to output table_view _table; // Optional associated metadata - const table_metadata* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend orc_writer_options_builder; @@ -445,7 +445,7 @@ class orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -475,7 +475,7 @@ class orc_writer_options { * * @param meta Associated metadata. 
*/ - void set_metadata(table_metadata* meta) { _metadata = meta; } + void set_metadata(table_input_metadata const* meta) { _metadata = meta; } }; class orc_writer_options_builder { @@ -541,7 +541,7 @@ class orc_writer_options_builder { * @param meta Associated metadata. * @return this for chaining. */ - orc_writer_options_builder& metadata(table_metadata* meta) + orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; @@ -570,6 +570,9 @@ class orc_writer_options_builder { * cudf::io::write_orc(options); * @endcode * + * Note: Support for writing tables with struct columns is currently experimental, the output may + * not be as reliable as writing for other datatypes. + * * @param options Settings for controlling reading behavior. * @param mr Device memory resource to use for device memory allocation. */ @@ -592,7 +595,7 @@ class chunked_orc_writer_options { // Enable writing column statistics bool _enable_statistics = true; // Optional associated metadata - const table_metadata_with_nullability* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend chunked_orc_writer_options_builder; @@ -638,7 +641,7 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata_with_nullability const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -661,7 +664,7 @@ class chunked_orc_writer_options { * * @param meta Associated metadata. */ - void metadata(table_metadata_with_nullability* meta) { _metadata = meta; } + void metadata(table_input_metadata const* meta) { _metadata = meta; } }; class chunked_orc_writer_options_builder { @@ -712,7 +715,7 @@ class chunked_orc_writer_options_builder { * @param meta Associated metadata. * @return this for chaining. 
*/ - chunked_orc_writer_options_builder& metadata(table_metadata_with_nullability* meta) + chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 25cbb6fd554..bc495c61d54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -24,8 +24,6 @@ #include -#include - #include #include #include @@ -375,173 +373,6 @@ table_with_metadata read_parquet( * @{ * @file */ -class table_input_metadata; - -class column_in_metadata { - friend table_input_metadata; - std::string _name = ""; - thrust::optional _nullable; - // TODO: This isn't implemented yet - bool _list_column_is_map = false; - bool _use_int96_timestamp = false; - // bool _output_as_binary = false; - thrust::optional _decimal_precision; - std::vector children; - - public: - /** - * @brief Get the children of this column metadata - * - * @return this for chaining - */ - column_in_metadata& add_child(column_in_metadata const& child) - { - children.push_back(child); - return *this; - } - - /** - * @brief Set the name of this column - * - * @return this for chaining - */ - column_in_metadata& set_name(std::string const& name) - { - _name = name; - return *this; - } - - /** - * @brief Set the nullability of this column - * - * Only valid in case of chunked writes. In single writes, this option is ignored. - * - * @return column_in_metadata& - */ - column_in_metadata& set_nullability(bool nullable) - { - _nullable = nullable; - return *this; - } - - /** - * @brief Specify that this list column should be encoded as a map in the written parquet file - * - * The column must have the structure list>. 
This option is invalid otherwise - * - * @return this for chaining - */ - column_in_metadata& set_list_column_as_map() - { - _list_column_is_map = true; - return *this; - } - - /** - * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 - * physical type. Only valid for the following column types: - * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns - * - * @param req True = use int96 physical type. False = use int64 physical type - * @return this for chaining - */ - column_in_metadata& set_int96_timestamps(bool req) - { - _use_int96_timestamp = req; - return *this; - } - - /** - * @brief Set the decimal precision of this column. Only valid if this column is a decimal - * (fixed-point) type - * - * @param precision The integer precision to set for this decimal column - * @return this for chaining - */ - column_in_metadata& set_decimal_precision(uint8_t precision) - { - _decimal_precision = precision; - return *this; - } - - /** - * @brief Get reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata& child(size_type i) { return children[i]; } - - /** - * @brief Get const reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata const& child(size_type i) const { return children[i]; } - - /** - * @brief Get the name of this column - */ - std::string get_name() const { return _name; } - - /** - * @brief Get whether nullability has been explicitly set for this column. - */ - bool is_nullability_defined() const { return _nullable.has_value(); } - - /** - * @brief Gets the explicitly set nullability for this column. - * @throws If nullability is not explicitly defined for this column. - * Check using `is_nullability_defined()` first. 
- */ - bool nullable() const { return _nullable.value(); } - - /** - * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. - */ - bool is_map() const { return _list_column_is_map; } - - /** - * @brief Get whether to encode this timestamp column using deprecated int96 physical type - */ - bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } - - /** - * @brief Get whether precision has been set for this decimal column - */ - bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } - - /** - * @brief Get the decimal precision that was set for this column. - * @throws If decimal precision was not set for this column. - * Check using `is_decimal_precision_set()` first. - */ - uint8_t get_decimal_precision() const { return _decimal_precision.value(); } - - /** - * @brief Get the number of children of this column - */ - size_type num_children() const { return children.size(); } -}; - -class table_input_metadata { - public: - table_input_metadata() = default; // Required by cython - - /** - * @brief Construct a new table_input_metadata from a table_view. - * - * The constructed table_input_metadata has the same structure as the passed table_view - * - * @param table The table_view to construct metadata for - * @param user_data Optional Additional metadata to encode, as key-value pairs - */ - table_input_metadata(table_view const& table, std::map user_data = {}); - - std::vector column_metadata; - std::map user_data; //!< Format-dependent metadata as key-values pairs -}; /** * @brief Class to build `parquet_writer_options`. 
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 661b36f10c8..ac965e2d416 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -23,6 +23,8 @@ #include +#include + #include #include #include @@ -125,34 +127,6 @@ struct table_metadata { std::map user_data; //!< Format-dependent metadata as key-values pairs }; -/** - * @brief Derived class of table_metadata which includes flattened nullability information of input. - * - * This information is used as an optimization for chunked writes. If the caller leaves - * column_nullable uninitialized, the writer code will assume the worst case : that all columns are - * nullable. - * - * If the column_nullable field is not empty, it is expected that it has a length equal to the - * number of columns in the flattened table being written. - * - * Flattening refers to the flattening of nested columns. For list columns, the number of values - * expected in the nullability vector is equal to the depth of the nesting. e.g. for a table of - * three columns of types: {int, list, float}, the nullability vector contains the values: - * - * |Index| Nullability of | - * |-----|----------------------------------------| - * | 0 | int column | - * | 1 | Level 0 of list column (list itself) | - * | 2 | Level 1 of list column (double values) | - * | 3 | float column | - * - * In the case where column nullability is known, pass `true` if the corresponding column could - * contain nulls in one or more subtables to be written, otherwise `false`. - */ -struct table_metadata_with_nullability : public table_metadata { - std::vector column_nullable; //!< Per-column nullability information. 
-}; - /** * @brief Table with table metadata used by io readers to return the metadata by value */ @@ -234,5 +208,174 @@ struct sink_info { } }; +class table_input_metadata; + +class column_in_metadata { + friend table_input_metadata; + std::string _name = ""; + thrust::optional _nullable; + bool _list_column_is_map = false; + bool _use_int96_timestamp = false; + // bool _output_as_binary = false; + thrust::optional _decimal_precision; + std::vector children; + + public: + column_in_metadata() = default; + column_in_metadata(std::string_view name) : _name{name} {} + /** + * @brief Get the children of this column metadata + * + * @return this for chaining + */ + column_in_metadata& add_child(column_in_metadata const& child) + { + children.push_back(child); + return *this; + } + + /** + * @brief Set the name of this column + * + * @return this for chaining + */ + column_in_metadata& set_name(std::string const& name) + { + _name = name; + return *this; + } + + /** + * @brief Set the nullability of this column + * + * Only valid in case of chunked writes. In single writes, this option is ignored. + * + * @return column_in_metadata& + */ + column_in_metadata& set_nullability(bool nullable) + { + _nullable = nullable; + return *this; + } + + /** + * @brief Specify that this list column should be encoded as a map in the written parquet file + * + * The column must have the structure list>. This option is invalid otherwise + * + * @return this for chaining + */ + column_in_metadata& set_list_column_as_map() + { + _list_column_is_map = true; + return *this; + } + + /** + * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 + * physical type. Only valid for the following column types: + * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns + * + * @param req True = use int96 physical type. 
False = use int64 physical type + * @return this for chaining + */ + column_in_metadata& set_int96_timestamps(bool req) + { + _use_int96_timestamp = req; + return *this; + } + + /** + * @brief Set the decimal precision of this column. Only valid if this column is a decimal + * (fixed-point) type + * + * @param precision The integer precision to set for this decimal column + * @return this for chaining + */ + column_in_metadata& set_decimal_precision(uint8_t precision) + { + _decimal_precision = precision; + return *this; + } + + /** + * @brief Get reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata& child(size_type i) { return children[i]; } + + /** + * @brief Get const reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata const& child(size_type i) const { return children[i]; } + + /** + * @brief Get the name of this column + */ + std::string get_name() const { return _name; } + + /** + * @brief Get whether nullability has been explicitly set for this column. + */ + bool is_nullability_defined() const { return _nullable.has_value(); } + + /** + * @brief Gets the explicitly set nullability for this column. + * @throws If nullability is not explicitly defined for this column. + * Check using `is_nullability_defined()` first. + */ + bool nullable() const { return _nullable.value(); } + + /** + * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. 
+ */ + bool is_map() const { return _list_column_is_map; } + + /** + * @brief Get whether to encode this timestamp column using deprecated int96 physical type + */ + bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + + /** + * @brief Get whether precision has been set for this decimal column + */ + bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + + /** + * @brief Get the decimal precision that was set for this column. + * @throws If decimal precision was not set for this column. + * Check using `is_decimal_precision_set()` first. + */ + uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + + /** + * @brief Get the number of children of this column + */ + size_type num_children() const { return children.size(); } +}; + +class table_input_metadata { + public: + table_input_metadata() = default; // Required by cython + + /** + * @brief Construct a new table_input_metadata from a table_view. + * + * The constructed table_input_metadata has the same structure as the passed table_view + * + * @param table The table_view to construct metadata for + * @param user_data Optional Additional metadata to encode, as key-value pairs + */ + table_input_metadata(table_view const& table, std::map user_data = {}); + + std::vector column_metadata; + std::map user_data; //!< Format-dependent metadata as key-values pairs +}; + } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index f1ce3b7f0e3..e778428510d 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -28,32 +28,32 @@ namespace lists { */ /** - * @brief Create a new lists column by removing duplicated entries from each list element in the - * given lists column + * @brief Create a new lists column by extracting unique entries from list elements in the given + * lists column. 
* - * @throw cudf::logic_error if any row (list element) in the input column is a nested type. - * - * Given an `input` lists_column_view, the list elements in the column are copied to an output lists + * Given an input lists column, the list elements in the column are copied to an output lists * column such that their duplicated entries are dropped out to keep only the unique ones. The * order of those entries within each list are not guaranteed to be preserved as in the input. In * the current implementation, entries in the output lists are sorted by ascending order (nulls * last), but this is not guaranteed in future implementation. * - * @param lists_column The input lists_column_view - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating point data column) - * @param mr Device resource used to allocate memory + * @throw cudf::logic_error if the child column of the input lists column contains nested type other + * than struct. + * + * @param lists_column The input lists column to extract lists with unique entries. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column). + * @param mr Device resource used to allocate memory. * * @code{.pseudo} - * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * input = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } * - * Note that permuting the entries of each list in this output also produces another valid - * output. + * Note that permuting the entries of each list in this output also produces another valid output. 
* @endcode * - * @return A list column with list elements having unique entries + * @return A lists column with list elements having unique entries. */ std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 94b5c344f4f..d21f6dff79c 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -94,5 +95,32 @@ std::unique_ptr
quantiles( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Calculate approximate percentiles on an input tdigest column. + * + * tdigest (https://arxiv.org/pdf/1902.04023.pdf) columns are produced specifically + * by the TDIGEST and MERGE_TDIGEST aggregations. These columns represent + * compressed representations of a very large input data set that can be + * queried for quantile information. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest from `input` row `i`. The length of each output list + * is the number of percentages specified in `percentages`. + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @throws cudf::logic_error if `input` is not a valid tdigest column. + * @throws cudf::logic_error if `percentiles` is not a FLOAT64 column. + * + * @returns LIST Column containing requested percentile values as FLOAT64. 
+ */ +std::unique_ptr percentile_approx( + structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 36a8131a78e..69eb8b3490a 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -58,7 +58,7 @@ enum class rank_method { * `input` if it were sorted */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -72,7 +72,7 @@ std::unique_ptr sorted_order( * @copydoc cudf::sorted_order */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -112,7 +112,7 @@ bool is_sorted(cudf::table_view const& table, * @return New table containing the desired sorted order of `input` */ std::unique_ptr
sort( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index 458587946f2..cbd09fa7b0d 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -104,6 +104,7 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type /** * @brief Indicates whether the specified bit is set to `1` * + * @param bitmask The bitmask containing the bit to clear * @param bit_index Index of the bit to test * @return true The specified bit is `1` * @return false The specified bit is `0` @@ -114,6 +115,23 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); } +/** + * @brief optional-like interface to check if a specified bit of a bitmask is set. + * + * @param bitmask The bitmask containing the bit to clear + * @param bit_index Index of the bit to test + * @param default_value Value to return if `bitmask` is nullptr + * @return true The specified bit is `1` + * @return false The specified bit is `0` + * @return `default_value` if `bitmask` is nullptr + */ +CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, + size_type bit_index, + bool default_value) +{ + return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value; +} + /** * @brief Returns a bitmask word with the `n` least significant bits set. 
* diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index f4e7e3e2a6d..40a833112e1 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -142,6 +142,31 @@ constexpr inline bool is_equality_comparable() return detail::is_equality_comparable_impl::value; } +namespace detail { +/** + * @brief Helper functor to check if a specified type `T` supports equality comparisons. + */ +struct unary_equality_comparable_functor { + template + bool operator()() const + { + return cudf::is_equality_comparable(); + } +}; +} // namespace detail + +/** + * @brief Checks whether `data_type` `type` supports equality comparisons. + * + * @param type Data_type for comparison. + * @return true If `type` supports equality comparisons. + * @return false If `type` does not support equality comparisons. + */ +inline bool is_equality_comparable(data_type type) +{ + return cudf::type_dispatcher(type, detail::unary_equality_comparable_functor{}); +} + /** * @brief Indicates whether the type `T` is a numeric type. * diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 553d8a97bd2..aa77686fee4 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -38,6 +38,8 @@ enum class debug_output_level { QUIET // no debug output }; +constexpr size_type default_ulp = 4; + /** * @brief Verifies the property equality of two columns. 
* @@ -93,12 +95,15 @@ bool expect_columns_equal(cudf::column_view const& lhs, * @param lhs The first column * @param rhs The second column * @param verbosity Level of debug output verbosity + * @param fp_ulps # of ulps of tolerance to allow when comparing + * floating point values * * @returns True if the columns (and their properties) are equivalent, false otherwise */ bool expect_columns_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs, - debug_output_level verbosity = debug_output_level::FIRST_ERROR); + debug_output_level verbosity = debug_output_level::FIRST_ERROR, + size_type fp_ulps = cudf::test::default_ulp); /** * @brief Verifies the bitwise equality of two device memory buffers. diff --git a/cpp/include/cudf_test/io_metadata_utilities.hpp b/cpp/include/cudf_test/io_metadata_utilities.hpp new file mode 100644 index 00000000000..6ca6eba6884 --- /dev/null +++ b/cpp/include/cudf_test/io_metadata_utilities.hpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta); + +} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 020f5c76c10..9f060c93215 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -project(CUDA_KAFKA VERSION 21.10.00 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.12.00 LANGUAGES CXX) # Set a default build type if none was specified rapids_cmake_build_type(Release) diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c3d992e1181..b550b61785b 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,18 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + // aggregation_finalizer ---------------------------------------- void aggregation_finalizer::visit(aggregation const& agg) {} @@ -346,6 +358,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + } // namespace detail std::vector> aggregation::get_simple_aggregations( @@ -668,6 +690,25 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +template 
+std::unique_ptr make_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_tdigest_aggregation(int max_centroids); +template std::unique_ptr make_tdigest_aggregation( + int max_centroids); + +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index 0e41689dc4b..d1c12056393 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -63,17 +63,9 @@ std::vector slice(column_view const& input, return std::vector{begin, begin + indices.size() / 2}; } -} // namespace detail - -std::vector slice(cudf::column_view const& input, - std::vector const& indices) -{ - CUDF_FUNC_RANGE(); - return detail::slice(input, indices, rmm::cuda_stream_default); -} - -std::vector slice(cudf::table_view const& input, - std::vector const& indices) +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -81,7 +73,7 @@ std::vector slice(cudf::table_view const& input, // 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j] // where i is the i'th column of the j'th table_view - auto op = [&indices](auto const& c) { return cudf::slice(c, indices); }; + auto op = [&indices, stream](auto const& c) { return cudf::detail::slice(c, indices, stream); }; auto f = thrust::make_transform_iterator(input.begin(), op); auto sliced_table = std::vector>(f, f + input.num_columns()); @@ -99,6 +91,22 @@ std::vector slice(cudf::table_view const& input, } return result; -}; +} + +} // namespace detail + 
+std::vector slice(cudf::column_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} + +std::vector slice(cudf::table_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} } // namespace cudf diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 3da20fb9af3..2804dea576e 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 533f193d692..bdaccba38dc 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -27,10 +27,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -76,6 +78,9 @@ std::pair, std::vector> groupby::disp // Optionally flatten nested key columns. 
auto [flattened_keys, _, __, ___] = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); + auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; + CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), + "Unsupported groupby key type does not support equality comparison"); auto [grouped_keys, results] = detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys), diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 726b51b7702..9f3d67ac38b 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -525,6 +525,97 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). 
+ */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + + auto count_agg = make_count_aggregation(); + operator()(*count_agg); + column_view valid_counts = cache.get_result(col_idx, *count_agg); + + cache.add_result(col_idx, + agg, + detail::group_tdigest( + get_sorted_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + {valid_counts.begin(), static_cast(valid_counts.size())}, + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + +/** + * @brief Generate a merged tdigest column from a grouped set of input tdigest columns. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). 
+ */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + cache.add_result(col_idx, + agg, + detail::group_merge_tdigest(get_grouped_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 2770162da2d..cb01ee8e053 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -442,6 +442,94 @@ std::unique_ptr group_merge_m2(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped (and sorted) values to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param group_valid_counts Per-group counts of valid elements. + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. 
+ * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merges tdigests within the same group to generate a new tdigest. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped tdigests to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_merge_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 8eccadd653e..db2ae5b5d8e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -31,77 +31,50 @@ #include #include #include -#include -#include #include namespace cudf { namespace groupby { namespace detail { -// ArgMin binary operator with tuple of (value, index) +/** + * @brief ArgMin binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator. + */ template struct ArgMin { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple const& lhs, - thrust::tuple const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) < thrust::get<0>(rhs) ? lhs : rhs; - } -}; - -// ArgMax binary operator with tuple of (value, index) -template -struct ArgMax { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple const& lhs, - thrust::tuple const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) > thrust::get<0>(rhs) ? 
lhs : rhs; - } -}; - -struct get_tuple_second_element { - template - __device__ size_type operator()(thrust::tuple const& rhs) const + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - return thrust::get<1>(rhs); + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element(lhs) < d_col.element(rhs) ? lhs : rhs; } }; /** - * @brief Functor to store the boolean value to null mask. + * @brief ArgMax binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator. */ -struct bool_to_nullmask { - mutable_column_device_view d_result; - __device__ void operator()(size_type i, bool rhs) +template +struct ArgMax { + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - if (rhs) { - d_result.set_valid(i); - } else { - d_result.set_null(i); - } + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element(rhs) < d_col.element(lhs) ? lhs : rhs; } }; -/** - * @brief Returns index for non-null element, and SENTINEL for null element in a column. - * - */ -struct null_as_sentinel { - column_device_view const col; - size_type const SENTINEL; - __device__ size_type operator()(size_type i) const { return col.is_null(i) ? 
SENTINEL : i; } -}; - /** * @brief Value accessor for column which supports dictionary column too. * @@ -191,25 +164,16 @@ struct reduce_functor { auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); auto valuesview = column_device_view::create(values, stream); if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) { - constexpr auto SENTINEL = - (K == aggregation::ARGMAX ? cudf::detail::ARGMAX_SENTINEL : cudf::detail::ARGMIN_SENTINEL); - auto idx_begin = - cudf::detail::make_counting_transform_iterator(0, null_as_sentinel{*valuesview, SENTINEL}); - // dictionary keys are sorted, so dictionary32 index comparison is enough. - auto column_begin = valuesview->begin(); - auto begin = thrust::make_zip_iterator(thrust::make_tuple(column_begin, idx_begin)); - auto result_begin = thrust::make_transform_output_iterator(resultview->begin(), - get_tuple_second_element{}); using OpType = std::conditional_t<(K == aggregation::ARGMAX), ArgMax, ArgMin>; thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.data(), group_labels.data() + group_labels.size(), - begin, + thrust::make_counting_iterator(0), thrust::make_discard_iterator(), - result_begin, - thrust::equal_to{}, - OpType{}); + resultview->begin(), + thrust::equal_to{}, + OpType{*valuesview}); } else { auto init = OpType::template identity(); auto begin = cudf::detail::make_counting_transform_iterator( diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu new file mode 100644 index 00000000000..5b4252a9063 --- /dev/null +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { + +namespace { + +// the most representative point within a cluster of similar +// values. {mean, weight} +// NOTE: Using a tuple here instead of a struct to take advantage of +// thrust zip iterators for output. +using centroid = thrust::tuple; + +// make a centroid from a scalar with a weight of 1. +template +struct make_centroid { + column_device_view const col; + + centroid operator() __device__(size_type index) + { + return {static_cast(col.element(index)), 1, col.is_valid(index)}; + } +}; + +// make a centroid from an input stream of mean/weight values. 
+struct make_weighted_centroid { + double const* mean; + double const* weight; + + centroid operator() __device__(size_type index) { return {mean[index], weight[index], true}; } +}; + +// merge two centroids +struct merge_centroids { + centroid operator() __device__(centroid const& lhs, centroid const& rhs) + { + bool const lhs_valid = thrust::get<2>(lhs); + bool const rhs_valid = thrust::get<2>(rhs); + if (!lhs_valid && !rhs_valid) { return {0, 0, false}; } + if (!lhs_valid) { return rhs; } + if (!rhs_valid) { return lhs; } + + double const lhs_mean = thrust::get<0>(lhs); + double const rhs_mean = thrust::get<0>(rhs); + double const lhs_weight = thrust::get<1>(lhs); + double const rhs_weight = thrust::get<1>(rhs); + double const new_weight = lhs_weight + rhs_weight; + return {(lhs_mean * lhs_weight + rhs_mean * rhs_weight) / new_weight, new_weight, true}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. + * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the nearest weight that will be <= the next limit is simply the nearest integer < the limit, + * which we can get by just taking floor(next_limit). For example if our next limit is 3.56, the + * nearest whole number <= it is floor(3.56) == 3. + */ +struct nearest_value_scalar_weights { + thrust::pair operator() __device__(double next_limit, size_type) + { + double const f = floor(next_limit); + return {f, max(0, static_cast(next_limit) - 1)}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. + * + * This functor assumes we are dealing with grouped, sorted, weighted centroids. 
+ */ +struct nearest_value_centroid_weights { + double const* cumulative_weights; + offset_type const* outer_offsets; // groups + offset_type const* inner_offsets; // tdigests within a group + + thrust::pair operator() __device__(double next_limit, size_type group_index) + { + auto const tdigest_begin = outer_offsets[group_index]; + auto const tdigest_end = outer_offsets[group_index + 1]; + auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + + auto const index = ((thrust::lower_bound(thrust::seq, + group_cumulative_weights, + group_cumulative_weights + num_weights, + next_limit)) - + group_cumulative_weights); + + return index == 0 ? thrust::pair{0, 0} + : thrust::pair{group_cumulative_weights[index - 1], index - 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input values. + * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the cumulative weight for a given value index I is simply I+1. + */ +struct cumulative_scalar_weight { + cudf::device_span group_offsets; + cudf::device_span group_labels; + std::tuple operator() __device__(size_type value_index) const + { + auto const group_index = group_labels[value_index]; + auto const relative_value_index = value_index - group_offsets[group_index]; + return {group_index, relative_value_index, relative_value_index + 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input centroids. + * + * This functor assumes we are dealing with grouped, weighted centroids. 
 + */ +struct cumulative_centroid_weight { + double const* cumulative_weights; + cudf::device_span group_labels; + offset_type const* outer_offsets; // groups + cudf::device_span inner_offsets; // tdigests within a group + + std::tuple operator() __device__(size_type value_index) const + { + auto const tdigest_index = + static_cast( + thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - + inner_offsets.begin()) - + 1; + auto const group_index = group_labels[tdigest_index]; + auto const first_tdigest_index = outer_offsets[group_index]; + auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const relative_value_index = value_index - first_weight_index; + double const* group_cumulative_weights = cumulative_weights + first_weight_index; + + return {group_index, relative_value_index, group_cumulative_weights[relative_value_index]}; + } +}; + +// a monotonically increasing scale function which produces a distribution +// of centroids that is more densely packed in the middle of the input +// than at the ends. +__device__ double scale_func_k1(double quantile, double delta_norm) +{ + double k = delta_norm * asin(2.0 * quantile - 1.0); + k += 1.0; + double q = (sin(k / delta_norm) + 1.0) / 2.0; + return q; +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. 1 thread + * per group. + * + * This kernel is called in a two-pass style. Once to compute the per-group + * cluster sizes and total # of clusters, and once to compute the actual + * weight limits per cluster. 
 + * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight_ A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight_ A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param group_cluster_wl Output. The set of cluster weight limits for each group. + * @param group_num_clusters Output. The number of output clusters for each input group. + * @param group_cluster_offsets Offsets per-group to the start of its clusters + * + */ +template +__global__ void generate_cluster_limits_kernel(int delta_, + size_type num_groups, + NearestWeightFunc nearest_weight, + TotalWeightIter total_weight_, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + offset_type const* group_cluster_offsets) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const group_index = tid; + if (group_index >= num_groups) { return; } + + // we will generate at most delta clusters. + double const delta = static_cast(delta_); + double const delta_norm = delta / (2.0 * M_PI); + double const total_weight = total_weight_[group_index]; + group_num_clusters[group_index] = 0; + // a group with nothing in it. + if (total_weight <= 0) { return; } + + // start at the correct place based on our cluster offset. + double* cluster_wl = + group_cluster_wl ? group_cluster_wl + group_cluster_offsets[group_index] : nullptr; + + double cur_limit = 0.0; + double cur_weight = 0.0; + double next_limit = -1.0; + int last_inserted_index = -1; + + // compute the first cluster limit + double nearest_w; + int nearest_w_index; + while (1) { + cur_weight = next_limit < 0 ? 
0 : max(cur_weight + 1, nearest_w); + if (cur_weight >= total_weight) { break; } + + // based on where we are closing the cluster off (not including the incoming weight), + // compute the next cluster limit + double const quantile = cur_weight / total_weight; + next_limit = total_weight * scale_func_k1(quantile, delta_norm); + + // if the next limit is < the cur limit, we're past the end of the distribution, so we're done. + if (next_limit <= cur_limit) { + if (cluster_wl) { cluster_wl[group_num_clusters[group_index]] = total_weight; } + group_num_clusters[group_index]++; + break; + } + + // compute the weight we will be at in the input values just before closing off the current + // cluster (because adding the next value will cross the current limit). + // NOTE: can't use structured bindings here. + thrust::tie(nearest_w, nearest_w_index) = nearest_weight(next_limit, group_index); + + if (cluster_wl) { + // because of the way the scale functions work, it is possible to generate clusters + // in such a way that we end up with "gaps" where there are no input values that + // fall into a given cluster. An example would be this: + // + // cluster weight limits = 0.00003, 1.008, 3.008 + // + // input values(weight) = A(1), B(2), C(3) + // + // naively inserting these values into the clusters simply by taking a lower_bound, + // we would get the following distribution of input values into those 3 clusters. + // (), (A), (B,C) + // + // whereas what we really want is: + // + // (A), (B), (C) + // + // to fix this, we will artificially adjust the output cluster limits to guarantee + // at least 1 input value will be put in each cluster during the reduction step. + // this does not affect final centroid results as we still use the "real" weight limits + // to compute subsequent clusters - the purpose is only to allow cluster selection + // during the reduction step to be trivial. 
+ // + double adjusted_next_limit = next_limit; + if (nearest_w_index == last_inserted_index || last_inserted_index < 0) { + nearest_w_index = last_inserted_index + 1; + auto [r, i, adjusted] = cumulative_weight(nearest_w_index); + adjusted_next_limit = max(next_limit, adjusted); + } + cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; + last_inserted_index = nearest_w_index; + } + group_num_clusters[group_index]++; + cur_limit = next_limit; + } +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. + * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tuple containing the set of cluster weight limits for each group, a set of + * list-style offsets indicating group sizes, and the total number of clusters + */ +template +std::tuple, std::unique_ptr, size_type> +generate_group_cluster_info(int delta, + size_type num_groups, + NearestWeight nearest_weight, + TotalWeightIter total_weight, + CumulativeWeight cumulative_weight, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(num_groups, block_size); + + // compute number of clusters per group + // each thread computes 1 set of clusters (# of cluster sets == # of groups) + rmm::device_uvector group_num_clusters(num_groups, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + nullptr, + group_num_clusters.begin(), + nullptr); + + // generate group cluster offsets (where the clusters for a given group start and end) + auto group_cluster_offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, num_groups + 1, mask_state::UNALLOCATED, stream, mr); + auto cluster_size = cudf::detail::make_counting_transform_iterator( + 0, [group_num_clusters = group_num_clusters.begin(), num_groups] __device__(size_type index) { + return index == num_groups ? 
 0 : group_num_clusters[index]; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + cluster_size, + cluster_size + num_groups + 1, + group_cluster_offsets->mutable_view().begin(), + 0); + + // total # of clusters + offset_type total_clusters = + cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); + + // fill in the actual cluster weight limits + rmm::device_uvector group_cluster_wl(total_clusters, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + group_cluster_wl.begin(), + group_num_clusters.begin(), + group_cluster_offsets->view().begin()); + + return {std::move(group_cluster_wl), + std::move(group_cluster_offsets), + static_cast(total_clusters)}; +} + +/** + * @brief Compute a column of tdigests. + * + * Assembles the output tdigest column based on the specified delta, a stream of + * input values (either scalar or centroids), and an assortment of per-group + * clustering information. + * + * This function is effectively just a reduce_by_key that performs a reduction + * from input values -> centroid clusters as defined by the cluster weight + * boundaries. + * + * @param delta tdigest compression level + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. + * @param cumulative_weight Functor which returns cumulative weight and group information for + * an absolute input value index. + * @param min_col Column containing the minimum value per group. + * @param max_col Column containing the maximum value per group. + * @param group_cluster_wl Cluster weight limits for each group. + * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. + * @param total_clusters Total number of clusters in all groups. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tdigest column with 1 row per output tdigest. + */ +template +std::unique_ptr compute_tdigests(int delta, + CentroidIter centroids_begin, + CentroidIter centroids_end, + CumulativeWeight group_cumulative_weight, + std::unique_ptr&& min_col, + std::unique_ptr&& max_col, + rmm::device_uvector const& group_cluster_wl, + std::unique_ptr&& group_cluster_offsets, + size_type total_clusters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // the output for each group is column of data that represents the tdigest. since we want 1 row + // per group, each row will be a list the length of the tdigest for that group. so our output + // column is of the form: + // struct { + // centroids for the digest + // list { + // struct { + // double // mean + // double // weight + // } + // } + // double // min + // double // max + // } + // + // + if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + std::vector> inner_children; + // mean + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // weight + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // tdigest struct + auto tdigests = + cudf::make_structs_column(total_clusters, std::move(inner_children), 0, {}, stream, mr); + + // each input group represents an individual tdigest. within each tdigest, we want the keys + // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall + // into the range 0-99). But since we have multiple tdigests, we need to keep the keys unique + // between the groups, so we add our group start offset. 
+ auto keys = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [delta, + group_cluster_wl = group_cluster_wl.data(), + group_cluster_offsets = group_cluster_offsets->view().begin(), + group_cumulative_weight] __device__(size_type value_index) -> size_type { + auto [group_index, relative_value_index, cumulative_weight] = + group_cumulative_weight(value_index); + + // compute start of cluster weight limits for this group + double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; + auto const num_clusters = + group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index]; + + // local cluster index + size_type const group_cluster_index = + min(num_clusters - 1, + static_cast( + thrust::lower_bound( + thrust::seq, weight_limits, weight_limits + num_clusters, cumulative_weight) - + weight_limits)); + + // add the cluster offset to generate a globally unique key + return group_cluster_index + group_cluster_offsets[group_index]; + }); + + // reduce the centroids down by key. 
+ cudf::mutable_column_view mean_col = + tdigests->child(cudf::detail::tdigest::mean_column_index).mutable_view(); + cudf::mutable_column_view weight_col = + tdigests->child(cudf::detail::tdigest::weight_column_index).mutable_view(); + auto output = thrust::make_zip_iterator(thrust::make_tuple( + mean_col.begin(), weight_col.begin(), thrust::make_discard_iterator())); + auto const num_values = std::distance(centroids_begin, centroids_end); + thrust::reduce_by_key(rmm::exec_policy(stream), + keys, + keys + num_values, // keys + centroids_begin, // values + thrust::make_discard_iterator(), // key output + output, // output + thrust::equal_to{}, // key equality check + merge_centroids{}); + + // create the list + auto const num_groups = group_cluster_offsets->size() - 1; + auto list = cudf::make_lists_column( + num_groups, std::move(group_cluster_offsets), std::move(tdigests), 0, {}); + + // create final tdigest column + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(num_groups, std::move(children), 0, {}, stream, mr); +} + +// retrieve total weight of scalar inputs by group index +struct scalar_total_weight { + size_type const* group_valid_counts; + __device__ double operator()(size_type group_index) { return group_valid_counts[group_index]; } +}; + +// return the min/max value of scalar inputs by group index +template +struct get_scalar_minmax { + column_device_view const col; + device_span group_offsets; + size_type const* group_valid_counts; + + __device__ thrust::tuple operator()(size_type group_index) + { + // note: .element() is taking care of fixed-point conversions for us. 
+ return {static_cast(col.element(group_offsets[group_index])), + static_cast( + col.element(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))}; + } +}; + +struct typed_group_tdigest { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // first, generate cluster weight information for each input group + auto total_weight = cudf::detail::make_counting_transform_iterator( + 0, scalar_total_weight{group_valid_counts.begin()}); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = + generate_group_cluster_info(delta, + num_groups, + nearest_value_scalar_weights{}, + total_weight, + cumulative_scalar_weight{group_offsets, group_labels}, + stream, + mr); + + // device column view. handy because the .element() function + // automatically handles fixed-point conversions for us + auto d_col = cudf::column_device_view::create(col); + + // compute min and max columns + auto min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_groups, + thrust::make_zip_iterator(thrust::make_tuple(min_col->mutable_view().begin(), + max_col->mutable_view().begin())), + get_scalar_minmax{*d_col, group_offsets, group_valid_counts.begin()}); + + // for simple input values, the "centroids" all have a weight of 1. 
+ auto scalar_to_centroid = + cudf::detail::make_counting_transform_iterator(0, make_centroid{*d_col}); + + // generate the final tdigest + return compute_tdigests(delta, + scalar_to_centroid, + scalar_to_centroid + col.size(), + cumulative_scalar_weight{group_offsets, group_labels}, + std::move(min_col), + std::move(max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + CUDF_FAIL("Non-numeric type in group_tdigest"); + } +}; + +} // anonymous namespace + +std::unique_ptr group_tdigest(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + + auto const delta = max_centroids; + return cudf::type_dispatcher(col.type(), + typed_group_tdigest{}, + col, + group_offsets, + group_labels, + group_valid_counts, + num_groups, + delta, + stream, + mr); +} + +std::unique_ptr group_merge_tdigest(column_view const& input, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(input); + + if (num_groups == 0 || input.size() == 0) { + return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); + } + + structs_column_view scv(input); + lists_column_view 
lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + // ideally, we would just call .parent().child() here because tdigests cannot be + // sliced. however, lists_column_view() hides that particular interface. However, + // for the same reason, get_sliced_child() should be just as cheap. + auto data = lcv.get_sliced_child(stream); + structs_column_view tdigest(data); + auto mean = tdigest.child(cudf::detail::tdigest::mean_column_index); + auto weight = tdigest.child(cudf::detail::tdigest::weight_column_index); + + // first step is to merge all the tdigests in each group. at the moment the only way to + // make this work is to retrieve the group sizes (via group_offsets) and the individual digest + // sizes (via input.offsets()) to the gpu and do the merges. The scale problem is that while the + // size of each group will likely be small (size of each group will typically map to # of batches + // the input data was chopped into for tdigest generation), the -number- of groups can be + // arbitrarily large. + // + // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an + // algorithm like a super-merge that takes two layers of keys: one which identifies the outer + // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the + // outer groups. 
+ + // bring group offsets back to the host + std::vector h_outer_offsets(group_offsets.size()); + cudaMemcpyAsync(h_outer_offsets.data(), + group_offsets.data(), + sizeof(size_type) * group_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + // bring tdigest offsets back to the host + auto tdigest_offsets = lcv.offsets(); + std::vector h_inner_offsets(tdigest_offsets.size()); + cudaMemcpyAsync(h_inner_offsets.data(), + tdigest_offsets.begin(), + sizeof(size_type) * tdigest_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + stream.synchronize(); + + // extract all means and weights into a table + cudf::table_view tdigests_unsliced({mean, weight}); + + // generate the merged (but not yet compressed) tdigests for each group. + std::vector> tdigests; + tdigests.reserve(num_groups); + std::transform( + h_outer_offsets.begin(), + h_outer_offsets.end() - 1, + std::next(h_outer_offsets.begin()), + std::back_inserter(tdigests), + [&](auto tdigest_start, auto tdigest_end) { + // the range of tdigests in this group + auto const num_tdigests = tdigest_end - tdigest_start; + + // slice each tdigest from the input + std::vector unmerged_tdigests; + unmerged_tdigests.reserve(num_tdigests); + auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); + std::transform(offset_iter, + offset_iter + num_tdigests, + std::next(offset_iter), + std::back_inserter(unmerged_tdigests), + [&](auto start, auto end) { + return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); + }); + + // merge + return cudf::detail::merge(unmerged_tdigests, {0}, {order::ASCENDING}, {}, stream, mr); + }); + + // generate min and max values + auto min_col = scv.child(cudf::detail::tdigest::min_column_index); + auto merged_min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + min_col.begin(), + 
thrust::make_discard_iterator(), + merged_min_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::minimum{}); + + auto max_col = scv.child(cudf::detail::tdigest::max_column_index); + auto merged_max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + max_col.begin(), + thrust::make_discard_iterator(), + merged_max_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::maximum{}); + + // concatenate all the merged tdigests back into one table. + std::vector tdigest_views; + tdigest_views.reserve(num_groups); + std::transform(tdigests.begin(), + tdigests.end(), + std::back_inserter(tdigest_views), + [](std::unique_ptr
const& t) { return t->view(); }); + auto merged = cudf::detail::concatenate(tdigest_views, stream, mr); + + // generate cumulative weights + auto merged_weights = merged->get_column(cudf::detail::tdigest::weight_column_index).view(); + auto cumulative_weights = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [group_labels = group_labels.begin(), + inner_offsets = tdigest_offsets.begin(), + num_inner_offsets = tdigest_offsets.size()] __device__(int index) { + // what -original- tdigest index this absolute index corresponds to + auto const iter = thrust::prev( + thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); + auto const tdigest_index = thrust::distance(inner_offsets, iter); + + // what group index the original tdigest belongs to + return group_labels[tdigest_index]; + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + cumulative_weights->size(), + merged_weights.begin(), + cumulative_weights->mutable_view().begin()); + + auto const delta = max_centroids; + + // generate cluster info + auto total_group_weight = cudf::detail::make_counting_transform_iterator( + 0, + [outer_offsets = group_offsets.data(), + inner_offsets = tdigest_offsets.begin(), + cumulative_weights = + cumulative_weights->view().begin()] __device__(size_type group_index) { + auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1; + return cumulative_weights[last_weight_index]; + }); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( + delta, + num_groups, + nearest_value_centroid_weights{cumulative_weights->view().begin(), + group_offsets.data(), + tdigest_offsets.begin()}, + total_group_weight, + cumulative_centroid_weight{ + cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), 
static_cast(tdigest_offsets.size())}}, + stream, + mr); + + // input centroid values + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, + make_weighted_centroid{ + merged->get_column(cudf::detail::tdigest::mean_column_index).view().begin(), + merged_weights.begin()}); + + // compute the tdigest + return compute_tdigests(delta, + centroids, + centroids + merged->num_rows(), + cumulative_centroid_weight{cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), + static_cast(tdigest_offsets.size())}}, + std::move(merged_min_col), + std::move(merged_max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 69d68f7b6bc..c4905b86ab9 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -102,6 +104,9 @@ sort_groupby_helper::sort_groupby_helper(table_view const& keys, auto [flattened_keys, _, __, struct_null_vectors] = flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); + auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; + CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), + "Unsupported groupby key type does not support equality comparison"); _struct_null_vectors = std::move(struct_null_vectors); _keys = flattened_keys; diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index eeafd959f87..c9b6c6e9f91 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -146,7 +146,7 @@ __global__ void __launch_bounds__(block_size, 2) if (t == 0) { s->chunk = chunks[group_id][str_col_idx]; - s->chunk.leaf_column = 
&orc_columns[col_idx].cudf_column; + s->chunk.leaf_column = &orc_columns[col_idx]; s->chunk.dict_data = dict_data[str_col_idx].data() + rowgroup_bounds[group_id][col_idx].begin; s->chunk.dict_index = dict_index[str_col_idx].data(); s->chunk.start_row = rowgroup_bounds[group_id][col_idx].begin; diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 77de0b0b286..405bf7c2ecc 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -615,9 +615,13 @@ class metadata { /** * @brief `column_device_view` and additional, ORC specific, information on the column. */ -struct orc_column_device_view { - column_device_view cudf_column; +struct orc_column_device_view : public column_device_view { + __device__ orc_column_device_view(column_device_view col, thrust::optional parent_idx) + : column_device_view{col}, parent_index{parent_idx} + { + } thrust::optional parent_index; + bitmask_type const* pushdown_mask = nullptr; }; /** diff --git a/cpp/src/io/orc/orc_common.h b/cpp/src/io/orc/orc_common.h index ab6788d01f1..eedaa9d4fc2 100644 --- a/cpp/src/io/orc/orc_common.h +++ b/cpp/src/io/orc/orc_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,7 @@ namespace orc { // ORC rows are divided into groups and assigned indexes for faster seeking static constexpr uint32_t default_row_index_stride = 10000; +static constexpr uint32_t BLOCK_HEADER_SIZE = 3; enum CompressionKind : uint8_t { NONE = 0, diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 30687331c15..389895abc83 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -135,6 +135,8 @@ struct RowGroup { struct EncChunk { uint32_t start_row; // start row of this chunk uint32_t num_rows; // number of rows in this chunk + uint32_t null_mask_start_row; // adjusted to multiple of 8 + uint32_t null_mask_num_rows; // adjusted to multiple of 8 ColumnEncodingKind encoding_kind; // column encoding kind TypeKind type_kind; // column data type uint8_t dtype_len; // data type length @@ -142,7 +144,7 @@ struct EncChunk { uint32_t* dict_index; // dictionary index from row index uint32_t* decimal_offsets; - column_device_view const* leaf_column; + orc_column_device_view const* column; }; /** @@ -182,7 +184,7 @@ struct DictionaryChunk { uint32_t num_dict_strings; // number of strings in dictionary uint32_t dict_char_count; // size of dictionary string data for this chunk - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; /** @@ -197,7 +199,7 @@ struct StripeDictionary { uint32_t num_strings; // number of unique strings in the dictionary uint32_t dict_char_count; // total size of dictionary string data - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; constexpr uint32_t encode_block_size = 512; @@ -326,17 +328,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, device_2dspan enc_streams, rmm::cuda_stream_view stream); -/** - * @brief Set leaf column element of EncChunk - * - * @param[in] orc_columns Pre-order 
flattened device array of ORC column views - * @param[in,out] chunks encoder chunk device array [column][rowgroup] - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for compacting chunked column data prior to compression * @@ -355,6 +346,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] num_compressed_blocks Total number of compressed blocks * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[out] comp_in Per-block compression input parameters @@ -365,10 +357,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream); /** @@ -438,6 +431,7 @@ void orc_init_statistics_buffersize(statistics_merge_group* groups, * @param[in,out] groups Statistics merge groups * @param[in,out] chunks Statistics data * @param[in] statistics_count Number of statistics buffers + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void orc_encode_statistics(uint8_t* blob_bfr, statistics_merge_group* groups, @@ -445,6 +439,19 @@ void orc_encode_statistics(uint8_t* blob_bfr, uint32_t statistics_count, rmm::cuda_stream_view stream); +/** + * @brief Number of set bits in pushdown masks, per rowgroup. 
+ * + * @param[in] orc_columns Pre-order flattened device array of ORC column views + * @param[in] rowgroup_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param[out] set_counts Per rowgroup number of set bits + * @param[in] stream CUDA stream used for device memory operations and kernel launches + */ +void reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts, + rmm::cuda_stream_view stream); + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index d50d3898c3b..cc7e22f2042 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,9 @@ #include #include +#include + +#include namespace cudf { namespace io { @@ -262,7 +265,6 @@ static __device__ uint32_t ByteRLE( } } if (!t) { s->strm_pos[cid] = static_cast(dst - s->stream.data_ptrs[cid]); } - __syncthreads(); return out_cnt; } @@ -618,6 +620,100 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, static const __device__ __constant__ int32_t kTimeScale[10] = { 1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; +template +static __device__ void encode_null_mask(orcenc_state_s* s, + bitmask_type const* pushdown_mask, + Storage& scan_storage, + int t) +{ + if (s->stream.ids[CI_PRESENT] < 0) return; + + auto const column = *s->chunk.column; + while (s->present_rows < s->chunk.null_mask_num_rows or s->numvals > 0) { + // Number of rows read so far + auto present_rows = s->present_rows; + // valid_buf capacity is byte per thread in block + auto const buf_available_bits = encode_block_size * 8 - s->numvals; + // Number of rows for the block to process in this iteration + auto const nrows = min(s->chunk.null_mask_num_rows - present_rows, buf_available_bits); + // Number of rows for this thread to process in this iteration + auto const t_nrows = min(max(static_cast(nrows) - t * 8, 0), 8); + 
auto const row = s->chunk.null_mask_start_row + present_rows + t * 8; + + auto get_mask_byte = [&](bitmask_type const* mask, size_type offset) -> uint8_t { + if (t_nrows == 0) return 0; + if (mask == nullptr) return 0xff; + + auto const begin_offset = row + offset; + auto const end_offset = min(begin_offset + 8, offset + column.size()); + auto const mask_word = cudf::detail::get_mask_offset_word(mask, 0, begin_offset, end_offset); + return mask_word & 0xff; + }; + + uint8_t pd_byte = (1 << t_nrows) - 1; + uint32_t pd_set_cnt = t_nrows; + uint32_t offset = t_nrows != 0 ? t * 8 : nrows; + if (pushdown_mask != nullptr) { + pd_byte = get_mask_byte(pushdown_mask, 0) & ((1 << t_nrows) - 1); + pd_set_cnt = __popc(pd_byte); + // Scan the number of valid bits to get dst offset for each thread + cub::BlockScan(scan_storage).ExclusiveSum(pd_set_cnt, offset); + } + + auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); + auto dst_offset = offset + s->nnz; + auto vbuf_bit_idx = [](int row) { + // valid_buf is a circular buffer with validitiy of 8 rows in each element + return row % (encode_block_size * 8); + }; + if (dst_offset % 8 == 0 and pd_set_cnt == 8) { + s->valid_buf[vbuf_bit_idx(dst_offset) / 8] = mask_byte; + } else { + for (auto bit_idx = 0; bit_idx < t_nrows; ++bit_idx) { + // skip bits where pushdown mask is not set + if (not(pd_byte & (1 << bit_idx))) continue; + if (mask_byte & (1 << bit_idx)) { + set_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } else { + clear_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } + } + } + + __syncthreads(); + if (t == block_size - 1) { + // Number of loaded rows, available for encode + s->numvals += offset + pd_set_cnt; + // Number of loaded rows (different from present_rows because of pushdown masks) + s->nnz += offset + pd_set_cnt; + } + present_rows += nrows; + if (!t) { s->present_rows = present_rows; } + __syncthreads(); + + // RLE encode the present stream + if 
(s->numvals > ((present_rows < s->chunk.null_mask_num_rows) ? 130 * 8 : 0)) { + auto const flush = (present_rows < s->chunk.null_mask_num_rows) ? 0 : 7; + auto const nbytes_out = (s->numvals + flush) / 8; + auto const nrows_encoded = + ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; + + if (!t) { + // Number of rows enocoded so far + s->present_out += nrows_encoded; + s->numvals -= min(s->numvals, nrows_encoded); + } + __syncthreads(); + } + } + + // reset shared state + if (t == 0) { + s->nnz = 0; + s->numvals = 0; + } +} + /** * @brief Encode column data * @@ -632,6 +728,7 @@ __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) orcenc_state_s state_g; __shared__ union { + typename cub::BlockScan::TempStorage scan_u32; typename cub::BlockReduce::TempStorage i32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage u32; @@ -643,120 +740,74 @@ __global__ void __launch_bounds__(block_size) uint32_t group_id = blockIdx.y; int t = threadIdx.x; if (t == 0) { - s->chunk = chunks[col_id][group_id]; - s->stream = streams[col_id][group_id]; - } - if (t < CI_NUM_STREAMS) { s->strm_pos[t] = 0; } - __syncthreads(); - if (!t) { - s->cur_row = 0; - s->present_rows = 0; - s->present_out = 0; - s->numvals = 0; - s->numlengths = 0; - s->nnz = 0; + s->chunk = chunks[col_id][group_id]; + s->stream = streams[col_id][group_id]; + s->cur_row = 0; + s->present_rows = 0; + s->present_out = 0; + s->numvals = 0; + s->numlengths = 0; + s->nnz = 0; + s->strm_pos[CI_DATA] = 0; + s->strm_pos[CI_PRESENT] = 0; + s->strm_pos[CI_INDEX] = 0; // Dictionary data is encoded in a separate kernel - if (s->chunk.encoding_kind == DICTIONARY_V2) { - s->strm_pos[CI_DATA2] = s->stream.lengths[CI_DATA2]; - s->strm_pos[CI_DICTIONARY] = s->stream.lengths[CI_DICTIONARY]; - } + s->strm_pos[CI_DATA2] = + s->chunk.encoding_kind == DICTIONARY_V2 ? 
s->stream.lengths[CI_DATA2] : 0; + s->strm_pos[CI_DICTIONARY] = + s->chunk.encoding_kind == DICTIONARY_V2 ? s->stream.lengths[CI_DICTIONARY] : 0; } + __syncthreads(); - auto validity_byte = [&] __device__(int row) -> uint8_t& { - // valid_buf is a circular buffer where validitiy of 8 rows is stored in each element - return s->valid_buf[(row / 8) % encode_block_size]; - }; - - auto validity = [&] __device__(int row) -> uint32_t { - // Check if the specific bit is set in the validity buffer - return (validity_byte(row) >> (row % 8)) & 1; - }; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = s->chunk.column->parent_index; + if (!parent_index.has_value()) return nullptr; + return chunks[parent_index.value()][0].column->pushdown_mask; + }(); + encode_null_mask(s, pushdown_mask, temp_storage.scan_u32, t); __syncthreads(); + + auto const column = *s->chunk.column; while (s->cur_row < s->chunk.num_rows || s->numvals + s->numlengths != 0) { - // Encode valid map - if (s->present_rows < s->chunk.num_rows) { - uint32_t present_rows = s->present_rows; - uint32_t nrows = - min(s->chunk.num_rows - present_rows, - encode_block_size * 8 - (present_rows - (min(s->cur_row, s->present_out) & ~7))); - uint32_t nrows_out; - if (t * 8 < nrows) { - auto const row_in_group = present_rows + t * 8; - auto const row = s->chunk.start_row + row_in_group; - uint8_t valid = 0; - if (row < s->chunk.leaf_column->size()) { - if (s->chunk.leaf_column->nullable()) { - auto const current_valid_offset = row + s->chunk.leaf_column->offset(); - auto const last_offset = - min(current_valid_offset + 8, - s->chunk.leaf_column->offset() + s->chunk.leaf_column->size()); - auto const mask = cudf::detail::get_mask_offset_word( - s->chunk.leaf_column->null_mask(), 0, current_valid_offset, last_offset); - valid = 0xff & mask; - } else { - valid = 0xff; - } - if (row + 7 > s->chunk.leaf_column->size()) { - valid = valid & ((1 << (s->chunk.leaf_column->size() - row)) - 
1); - } - } - validity_byte(row_in_group) = valid; - } - __syncthreads(); - present_rows += nrows; - if (!t) { s->present_rows = present_rows; } - // RLE encode the present stream - nrows_out = present_rows - s->present_out; // Should always be a multiple of 8 except at - // the end of the last row group - if (nrows_out > ((present_rows < s->chunk.num_rows) ? 130 * 8 : 0)) { - uint32_t present_out = s->present_out; - if (s->stream.ids[CI_PRESENT] >= 0) { - uint32_t flush = (present_rows < s->chunk.num_rows) ? 0 : 7; - nrows_out = (nrows_out + flush) >> 3; - nrows_out = - ByteRLE(s, s->valid_buf, present_out >> 3, nrows_out, flush, t) * 8; - } - __syncthreads(); - if (!t) { s->present_out = min(present_out + nrows_out, present_rows); } - } - __syncthreads(); - } // Fetch non-null values if (s->chunk.type_kind != LIST && !s->stream.data_ptrs[CI_DATA]) { // Pass-through __syncthreads(); if (!t) { - s->cur_row = s->present_rows; - s->strm_pos[CI_DATA] = s->cur_row * s->chunk.dtype_len; + s->cur_row = s->chunk.num_rows; + s->strm_pos[CI_DATA] = s->chunk.num_rows * s->chunk.dtype_len; } - __syncthreads(); - } else if (s->cur_row < s->present_rows) { + } else if (s->cur_row < s->chunk.num_rows) { uint32_t maxnumvals = (s->chunk.type_kind == BOOLEAN) ? 2048 : 1024; uint32_t nrows = - min(min(s->present_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), + min(min(s->chunk.num_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), encode_block_size); - auto const row_in_group = s->cur_row + t; - uint32_t const valid = (t < nrows) ? validity(row_in_group) : 0; - s->buf.u32[t] = valid; + auto const row = s->chunk.start_row + s->cur_row + t; + + auto const is_value_valid = [&]() { + if (t >= nrows) return false; + return bit_value_or(pushdown_mask, column.offset() + row, true) and + bit_value_or(column.null_mask(), column.offset() + row, true); + }(); + s->buf.u32[t] = is_value_valid ? 
1u : 0u; // TODO: Could use a faster reduction relying on _popc() for the initial phase lengths_to_positions(s->buf.u32, encode_block_size, t); __syncthreads(); - auto const row = s->chunk.start_row + row_in_group; - if (valid) { + if (is_value_valid) { int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); switch (s->chunk.type_kind) { case INT: case DATE: - case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case FLOAT: s->vals.u32[nz_idx] = column.element(row); break; case DOUBLE: - case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element(row); break; - case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case LONG: s->vals.u64[nz_idx] = column.element(row); break; + case SHORT: s->vals.u32[nz_idx] = column.element(row); break; case BOOLEAN: - case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element(row); break; + case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { - int64_t ts = s->chunk.leaf_column->element(row); + int64_t ts = column.element(row); int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); @@ -793,7 +844,7 @@ __global__ void __launch_bounds__(block_size) } s->vals.u32[nz_idx] = dict_idx; } else { - string_view value = s->chunk.leaf_column->element(row); + string_view value = column.element(row); s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data(); s->lengths.u32[nz_idx] = value.size_bytes(); } @@ -802,11 +853,10 @@ __global__ void __launch_bounds__(block_size) // Note: can be written in a faster manner, given that all values are equal case DECIMAL: s->lengths.u32[nz_idx] = zigzag(s->chunk.scale); break; case LIST: { - auto const& offsets = - s->chunk.leaf_column->child(lists_column_view::offsets_column_index); + auto const& offsets = column.child(lists_column_view::offsets_column_index); // Compute list length from the offsets - s->lengths.u32[nz_idx] = - offsets.element(row 
+ 1) - offsets.element(row); + s->lengths.u32[nz_idx] = offsets.element(row + 1 + column.offset()) - + offsets.element(row + column.offset()); } break; default: break; } @@ -894,10 +944,10 @@ __global__ void __launch_bounds__(block_size) } break; case DECIMAL: { - if (valid) { - uint64_t const zz_val = (s->chunk.leaf_column->type().id() == type_id::DECIMAL32) - ? zigzag(s->chunk.leaf_column->element(row)) - : zigzag(s->chunk.leaf_column->element(row)); + if (is_value_valid) { + uint64_t const zz_val = (column.type().id() == type_id::DECIMAL32) + ? zigzag(column.element(row)) + : zigzag(column.element(row)); auto const offset = (row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1]; StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val); @@ -939,8 +989,8 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.leaf_column->head())) + - (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; + static_cast(const_cast(column.head())) + + (column.offset() + s->chunk.start_row) * s->chunk.dtype_len; } } } @@ -1030,16 +1080,6 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; } } -__global__ void __launch_bounds__(512) - gpu_set_chunk_columns(device_span orc_columns, - device_2dspan chunks) -{ - // Set leaf_column member of EncChunk - for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) { - chunks[blockIdx.x][i].leaf_column = &orc_columns[blockIdx.x].cudf_column; - } -} - /** * @brief Merge chunked column data into a single contiguous stream * @@ -1102,15 +1142,17 @@ __global__ void __launch_bounds__(1024) * @param[out] comp_out Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any 
block after compression */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; @@ -1135,8 +1177,8 @@ __global__ void __launch_bounds__(256) uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); blk_in->srcDevice = src + b * comp_blk_size; blk_in->srcSize = blk_size; - blk_in->dstDevice = dst + b * (3 + comp_blk_size) + 3; // reserve 3 bytes for block header - blk_in->dstSize = blk_size; + blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE; + blk_in->dstSize = max_comp_blk_size; blk_out->bytes_written = blk_size; blk_out->status = 1; blk_out->reserved = 0; @@ -1153,14 +1195,16 @@ __global__ void __launch_bounds__(256) * @param[in] comp_out Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ const uint8_t* volatile comp_src_g; @@ -1248,16 +1292,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, <<>>(stripes, chunks, enc_streams); } -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream) -{ - 
dim3 dim_block(512, 1); - dim3 dim_grid(chunks.size().first, 1); - - gpu_set_chunk_columns<<>>(orc_columns, chunks); -} - void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) @@ -1271,20 +1305,83 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size); - if (compression == SNAPPY) { gpu_snap(comp_in, comp_out, num_compressed_blocks, stream); } + strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + if (compression == SNAPPY) { + auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); + bool use_nvcomp = env_use_nvcomp != nullptr ? 
std::atoi(env_use_nvcomp) : 0; + if (use_nvcomp) { + try { + size_t temp_size; + nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( + num_compressed_blocks, comp_blk_size, nvcompBatchedSnappyDefaultOpts, &temp_size); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Error in getting snappy compression scratch size"); + + rmm::device_buffer scratch(temp_size, stream); + rmm::device_uvector uncompressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector uncompressed_data_sizes(num_compressed_blocks, stream); + rmm::device_uvector compressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); + + auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), + uncompressed_data_sizes.begin(), + compressed_data_ptrs.begin()); + thrust::transform(rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(gpu_inflate_input_s in) { + return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); + }); + nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), + uncompressed_data_sizes.data(), + max_comp_blk_size, + num_compressed_blocks, + scratch.data(), + scratch.size(), + compressed_data_ptrs.data(), + compressed_bytes_written.data(), + nvcompBatchedSnappyDefaultOpts, + stream.value()); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in snappy compression"); + + thrust::transform(rmm::exec_policy(stream), + compressed_bytes_written.begin(), + compressed_bytes_written.end(), + comp_out.begin(), + [] __device__(size_t size) { + gpu_inflate_status_s status{}; + status.bytes_written = size; + return status; + }); + } catch (...) 
{ + // If we reach this then there was an error in compressing so set an error status for each + // block + thrust::for_each(rmm::exec_policy(stream), + comp_out.begin(), + comp_out.end(), + [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + }; + + } else { + gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + } + } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size); + strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 94d8de6561b..be561530459 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include +#include #include namespace cudf { @@ -52,13 +53,13 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t max_uncompressed_block_size = 0; uint32_t num_compressed_blocks = 0; uint32_t num_uncompressed_blocks = 0; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? 
cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; gpu_inflate_input_s* init_ctl = nullptr; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { // Fatal num_compressed_blocks = 0; @@ -145,12 +146,12 @@ extern "C" __global__ void __launch_bounds__(128, 8) uint32_t num_compressed_blocks = 0; uint32_t max_compressed_blocks = s->info.num_compressed_blocks; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size_est, uncompressed_size_actual; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (cur + block_len > end) { break; } if (is_uncompressed) { uncompressed_size_est = block_len; @@ -367,9 +368,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, for (;;) { uint32_t block_len, is_uncompressed; - if (cur + 3 > end || cur + 3 >= start + compressed_offset) { break; } + if (cur + BLOCK_HEADER_SIZE > end || cur + BLOCK_HEADER_SIZE >= start + compressed_offset) { + break; + } block_len = cur[0] | (cur[1] << 8) | (cur[2] << 16); - cur += 3; + cur += BLOCK_HEADER_SIZE; is_uncompressed = block_len & 1; block_len >>= 1; cur += block_len; @@ -471,6 +474,45 @@ extern "C" __global__ void __launch_bounds__(128, 8) } } +template +__global__ void __launch_bounds__(block_size) + gpu_reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + auto const column_id = blockIdx.x; + auto const rowgroup_id = blockIdx.y; + auto const column = orc_columns[column_id]; + auto const t = threadIdx.x; + + auto const use_child_rg = column.type().id() == type_id::LIST; + auto const rg = 
rowgroup_bounds[rowgroup_id][column_id + (use_child_rg ? 1 : 0)]; + + if (column.pushdown_mask == nullptr) { + // All elements are valid if the null mask is not present + if (t == 0) { set_counts[rowgroup_id][column_id] = rg.size(); } + return; + }; + + size_type count = 0; + static constexpr size_type bits_per_word = sizeof(bitmask_type) * 8; + for (auto row = t * bits_per_word + rg.begin; row < rg.end; row += block_size * bits_per_word) { + auto const begin_bit = row; + auto const end_bit = min(static_cast(row + bits_per_word), rg.end); + auto const mask_len = end_bit - begin_bit; + auto const mask_word = + cudf::detail::get_mask_offset_word(column.pushdown_mask, 0, row, end_bit) & + ((1 << mask_len) - 1); + count += __popc(mask_word); + } + + count = BlockReduce(temp_storage).Sum(count); + if (t == 0) { set_counts[rowgroup_id][column_id] = count; } +} + void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, @@ -493,19 +535,6 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, num_streams); } -/** - * @brief Launches kernel for constructing rowgroup from index streams - * - * @param[out] row_groups RowGroup device array [rowgroup][column] - * @param[in] strm_info List of compressed streams (or NULL if uncompressed) - * @param[in] chunks ColumnDesc device array [stripe][column] - * @param[in] num_columns Number of columns - * @param[in] num_stripes Number of stripes - * @param[in] num_rowgroups Number of row groups - * @param[in] rowidx_stride Row index stride - * @param[in] use_base_stride Whether to use base stride obtained from meta or the computed value - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ void __host__ ParseRowGroupIndex(RowGroup* row_groups, CompressedStreamInfo* strm_info, ColumnDesc* chunks, @@ -528,6 +557,17 @@ void __host__ ParseRowGroupIndex(RowGroup* row_groups, use_base_stride); } +void __host__ 
reduce_pushdown_masks(device_span columns, + device_2dspan rowgroups, + device_2dspan valid_counts, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(128, 1); + dim3 dim_grid(columns.size(), rowgroups.size().first); // 1 rowgroup per block + gpu_reduce_pushdown_masks<128> + <<>>(columns, rowgroups, valid_counts); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e0018ed7166..299c8fbb730 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -36,6 +36,8 @@ #include #include +#include + #include #include #include @@ -97,6 +99,7 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL; case cudf::type_id::LIST: return TypeKind::LIST; + case cudf::type_id::STRUCT: return TypeKind::STRUCT; default: return TypeKind::INVALID_TYPE_KIND; } } @@ -140,30 +143,30 @@ class orc_column_view { */ explicit orc_column_view(uint32_t index, int str_idx, - int index_in_table, + orc_column_view* parent, column_view const& col, - const table_metadata* metadata) + column_in_metadata const& metadata) : cudf_column{col}, _index{index}, _str_idx{str_idx}, - _is_child{index_in_table < 0}, + _is_child{parent != nullptr}, _type_width{cudf::is_fixed_width(col.type()) ? cudf::size_of(col.type()) : 0}, _scale{(to_orc_type(col.type().id()) == TypeKind::DECIMAL) ? -col.type().scale() : to_clockscale(col.type().id())}, - _precision{orc_precision(col.type().id())}, - _type_kind{to_orc_type(col.type().id())} + _precision{metadata.is_decimal_precision_set() ? 
metadata.get_decimal_precision() + : orc_precision(col.type().id())}, + _type_kind{to_orc_type(col.type().id())}, + name{metadata.get_name()} { - // Don't assign names to child columns - if (index_in_table >= 0) { - if (metadata != nullptr && index_in_table < static_cast(metadata->column_names.size())) { - _name = metadata->column_names[index_in_table]; - } else { - // Generating default name if name isn't present in metadata - _name = "_col" + std::to_string(index_in_table); - } + if (metadata.is_nullability_defined()) { nullable_from_metadata = metadata.nullable(); } + if (parent != nullptr) { + parent->add_child(_index); + _parent_index = parent->index(); } } + void add_child(uint32_t child_idx) { children.emplace_back(child_idx); } + auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } auto dict_stride() const noexcept { return _dict_stride; } @@ -204,15 +207,22 @@ class orc_column_view { auto device_stripe_dict() const noexcept { return d_stripe_dict; } // Index in the table - auto index() const noexcept { return _index; } + uint32_t index() const noexcept { return _index; } // Id in the ORC file auto id() const noexcept { return _index + 1; } + auto is_child() const noexcept { return _is_child; } + auto parent_index() const noexcept { return _parent_index.value(); } + auto child_begin() const noexcept { return children.cbegin(); } + auto child_end() const noexcept { return children.cend(); } + auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } + auto null_count() const noexcept { return cudf_column.null_count(); } auto null_mask() const noexcept { return cudf_column.null_mask(); } bool nullable() const noexcept { return null_mask() != nullptr; } + auto user_defined_nullable() const noexcept { return nullable_from_metadata; } auto scale() const noexcept { return _scale; } auto precision() 
const noexcept { return _precision; } @@ -220,7 +230,7 @@ class orc_column_view { void set_orc_encoding(ColumnEncodingKind e) noexcept { _encoding_kind = e; } auto orc_kind() const noexcept { return _type_kind; } auto orc_encoding() const noexcept { return _encoding_kind; } - auto orc_name() const noexcept { return _name; } + std::string_view orc_name() const noexcept { return name; } private: column_view cudf_column; @@ -236,9 +246,9 @@ class orc_column_view { int32_t _precision = 0; // ORC-related members - std::string _name{}; - TypeKind _type_kind; - ColumnEncodingKind _encoding_kind; + TypeKind _type_kind = INVALID_TYPE_KIND; + ColumnEncodingKind _encoding_kind = INVALID_ENCODING_KIND; + std::string name; // String dictionary-related members size_t _dict_stride = 0; @@ -250,6 +260,10 @@ class orc_column_view { // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. uint32_t* d_decimal_offsets = nullptr; + + std::optional nullable_from_metadata; + std::vector children; + std::optional _parent_index; }; size_type orc_table_view::num_rows() const noexcept @@ -474,11 +488,13 @@ orc_streams writer::impl::create_streams(host_span columns, if (single_write_mode) { return column.nullable(); } else { - if (user_metadata_with_nullability.column_nullable.empty()) return true; - CUDF_EXPECTS(user_metadata_with_nullability.column_nullable.size() > column.index(), - "When passing values in user_metadata_with_nullability, data for all columns " - "must be specified"); - return user_metadata_with_nullability.column_nullable[column.index()]; + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + auto const chunked_nullable = column.user_defined_nullable().value_or(true); + CUDF_EXPECTS(chunked_nullable or !column.nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. 
" + "Metadata for nullable input column cannot prescribe nullability = false"); + return chunked_nullable; } }(); @@ -592,6 +608,9 @@ orc_streams writer::impl::create_streams(host_span columns, add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; + case TypeKind::STRUCT: + // Only has the present stream + break; default: CUDF_FAIL("Unsupported ORC type kind"); } } @@ -639,16 +658,161 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; } +std::vector> calculate_aligned_rowgroup_bounds( + orc_table_view const& orc_table, + file_segmentation const& segmentation, + rmm::cuda_stream_view stream) +{ + if (segmentation.num_rowgroups() == 0) return {}; + + auto d_pd_set_counts_data = rmm::device_uvector( + orc_table.num_columns() * segmentation.num_rowgroups(), stream); + auto const d_pd_set_counts = device_2dspan{ + d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + + auto aligned_rgs = hostdevice_2dvector( + segmentation.num_rowgroups(), orc_table.num_columns(), stream); + CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(), + segmentation.rowgroups.base_device_ptr(), + aligned_rgs.count() * sizeof(rowgroup_rows), + cudaMemcpyDefault, + stream.value())); + auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream); + + // One thread per column, per stripe + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + orc_table.num_columns() * segmentation.num_stripes(), + [columns = device_span{orc_table.d_columns}, + stripes = device_span{d_stripes}, + d_pd_set_counts, + out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + uint32_t const col_idx = idx / stripes.size(); + // No alignment needed for root columns + if (not 
columns[col_idx].parent_index.has_value()) return; + + auto const stripe_idx = idx % stripes.size(); + auto const stripe = stripes[stripe_idx]; + auto const parent_col_idx = columns[col_idx].parent_index.value(); + auto const parent_column = columns[parent_col_idx]; + auto const stripe_end = stripe.first + stripe.size; + + auto seek_last_borrow_rg = [&](auto rg_idx, size_type& bits_to_borrow) { + auto curr = rg_idx + 1; + auto curr_rg_size = [&]() { + return parent_column.pushdown_mask != nullptr ? d_pd_set_counts[curr][parent_col_idx] + : out_rowgroups[curr][col_idx].size(); + }; + while (curr < stripe_end and curr_rg_size() <= bits_to_borrow) { + // All bits from rowgroup borrowed, make the rowgroup empty + out_rowgroups[curr][col_idx].begin = out_rowgroups[curr][col_idx].end; + bits_to_borrow -= curr_rg_size(); + ++curr; + } + return curr; + }; + + int previously_borrowed = 0; + for (auto rg_idx = stripe.first; rg_idx + 1 < stripe_end; ++rg_idx) { + auto& rg = out_rowgroups[rg_idx][col_idx]; + + if (parent_column.pushdown_mask == nullptr) { + // No pushdown mask, all null mask bits will be encoded + // Align on rowgroup size (can be misaligned for list children) + if (rg.size() % 8) { + auto bits_to_borrow = 8 - rg.size() % 8; + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + last_borrow_rg.begin += bits_to_borrow; + rg.end = last_borrow_rg.begin; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } else { + // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded + // Use the number of set bits in pushdown mask as size + auto bits_to_borrow = + 8 - (d_pd_set_counts[rg_idx][parent_col_idx] 
- previously_borrowed) % 8; + if (bits_to_borrow == 0) { + // Didn't borrow any bits for this rowgroup + previously_borrowed = 0; + continue; + } + + // Find rowgroup in which we finish the search for missing bits + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + // First row that does not need to be borrowed + auto borrow_end = last_borrow_rg.begin; + + // Adjust the number of bits to borrow in the next iteration + previously_borrowed = bits_to_borrow; + + // Find word in which we finish the search for missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + auto const mask = cudf::detail::get_mask_offset_word( + parent_column.pushdown_mask, 0, borrow_end, borrow_end + 32); + auto const valid_in_word = __popc(mask); + + if (valid_in_word > bits_to_borrow) break; + bits_to_borrow -= valid_in_word; + borrow_end += 32; + } + + // Find the last of the missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + if (bit_is_set(parent_column.pushdown_mask, borrow_end)) { --bits_to_borrow; }; + ++borrow_end; + } + + last_borrow_rg.begin = borrow_end; + rg.end = borrow_end; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } + }); + + aligned_rgs.device_to_host(stream, true); + + std::vector> h_aligned_rgs; + h_aligned_rgs.reserve(segmentation.num_rowgroups()); + std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(segmentation.num_rowgroups()), + std::back_inserter(h_aligned_rgs), + [&](auto idx) -> std::vector { + return {aligned_rgs[idx].begin(), aligned_rgs[idx].end()}; + }); + + return h_aligned_rgs; +} + struct segmented_valid_cnt_input { bitmask_type 
const* mask; std::vector indices; }; -encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams) +encoded_data encode_columns(orc_table_view const& orc_table, + string_dictionaries&& dictionaries, + encoder_decimal_info&& dec_chunk_sizes, + file_segmentation const& segmentation, + orc_streams const& streams, + rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); @@ -656,19 +820,22 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, streams.compute_offsets(orc_table.columns, segmentation.num_rowgroups()); rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); + auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); + // Initialize column chunks' descriptions std::map validity_check_inputs; for (auto const& column : orc_table.columns) { for (auto const& stripe : segmentation.stripes) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { - auto const rg_idx = *rg_idx_it; - auto& ck = chunks[column.index()][rg_idx]; - - ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; - ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); - ck.encoding_kind = column.orc_encoding(); - ck.type_kind = column.orc_kind(); + auto const rg_idx = *rg_idx_it; + auto& ck = chunks[column.index()][rg_idx]; + ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; + ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); + ck.null_mask_start_row = aligned_rowgroups[rg_idx][column.index()].begin; + ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); + ck.encoding_kind = column.orc_encoding(); + ck.type_kind = column.orc_kind(); if 
(ck.type_kind == TypeKind::STRING) { ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) ? column.host_stripe_dict(stripe.id)->dict_index @@ -682,6 +849,19 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } + chunks.host_to_device(stream); + // TODO (future): pass columns separately from chunks (to skip this step) + // and remove info from chunks that is common for the entire column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + chunks.count(), + [chunks = device_2dspan{chunks}, + cols = device_span{orc_table.d_columns}] __device__(auto& idx) { + auto const col_idx = idx / chunks.size().second; + auto const rg_idx = idx % chunks.size().second; + chunks[col_idx][rg_idx].column = &cols[col_idx]; + }); auto validity_check_indices = [&](size_t col_idx) { std::vector indices; @@ -787,12 +967,8 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } - - chunks.host_to_device(stream); chunk_streams.host_to_device(stream); - gpu::set_chunk_columns(orc_table.d_columns, chunks, stream); - if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); gpu::EncodeStripeDictionaries(d_stripe_dict, @@ -854,11 +1030,10 @@ void set_stat_desc_leaf_cols(device_span columns, device_span stat_desc, rmm::cuda_stream_view stream) { - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(stat_desc.size()), - [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx].cudf_column; }); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(stat_desc.size()), + [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx]; }); } std::vector> writer::impl::gather_statistic_blobs( @@ -999,10 +1174,10 @@ void writer::impl::write_index_stream(int32_t stripe_id, record.pos += stream.lengths[type]; 
while ((record.pos >= 0) && (record.blk_pos >= 0) && (static_cast(record.pos) >= compression_blocksize_) && - (record.comp_pos + 3 + comp_out[record.blk_pos].bytes_written < + (record.comp_pos + BLOCK_HEADER_SIZE + comp_out[record.blk_pos].bytes_written < static_cast(record.comp_size))) { record.pos -= compression_blocksize_; - record.comp_pos += 3 + comp_out[record.blk_pos].bytes_written; + record.comp_pos += BLOCK_HEADER_SIZE + comp_out[record.blk_pos].bytes_written; record.blk_pos += 1; } } @@ -1099,14 +1274,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - user_metadata(options.get_metadata()), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } init_state(); } @@ -1115,18 +1292,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { - if (options.get_metadata() != nullptr) { - user_metadata_with_nullability = *options.get_metadata(); - user_metadata = &user_metadata_with_nullability; + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } @@ -1138,6 +1313,113 @@ void writer::impl::init_state() 
out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } +void pushdown_lists_null_mask(orc_column_view const& col, + device_span d_columns, + bitmask_type const* parent_pd_mask, + device_span out_mask, + rmm::cuda_stream_view stream) +{ + // Set all bits - correct unless there's a mismatch between offsets and null mask + CUDA_TRY(cudaMemsetAsync(static_cast(out_mask.data()), + 255, + out_mask.size() * sizeof(bitmask_type), + stream.value())); + + // Reset bits where a null list element has rows in the child column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0u), + col.size(), + [d_columns, col_idx = col.index(), parent_pd_mask, out_mask] __device__(auto& idx) { + auto const d_col = d_columns[col_idx]; + auto const is_row_valid = d_col.is_valid(idx) and bit_value_or(parent_pd_mask, idx, true); + if (not is_row_valid) { + auto offsets = d_col.child(lists_column_view::offsets_column_index); + auto const child_rows_begin = offsets.element(idx + d_col.offset()); + auto const child_rows_end = offsets.element(idx + 1 + d_col.offset()); + for (auto child_row = child_rows_begin; child_row < child_rows_end; ++child_row) + clear_bit(out_mask.data(), child_row); + } + }); +} + +/** + * @brief All pushdown masks in a table. + * + * Pushdown masks are applied to child column(s). Only bits of the child column null mask that + * correspond to set pushdown mask bits are encoded into the output file. Similarly, rows where + * pushdown mask is 0 are treated as invalid and not included in the output. + */ +struct pushdown_null_masks { + // Owning vector for masks in device memory + std::vector> data; + // Pointers to pushdown masks in device memory. Can be same for multiple columns. 
+ std::vector masks; +}; + +pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, + rmm::cuda_stream_view stream) +{ + std::vector mask_ptrs; + mask_ptrs.reserve(orc_table.num_columns()); + std::vector> pd_masks; + for (auto const& col : orc_table.columns) { + // Leaf columns don't need pushdown masks + if (col.orc_kind() != LIST && col.orc_kind() != STRUCT) { + mask_ptrs.emplace_back(nullptr); + continue; + } + auto const parent_pd_mask = col.is_child() ? mask_ptrs[col.parent_index()] : nullptr; + auto const null_mask = col.null_mask(); + + if (null_mask == nullptr and parent_pd_mask == nullptr) { + mask_ptrs.emplace_back(nullptr); + continue; + } + if (col.orc_kind() == STRUCT) { + if (null_mask != nullptr and parent_pd_mask == nullptr) { + // Reuse own null mask + mask_ptrs.emplace_back(null_mask); + } else if (null_mask == nullptr and parent_pd_mask != nullptr) { + // Reuse parent's pushdown mask + mask_ptrs.emplace_back(parent_pd_mask); + } else { + // Both are nullable, allocate new pushdown mask + pd_masks.emplace_back(num_bitmask_words(col.size()), stream); + mask_ptrs.emplace_back(pd_masks.back().data()); + + thrust::transform(rmm::exec_policy(stream), + null_mask, + null_mask + pd_masks.back().size(), + parent_pd_mask, + pd_masks.back().data(), + thrust::bit_and()); + } + } + if (col.orc_kind() == LIST) { + // Need a new pushdown mask unless both the parent and current colmn are not nullable + auto const child_col = orc_table.column(col.child_begin()[0]); + // pushdown mask applies to child column; use the child column size + pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); + mask_ptrs.emplace_back(pd_masks.back().data()); + pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); + } + } + + // Attach null masks to device column views (async) + auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(mask_ptrs, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + 
thrust::make_counting_iterator(0ul), + orc_table.num_columns(), + [cols = device_span{orc_table.d_columns}, + ptrs = device_span{d_mask_ptrs}] __device__(auto& idx) { + cols[idx].pushdown_mask = ptrs[idx]; + }); + + return {std::move(pd_masks), std::move(mask_ptrs)}; +} + template struct device_stack { __device__ device_stack(T* stack_storage, int capacity) @@ -1164,28 +1446,35 @@ struct device_stack { orc_table_view make_orc_table_view(table_view const& table, table_device_view const& d_table, - table_metadata const* user_metadata, + table_input_metadata const& table_meta, rmm::cuda_stream_view stream) { std::vector orc_columns; std::vector str_col_indexes; - std::function append_orc_column = [&](column_view const& col, - int index_in_table) { - int const str_idx = - (col.type().id() == type_id::STRING) ? static_cast(str_col_indexes.size()) : -1; - auto const& new_col = - orc_columns.emplace_back(orc_columns.size(), str_idx, index_in_table, col, user_metadata); - if (new_col.is_string()) { str_col_indexes.push_back(new_col.index()); } - if (col.type().id() == type_id::LIST) - append_orc_column(col.child(lists_column_view::child_column_index), -1); - if (col.type().id() == type_id::STRUCT) - for (auto child = col.child_begin(); child != col.child_end(); ++child) - append_orc_column(*child, -1); - }; + std::function + append_orc_column = + [&](column_view const& col, orc_column_view* parent_col, column_in_metadata const& col_meta) { + int const str_idx = + (col.type().id() == type_id::STRING) ? 
static_cast(str_col_indexes.size()) : -1; + + auto const new_col_idx = orc_columns.size(); + orc_columns.emplace_back(new_col_idx, str_idx, parent_col, col, col_meta); + if (orc_columns[new_col_idx].is_string()) { str_col_indexes.push_back(new_col_idx); } + + if (col.type().id() == type_id::LIST) { + append_orc_column(col.child(lists_column_view::child_column_index), + &orc_columns[new_col_idx], + col_meta.child(lists_column_view::child_column_index)); + } else if (col.type().id() == type_id::STRUCT) { + for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) + append_orc_column( + col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx)); + } + }; for (auto col_idx = 0; col_idx < table.num_columns(); ++col_idx) { - append_orc_column(table.column(col_idx), col_idx); + append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } rmm::device_uvector d_orc_columns(orc_columns.size(), stream); @@ -1254,19 +1543,24 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons // Root column if (!col.parent_index.has_value()) { size_type const rows_begin = rg_idx * rowgroup_size; - auto const rows_end = - thrust::min((rg_idx + 1) * rowgroup_size, col.cudf_column.size()); + auto const rows_end = thrust::min((rg_idx + 1) * rowgroup_size, col.size()); return rowgroup_rows{rows_begin, rows_end}; } else { // Child column - auto const parent_index = *col.parent_index; - column_device_view parent_col = cols[parent_index].cudf_column; - if (parent_col.type().id() != type_id::LIST) return rg_bounds[rg_idx][parent_index]; - - auto parent_offsets = parent_col.child(lists_column_view::offsets_column_index); - auto const& parent_rowgroup_rows = rg_bounds[rg_idx][parent_index]; - auto const rows_begin = parent_offsets.element(parent_rowgroup_rows.begin); - auto const rows_end = parent_offsets.element(parent_rowgroup_rows.end); + auto const parent_index = *col.parent_index; + orc_column_device_view parent_col = 
cols[parent_index]; + auto const parent_rg = rg_bounds[rg_idx][parent_index]; + if (parent_col.type().id() != type_id::LIST) { + auto const offset_diff = parent_col.offset() - col.offset(); + return rowgroup_rows{parent_rg.begin + offset_diff, parent_rg.end + offset_diff}; + } + + auto offsets = parent_col.child(lists_column_view::offsets_column_index); + auto const rows_begin = + offsets.element(parent_rg.begin + parent_col.offset()) - col.offset(); + auto const rows_end = + offsets.element(parent_rg.end + parent_col.offset()) - col.offset(); + return rowgroup_rows{rows_begin, rows_end}; } }); @@ -1293,8 +1587,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, col_idx = orc_col.index()] __device__(auto idx) { - auto const& col = d_cols[col_idx].cudf_column; - if (col.is_null(idx)) return 0u; + auto const& col = d_cols[col_idx]; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = d_cols[col_idx].parent_index; + if (!parent_index.has_value()) return nullptr; + return d_cols[parent_index.value()].pushdown_mask; + }(); + if (col.is_null(idx) or not bit_value_or(pushdown_mask, idx, true)) + return 0u; int64_t const element = (col.type().id() == type_id::DECIMAL32) ? col.element(idx) : col.element(idx); @@ -1416,9 +1716,25 @@ void writer::impl::write(table_view const& table) CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); auto const num_rows = table.num_rows(); + if (not table_meta) { table_meta = std::make_unique(table); } + + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "." 
+ std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } + auto const d_table = table_device_view::create(table, stream); - auto orc_table = make_orc_table_view(table, *d_table, user_metadata, stream); + auto orc_table = make_orc_table_view(table, *d_table, *table_meta, stream); + + auto const pd_masks = init_pushdown_null_masks(orc_table, stream); auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride_, stream); @@ -1456,7 +1772,7 @@ void writer::impl::write(table_view const& table) auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns( - orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams); + orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams, stream); // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); @@ -1472,29 +1788,31 @@ void writer::impl::write(table_view const& table) } // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - auto stream_output = [&]() { + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { size_t max_stream_size = 0; bool all_device_write = true; - for (size_t stripe_id = 0; stripe_id < segmentation.num_stripes(); stripe_id++) { - for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) - gpu::StripeStream* ss = &strm_descs[stripe_id][i]; - if (!out_sink_->is_device_write_preferred(ss->stream_size)) { 
all_device_write = false; } - size_t stream_size = ss->stream_size; - if (compression_kind_ != NONE) { - ss->first_block = num_compressed_blocks; - ss->bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * 3; - num_compressed_blocks += num_blocks; - compressed_bfr_size += stream_size; - } - max_stream_size = std::max(max_stream_size, stream_size); + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; } + max_stream_size = std::max(max_stream_size, stream_size); } if (all_device_write) { @@ -1519,10 +1837,11 @@ void writer::impl::write(table_view const& table) num_compressed_blocks, compression_kind_, compression_blocksize_, + max_compressed_block_size, strm_descs, enc_data.streams, - comp_in.device_ptr(), - comp_out.device_ptr(), + comp_in, + comp_out, stream); strm_descs.device_to_host(stream); comp_out.device_to_host(stream, true); @@ -1641,6 +1960,18 @@ void writer::impl::write(table_view const& table) } // In preorder traversal the column after a list column is always the child column if (column.orc_kind() == LIST) { schema_type.subtypes.emplace_back(column.id() + 1); } + if (column.orc_kind() == STRUCT) { + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.subtypes), + [&](auto const& child_idx) { return orc_table.column(child_idx).id(); }); + std::transform(column.child_begin(), + 
column.child_end(), + std::back_inserter(schema_type.fieldNames), + [&](auto const& child_idx) { + return std::string{orc_table.column(child_idx).orc_name()}; + }); + } } } else { // verify the user isn't passing mismatched tables @@ -1666,11 +1997,13 @@ void writer::impl::close() PostScript ps; ff.contentLength = out_sink_->bytes_written(); - if (user_metadata) { - for (auto it = user_metadata->user_data.begin(); it != user_metadata->user_data.end(); it++) { - ff.metadata.push_back({it->first, it->second}); - } - } + std::transform(table_meta->user_data.begin(), + table_meta->user_data.end(), + std::back_inserter(ff.metadata), + [&](auto const& udata) { + return UserMetadataItem{udata.first, udata.second}; + }); + // Write statistics metadata if (md.stripeStats.size() != 0) { buffer_.resize((compression_kind_ != NONE) ? 3 : 0); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 787bdeb3a4e..a8fe22a360f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -262,23 +262,6 @@ class writer::impl { file_segmentation const& segmentation, std::map const& decimal_column_sizes); - /** - * @brief Encodes the input columns into streams. - * - * @param orc_table Non-owning view of a cuDF table w/ ORC-related info - * @param dict_data Dictionary data memory - * @param dict_index Dictionary index memory - * @param dec_chunk_sizes Information about size of encoded decimal columns - * @param segmentation stripe and rowgroup ranges - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Encoded data and per-chunk stream descriptors - */ - encoded_data encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams); - /** * @brief Returns stripe information after compacting columns' individual data * chunks into contiguous data streams. 
@@ -375,14 +358,11 @@ class writer::impl { cudf::io::orc::Metadata md; // current write position for rowgroups/chunks size_t current_chunk_offset; - // optional user metadata - table_metadata const* user_metadata = nullptr; - // only used in the write_chunked() case. copied from the (optionally) user supplied - // argument to write_chunked_begin() - table_metadata_with_nullability user_metadata_with_nullability; // special parameter only used by detail::write() to indicate that we are guaranteeing // a single table write. this enables some internal optimizations. bool const single_write_mode; + // optional user metadata + std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 564d919b65d..e53ae4ff0c1 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -22,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -36,10 +40,15 @@ namespace lists { namespace detail { namespace { template -struct has_negative_nans { +struct has_negative_nans_fn { column_device_view const d_entries; bool const has_nulls; + has_negative_nans_fn(column_device_view const d_entries, bool const has_nulls) + : d_entries(d_entries), has_nulls(has_nulls) + { + } + __device__ Type operator()(size_type idx) const noexcept { if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } @@ -50,30 +59,53 @@ struct has_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has any negative NaN entry + * @brief A structure to be used along with type_dispatcher to check if a column has any + * negative NaN value. + * + * This functor is used to check for replacing negative NaN if there exists one. 
It is necessary + because when calling to `lists::detail::sort_lists`, the negative NaN and positive NaN values (if + both exist) are separated to the two ends of the output column. This is due to the API + `lists::detail::sort_lists` internally calls `cub::DeviceSegmentedRadixSort`, which performs + sorting by comparing bits of the input numbers. Since negative and positive NaN have + different bits representation, they may not be moved to be close to each other after sorting. + */ -struct has_negative_nans_fn { +struct has_negative_nans_dispatch { template >* = nullptr> bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept { auto const d_entries = column_device_view::create(lists_entries, stream); - return thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.size()), - detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans_fn{*d_entries, lists_entries.has_nulls()}); } - template >* = nullptr> - bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const { - // Columns of non floating-point data will never contain NaN + // Recursively check negative NaN on the children columns.
+ return std::any_of( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::has_negative_nans_dispatch{}, col, stream); + }); + } + + template && + !std::is_same_v>* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const + { + // Columns of non floating-point data will never contain NaN. return false; } }; template -struct replace_negative_nans { +struct replace_negative_nans_fn { __device__ Type operator()(Type val) const noexcept { return std::isnan(val) ? std::numeric_limits::quiet_NaN() : val; @@ -81,58 +113,63 @@ struct replace_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries - * of a floating-point data column + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all rows + * in a floating-point data column. */ -struct replace_negative_nans_fn { - template >* = nullptr> - void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const +struct replace_negative_nans_dispatch { + template && + !std::is_same_v>* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view) const noexcept { - CUDF_FAIL("Cannot operate on a type that is not floating-point."); + // For non floating point type and non struct, just return a copy of the input. 
+ return std::make_unique(lists_entries); } template >* = nullptr> - void operator()(column_view const& lists_entries, - mutable_column_view const& new_entries, - rmm::cuda_stream_view stream) const noexcept + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept { - // Do not care whether an entry is null or not, just consider it as a floating-point value - thrust::transform(rmm::exec_policy(stream), - lists_entries.begin(), - lists_entries.end(), - new_entries.begin(), - detail::replace_negative_nans{}); - } -}; + auto new_entries = cudf::detail::allocate_like( + lists_entries, lists_entries.size(), cudf::mask_allocation_policy::NEVER, stream); + new_entries->set_null_mask(cudf::detail::copy_bitmask(lists_entries, stream), + lists_entries.null_count()); -/** - * @brief Transform a given lists column to a new lists column in which all the list entries holding - * -NaN value are replaced by (positive) NaN - */ -std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, - lists_column_view const& lists_column, - rmm::cuda_stream_view stream) -{ - auto new_offsets = std::make_unique(lists_column.offsets()); - auto new_entries = std::make_unique(lists_entries); + // Replace all negative NaN values. 
+ thrust::transform(rmm::exec_policy(stream), + lists_entries.template begin(), + lists_entries.template end(), + new_entries->mutable_view().template begin(), + detail::replace_negative_nans_fn{}); - type_dispatcher(lists_entries.type(), - detail::replace_negative_nans_fn{}, - lists_entries, - new_entries->mutable_view(), - stream); + return new_entries; + } - return make_lists_column( - lists_column.size(), - std::move(new_offsets), - std::move(new_entries), - lists_column.null_count(), - cudf::detail::copy_bitmask( - lists_column.parent(), stream, rmm::mr::get_current_device_resource())); -} + template >* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept + { + std::vector> output_struct_members; + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + std::back_inserter(output_struct_members), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::replace_negative_nans_dispatch{}, col, stream); + }); + + return cudf::make_structs_column(lists_entries.size(), + std::move(output_struct_members), + lists_entries.null_count(), + cudf::detail::copy_bitmask(lists_entries, stream), + stream); + } +}; /** - * @brief Generate a 0-based offset column for a lists column + * @brief Generate a 0-based offset column for a lists column. * * Given a lists_column_view, which may have a non-zero offset, generate a new column containing * 0-based list offsets. 
This is done by subtracting each of the input list offset by the first @@ -143,11 +180,10 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e * then output_offsets = { 0, 4, 6, 10 } * @endcode * - * @param lists_column The input lists column - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing 0-based list offsets + * @param lists_column The input lists column. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing 0-based list offsets. */ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, rmm::cuda_stream_view stream, @@ -168,7 +204,35 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co } /** - * @brief Populate list offsets for all list entries + * @brief Transform a given lists column to a new lists column in which all the list entries holding + * -NaN value are replaced by (positive) NaN. + * + * Replacing -NaN by NaN is necessary before sorting (individual) lists because the sorting API is + * using radix sort, which compares bits of the number thus it may separate -NaN from NaN to the two + * ends of the result column. + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + // We need to copy the offsets column of the input lists_column. Since the input lists_column may + // be sliced, we need to generate clean offsets (i.e., offsets starting from zero).
+ auto new_offsets = + generate_clean_offsets(lists_column, stream, rmm::mr::get_current_device_resource()); + auto new_entries = type_dispatcher( + lists_entries.type(), detail::replace_negative_nans_dispatch{}, lists_entries, stream); + + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); +} + +/** + * @brief Populate list offsets for all list entries. * * Given an `offsets` column_view containing offsets of a lists column and a number of all list * entries in the column, generate an array that maps from each list entry to the offset of the list @@ -179,12 +243,11 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } * @endcode * - * @param num_entries The number of list entries - * @param offsets Column view to the list offsets - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing entry list offsets + * @param num_entries The number of list entries. + * @param offsets Column view to the list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing entry list offsets. */ std::unique_ptr generate_entry_list_offsets(size_type num_entries, column_view const& offsets, @@ -205,95 +268,172 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, } /** - * @brief Performs an equality comparison between two entries in a lists column + * @brief Performs an equality comparison between two entries in a lists column. * - * For the two elements that are in the same list in the lists column, they will always be - * considered as different. 
If they are from the same list and their type is one of floating - * point types, this functor will return the same comparison result as - * `cudf::element_equality_comparator`. + * For the two elements that are NOT in the same list in the lists column, they will always be + * considered as different. If they are from the same list and their type is not floating point, + * this functor will return the same comparison result as `cudf::element_equality_comparator`. * * For floating-point types, entries holding NaN value can be considered as different values or the - * same value depending on the nans_equal parameter. + * same value depending on the `nans_equal` parameter. * - * @tparam Type The data type of entries + * @tparam Type The data type of entries * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only * applicable for floating-point data column) */ -template -class list_entry_comparator { - public: - list_entry_comparator(offset_type const* list_offsets, - column_device_view d_view, - null_equality nulls_equal, - bool has_nulls) - : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) +template +struct column_row_comparator_fn { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __host__ __device__ column_row_comparator_fn(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) { } - template - std::enable_if_t and nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& 
rhs_val) const noexcept { - // Two entries are not considered for equality if they belong to different lists - if (list_offsets[i] != list_offsets[j]) { return false; } + return lhs_val == rhs_val; + } - if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { - return nulls_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; - } - } + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept + { + // If both element(i) and element(j) are NaNs and nans are considered as equal value then this + // comparison will return `true`. This is the desired behavior in Pandas. + if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `true`. This is the desired behavior in Pandas. - auto const lhs = d_view.element(i); - auto const rhs = d_view.element(j); - if (std::isnan(lhs) and std::isnan(rhs)) { return true; } - return lhs == rhs; + // If nans are considered as NOT equal, even both element(i) and element(j) are NaNs this + // comparison will still return `false`. This is the desired behavior in Apache Spark. + return lhs_val == rhs_val; } - template - std::enable_if_t or not nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + bool __device__ operator()(size_type i, size_type j) const noexcept { - // Two entries are not considered for equality if they belong to different lists + // Two entries are not considered for equality if they belong to different lists. 
if (list_offsets[i] != list_offsets[j]) { return false; } if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { + bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; + bool const rhs_is_null{rhs.nullable() && rhs.is_null_nocheck(j)}; + if (lhs_is_null && rhs_is_null) { return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { return false; } } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `false`. This is the desired behavior in Apache Spark. - return d_view.element(i) == d_view.element(j); + return compare(lhs.element(i), lhs.element(j)); + } +}; + +/** + * @brief Struct used in type_dispatcher for comparing two entries in a lists column. + */ +struct column_row_comparator_dispatch { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __device__ column_row_comparator_dispatch(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + template ()>* = nullptr> + bool __device__ operator()(size_type i, size_type j) const noexcept + { + return column_row_comparator_fn{ + list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); + } + + template ()>* = nullptr> + bool operator()(size_type i, size_type j) const + { + CUDF_FAIL( + "`column_row_comparator_dispatch` cannot operate on types that are not equally comparable."); } +}; - private: - offset_type const* list_offsets; - 
column_device_view d_view; - null_equality nulls_equal; - bool has_nulls; +/** + * @brief Performs an equality comparison between rows of two tables using `column_row_comparator` + * to compare rows of their corresponding columns. + */ +struct table_row_comparator_fn { + offset_type const* const list_offsets; + table_device_view const lhs; + table_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + table_row_comparator_fn(offset_type const* const list_offsets, + table_device_view const& lhs, + table_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + bool __device__ operator()(size_type i, size_type j) const noexcept + { + auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { + return type_dispatcher( + lhs.type(), + column_row_comparator_dispatch{list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}, + i, + j); + }; + + return thrust::equal(thrust::seq, lhs.begin(), lhs.end(), rhs.begin(), column_comp); + } }; /** - * @brief Construct type-dispatched function object for copying indices of the list entries - * ignoring duplicates + * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring + * duplicates. 
*/ -struct get_unique_entries_fn { - template ()>* = nullptr> +struct get_unique_entries_dispatch { + template () && + !std::is_same_v>* = nullptr> offset_type* operator()(offset_type const*, - column_device_view&, + column_view const&, size_type, offset_type*, null_equality, @@ -301,12 +441,13 @@ struct get_unique_entries_fn { bool, rmm::cuda_stream_view) const { - CUDF_FAIL("Cannot operate on types that are not equally comparable."); + CUDF_FAIL( + "`get_unique_entries_dispatch` cannot operate on types that are not equally comparable."); } template ()>* = nullptr> offset_type* operator()(offset_type const* list_offsets, - column_device_view& d_view, + column_view const& all_lists_entries, size_type num_entries, offset_type* output_begin, null_equality nulls_equal, @@ -314,41 +455,69 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (nans_equal == nan_equality::ALL_EQUAL) { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } else { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } + auto const d_view = column_device_view::create(all_lists_entries, stream); + auto const comp = column_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + + template >* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_view const& all_lists_entries, + size_type num_entries, + 
offset_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + rmm::cuda_stream_view stream) const noexcept + { + auto const entries_tview = table_view{{all_lists_entries}}; + auto const flatten_nullability = has_nested_nulls(entries_tview) + ? structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + auto const entries_flattened = cudf::structs::detail::flatten_nested_columns( + entries_tview, {order::ASCENDING}, {null_order::AFTER}, flatten_nullability); + auto const d_view = table_device_view::create(std::get<0>(entries_flattened), stream); + + auto const comp = table_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); } }; /** - * @brief Copy list entries and entry list offsets ignoring duplicates + * @brief Copy list entries and entry list offsets ignoring duplicates. * * Given an array of all entries flattened from a list column and an array that maps each entry to * the offset of the list containing that entry, those entries and list offsets are copied into * new arrays such that the duplicated entries within each list will be ignored. * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal - * value (only applicable for floating-point data column) - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * + * @param all_lists_entries The input array containing all list entries. 
+ * @param entries_list_offsets A map from list entries to their corresponding list offsets. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered equal + * (only applicable for floating-point data column). + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * contains their corresponding list offsets. */ std::vector> get_unique_entries_and_list_offsets( column_view const& all_lists_entries, @@ -358,16 +527,15 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_entries = all_lists_entries.size(); - auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries + // Allocate memory to store the indices of the unique entries. auto unique_indices = rmm::device_uvector(num_entries, stream); auto const output_begin = unique_indices.begin(); auto const output_end = type_dispatcher(all_lists_entries.type(), - get_unique_entries_fn{}, + get_unique_entries_dispatch{}, entries_list_offsets.begin(), - *d_view_entries, + all_lists_entries, num_entries, output_begin, nulls_equal, @@ -375,9 +543,9 @@ std::vector> get_unique_entries_and_list_offsets( all_lists_entries.has_nulls(), stream); - // Collect unique entries and entry list offsets + // Collect unique entries and entry list offsets. // The new null_count and bitmask of the unique entries will also be generated - // by the gather function + // by the gather function. 
return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, output_begin, output_end, @@ -388,27 +556,27 @@ std::vector> get_unique_entries_and_list_offsets( } /** - * @brief Generate list offsets from entry offsets + * @brief Generate list offsets from entry offsets. * - * Generate an array of list offsets for the final result lists column. The list - * offsets of the original lists column are also taken into account to make sure the result lists - * column will have the same empty list rows (if any) as in the original lists column. + * Generate an array of list offsets for the final result lists column. The list offsets of the + * original lists column are also taken into account to make sure the result lists column will have + * the same empty list rows (if any) as in the original lists column. * - * @param[in] num_entries The number of unique entries after removing duplicates - * @param[in] entries_list_offsets The mapping from list entries to their list offsets - * @param[out] original_offsets The list offsets of the original lists column, which - * will also be used to store the new list offsets - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device resource used to allocate memory + * @param num_entries The number of unique entries after removing duplicates. + * @param entries_list_offsets The mapping from list entries to their list offsets. + * @param original_offsets The list offsets of the original lists column, which will also be used to + * store the new list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. 
*/ void generate_offsets(size_type num_entries, column_view const& entries_list_offsets, mutable_column_view const& original_offsets, rmm::cuda_stream_view stream) { - // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) + // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any). // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, - // then new_offsets = { 0, 4, 5, 8, 10 } + // then new_offsets = { 0, 4, 5, 8, 10 }. auto const new_offsets = allocate_like( original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); thrust::copy_if(rmm::exec_policy(stream), @@ -421,10 +589,9 @@ void generate_offsets(size_type num_entries, }); // Generate a prefix sum of number of empty lists, storing inplace to the original lists - // offsets + // offsets. // If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), - // and new_offsets = { 0, 4, 6 }, - // then output = { 0, 1, 1, 2, 2, 3} + // and new_offsets = { 0, 4, 6 }, then output = { 0, 1, 1, 2, 2, 3}. auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( 0, [offsets = original_offsets.begin()] __device__(auto i) { return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; @@ -434,10 +601,10 @@ void generate_offsets(size_type num_entries, iter_trans_begin + original_offsets.size(), original_offsets.begin()); - // Generate the final list offsets + // Generate the final list offsets. // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, - // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, - // then output = { 0, 0, 4, 4, 5, 5 } + // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + // then output = { 0, 0, 4, 4, 5, 5 }. 
thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(original_offsets.size()), @@ -453,7 +620,7 @@ void generate_offsets(size_type num_entries, /** * @copydoc cudf::lists::drop_list_duplicates * - * @param stream CUDA stream used for device memory operations and kernel launches + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, @@ -462,22 +629,23 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (cudf::is_nested(lists_column.child().type())) { - CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + if (auto const child_type = lists_column.child().type(); + cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { + CUDF_FAIL("Nested types other than STRUCT are not supported in `drop_list_duplicates`."); } - // Flatten all entries (depth = 1) of the lists column + // Flatten all entries (depth = 1) of the lists column. auto const lists_entries = lists_column.get_sliced_child(stream); - // sorted_lists will store the results of the original lists after calling segmented_sort + // sorted_lists will store the results of the original lists after calling segmented_sort. auto const sorted_lists = [&]() { // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, - // we need to replace -NaN by NaN before sorting + // we need to replace -NaN by NaN before sorting. 
auto const replace_negative_nan = - nans_equal == nan_equality::ALL_EQUAL and - type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + nans_equal == nan_equality::ALL_EQUAL && + type_dispatcher( + lists_entries.type(), detail::has_negative_nans_dispatch{}, lists_entries, stream); if (replace_negative_nan) { - // The column new_lists_column is temporary, thus we will not pass in `mr` auto const new_lists_column = detail::replace_negative_nans_entries(lists_entries, lists_column, stream); return detail::sort_lists( @@ -490,28 +658,28 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu auto const sorted_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); - // Generate a 0-based offset column + // Generate a 0-based offset column. auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - // Generate a mapping from list entries to offsets of the lists containing those entries + // Generate a mapping from list entries to offsets of the lists containing those entries. auto const entries_list_offsets = detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); - // Copy non-duplicated entries (along with their list offsets) to new arrays + // Copy non-duplicated entries (along with their list offsets) to new arrays. auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); - // Generate offsets for the new lists column + // Generate offsets for the new lists column. detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), unique_entries_and_list_offsets.back()->view(), lists_offsets->mutable_view(), stream); - // Construct a new lists column without duplicated entries + // Construct a new lists column without duplicated entries. 
// Reuse the null_count and bitmask of the lists_column: those are the null information for - // the list elements (rows) + // the list elements (rows). // For the entries of those lists (rows), their null_count and bitmask were generated separately - // during the step `get_unique_entries_and_list_offsets` above + // during the step `get_unique_entries_and_list_offsets` above. return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 4d1d6448dd0..b9b73d98ed2 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -169,6 +169,10 @@ struct compute_string_sizes_and_interleave_lists_fn { auto const start_str_idx = list_offsets[list_id]; auto const end_str_idx = list_offsets[list_id + 1]; + // In case of empty list (i.e. it doesn't contain any string element), we just ignore it because + // there will not be anything to store for that list in the child column. + if (start_str_idx == end_str_idx) { return; } + // read_idx and write_idx are indices of string elements. size_type write_idx = dst_list_offsets[idx]; diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu new file mode 100644 index 00000000000..9aea59a195b --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { +namespace tdigest { + +// https://developer.nvidia.com/blog/lerp-faster-cuda/ +template +__device__ inline T lerp(T v0, T v1, T t) +{ + return fma(t, v1, fma(-t, v0, v0)); +} + +struct centroid { + double mean; + double weight; +}; + +struct make_centroid { + double const* means; + double const* weights; + __device__ centroid operator()(size_type i) { return {means[i], weights[i]}; } +}; + +// kernel for computing percentiles on input tdigest (mean, weight) centroid data. +template +__global__ void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + + auto const num_tdigests = tdigest_offsets.size() - 1; + auto const tdigest_index = tid / percentiles.size(); + if (tdigest_index >= num_tdigests) { return; } + auto const pindex = tid % percentiles.size(); + + // size of the digest we're querying + auto const tdigest_size = tdigest_offsets[tdigest_index + 1] - tdigest_offsets[tdigest_index]; + // no work to do. 
values will be set to null + if (tdigest_size == 0 || !percentiles.is_valid(pindex)) { return; } + + output[tid] = [&]() { + double const percentage = percentiles.element(pindex); + double const* cumulative_weight = cumulative_weight_ + tdigest_offsets[tdigest_index]; + + // centroids for this particular tdigest + CentroidIter centroids = centroids_ + tdigest_offsets[tdigest_index]; + + // min and max for the digest + double const* min_val = min_ + tdigest_index; + double const* max_val = max_ + tdigest_index; + + double const total_weight = cumulative_weight[tdigest_size - 1]; + + // The following Arrow code serves as a basis for this computation + // https://github.com/apache/arrow/blob/master/cpp/src/arrow/util/tdigest.cc#L280 + double const weighted_q = percentage * total_weight; + if (weighted_q <= 1) { + return *min_val; + } else if (weighted_q >= total_weight - 1) { + return *max_val; + } + + // determine what centroid this weighted quantile falls within. + size_type const centroid_index = static_cast(thrust::distance( + cumulative_weight, + thrust::lower_bound( + thrust::seq, cumulative_weight, cumulative_weight + tdigest_size, weighted_q))); + centroid c = centroids[centroid_index]; + + // diff == how far from the "center" of the centroid we are, + // in unit weights. + // visually: + // + // centroid of weight 7 + // C <-- center of the centroid + // |-------| + // | | | + // X Y Z + // X has a diff of -2 (2 units to the left of the center of the centroid) + // Y has a diff of 0 (directly in the middle of the centroid) + // Z has a diff of 3 (3 units to the right of the center of the centroid) + double const diff = weighted_q + c.weight / 2 - cumulative_weight[centroid_index]; + + // if we're completely within a centroid of weight 1, just return that. + if (c.weight == 1 && std::abs(diff) < 0.5) { return c.mean; } + + // otherwise, interpolate between two centroids. 
+ + // get the two centroids we want to interpolate between + auto const look_left = diff < 0; + auto const [lhs, rhs] = [&]() { + if (look_left) { + // if we're at the first centroid, "left" of us is the min value + auto const first_centroid = centroid_index == 0; + auto const lhs = first_centroid ? centroid{*min_val, 0} : centroids[centroid_index - 1]; + auto const rhs = c; + return std::pair{lhs, rhs}; + } else { + // if we're at the last centroid, "right" of us is the max value + auto const last_centroid = (centroid_index == tdigest_size - 1); + auto const lhs = c; + auto const rhs = last_centroid ? centroid{*max_val, 0} : centroids[centroid_index + 1]; + return std::pair{lhs, rhs}; + } + }(); + + // compute interpolation value t + + // total interpolation range. the total range of "space" between the lhs and rhs centroids. + auto const tip = lhs.weight / 2 + rhs.weight / 2; + // if we're looking left, diff is negative, so shift it so that we are interpolating + // from lhs -> rhs. + auto const t = (look_left) ? (diff + tip) / tip : diff / tip; + + // interpolate + return lerp(lhs.mean, rhs.mean, t); + }(); +} + +/** + * @brief Calculate approximate percentiles on a provided tdigest column. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest of from row `i` in `input`. The length of each output list + * is the number of percentiles specified in `percentiles` + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @returns Column of doubles containing requested percentile values. 
+ */ +std::unique_ptr compute_approx_percentiles(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view lcv(input.child(centroid_column_index)); + column_view min_col = input.child(min_column_index); + column_view max_col = input.child(max_column_index); + + // offsets, representing the size of each tdigest + auto offsets = lcv.offsets(); + + // extract means and weights + auto data = lcv.parent().child(lists_column_view::child_column_index); + structs_column_view tdigest(data); + auto mean = tdigest.child(mean_column_index); + auto weight = tdigest.child(weight_column_index); + + // compute summed weights + auto cumulative_weights = cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, + mean.size(), + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [offsets_begin = offsets.begin(), + offsets_end = offsets.end()] __device__(size_type i) { + return thrust::distance( + offsets_begin, + thrust::prev(thrust::upper_bound(thrust::seq, offsets_begin, offsets_end, i))); + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + weight.size(), + weight.begin(), + cumulative_weights->mutable_view().begin()); + + auto percentiles_cdv = column_device_view::create(percentiles); + + // leaf is a column of size input.size() * percentiles.size() + auto const num_output_values = input.size() * percentiles.size(); + + // null percentiles become null results. + auto [null_mask, null_count] = [&]() { + return percentiles.null_count() != 0 + ? 
cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_output_values, + [percentiles = *percentiles_cdv] __device__(size_type i) { + return percentiles.is_valid(i % percentiles.size()); + }) + : std::pair{rmm::device_buffer{}, 0}; + }(); + + auto result = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_output_values, std::move(null_mask), null_count, stream, mr); + + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, make_centroid{mean.begin(), weight.begin()}); + + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); + compute_percentiles_kernel<<>>( + {offsets.begin(), static_cast(offsets.size())}, + *percentiles_cdv, + centroids, + min_col.begin(), + max_col.begin(), + cumulative_weights->view().begin(), + result->mutable_view().begin()); + + return result; +} + +void check_is_valid_tdigest_column(column_view const& col) +{ + // sanity check that this is actually tdigest data + CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(col.size() > 0, "tdigest columns must have > 0 rows"); + CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); + CUDF_EXPECTS(col.nullable() == false, "Encountered nullable tdigest column"); + + structs_column_view scv(col); + CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + + lists_column_view lcv(scv.child(centroid_column_index)); + auto data = lcv.child(); + CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(data.num_children() == 2, + "Encountered tdigest column with an invalid number of children"); + 
auto mean = data.child(mean_column_index); + CUDF_EXPECTS(mean.type().id() == type_id::FLOAT64, "Encountered invalid tdigest mean column"); + auto weight = data.child(weight_column_index); + CUDF_EXPECTS(weight.type().id() == type_id::FLOAT64, "Encountered invalid tdigest weight column"); +} + +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // mean/weight columns + std::vector> inner_children; + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + + auto offsets = cudf::make_fixed_width_column( + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + offsets->mutable_view().begin(), + offsets->mutable_view().end(), + 0); + auto list = + make_lists_column(1, + std::move(offsets), + cudf::make_structs_column(0, std::move(inner_children), 0, {}, stream, mr), + 0, + {}); + + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + 0); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + 0); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + + return make_structs_column(1, std::move(children), 0, {}, stream, mr); +} + +} // namespace tdigest. 
+ +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + tdigest::check_is_valid_tdigest_column(input); + CUDF_EXPECTS(percentiles.type().id() == type_id::FLOAT64, + "percentile_approx expects float64 percentile inputs"); + + // output is a list column with each row containing percentiles.size() percentile values + auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto row_size_iter = thrust::make_constant_iterator(percentiles.size()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter, + row_size_iter + input.size() + 1, + offsets->mutable_view().begin()); + + if (percentiles.size() == 0) { + return cudf::make_lists_column( + input.size(), + std::move(offsets), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + input.size(), + cudf::detail::create_null_mask( + input.size(), mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr)); + } + + // if any of the input digests are empty, nullify the corresponding output rows (values will be + // uninitialized) + auto [bitmask, null_count] = [stream, mr, input]() { + lists_column_view lcv(input.child(tdigest::centroid_column_index)); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 1 : 0; + }); + auto const null_count = thrust::reduce(rmm::exec_policy(stream), iter, iter + input.size(), 0); + if (null_count == 0) { + return std::pair{rmm::device_buffer{}, null_count}; + } + return cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + input.size(), + [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 
0 : 1; + }, + stream, + mr); + }(); + + return cudf::make_lists_column( + input.size(), + std::move(offsets), + tdigest::compute_approx_percentiles(input, percentiles, stream, mr), + null_count, + std::move(bitmask), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr) +{ + return percentile_approx(input, percentiles, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index dc74a5f4ff1..42b57bdb47a 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -26,7 +26,7 @@ namespace cudf { namespace detail { -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -75,7 +75,7 @@ struct inplace_column_sort_fn { } }; -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -101,7 +101,7 @@ std::unique_ptr
sort(table_view input, } // namespace detail -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) @@ -110,7 +110,7 @@ std::unique_ptr sorted_order(table_view input, return detail::sorted_order(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 860e88ae76e..75335579de2 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -25,7 +25,7 @@ namespace cudf { namespace detail { -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -36,7 +36,7 @@ std::unique_ptr stable_sorted_order(table_view input, } // namespace detail -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9553d463ab..6d385ff969d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,6 +62,7 @@ ConfigureTest(GROUPBY_TEST groupby/count_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp + groupby/lists_tests.cpp groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp @@ -84,6 +85,7 @@ ConfigureTest(GROUPBY_TEST groupby/sum_of_squares_tests.cpp groupby/sum_scan_tests.cpp groupby/sum_tests.cpp + groupby/tdigest_tests.cu groupby/var_tests.cpp) ################################################################################################### @@ -122,6 +124,7 @@ ConfigureTest(HASH_MAP_TEST ################################################################################################### # - quantiles tests ------------------------------------------------------------------------------- ConfigureTest(QUANTILES_TEST + quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp quantiles/quantiles_test.cpp) diff --git a/cpp/tests/groupby/groupby_test_util.hpp 
b/cpp/tests/groupby/groupby_test_util.hpp index 542205b5b51..b333d9dacba 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -27,6 +27,9 @@ #include #include #include +#include + +#include namespace cudf { namespace test { @@ -128,5 +131,57 @@ inline void test_single_scan(column_view const& keys, expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); } +template +inline T frand() +{ + return static_cast(rand()) / static_cast(RAND_MAX); +} + +template +inline T rand_range(T min, T max) +{ + return min + static_cast(frand() * (max - min)); +} + +inline std::unique_ptr generate_typed_percentile_distribution( + std::vector const& buckets, + std::vector const& sizes, + data_type t, + bool sorted = false) +{ + srand(0); + + std::vector values; + size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); + values.reserve(total_size); + for (size_t idx = 0; idx < sizes.size(); idx++) { + double min = idx == 0 ? 0.0f : buckets[idx - 1]; + double max = buckets[idx]; + + for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { + values.push_back(rand_range(min, max)); + } + } + + if (sorted) { std::sort(values.begin(), values.end()); } + + cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); + return cudf::cast(src, t); +} + +// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent +// is to provide a standardized set of inputs for use with tdigest generation tests and +// percentile_approx tests. 
std::vector +// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector +// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; +inline std::unique_ptr generate_standardized_percentile_distribution( + data_type t = data_type{type_id::FLOAT64}, bool sorted = false) +{ + std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; + std::vector b_sizes{ + 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; + return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/lists_tests.cpp b/cpp/tests/groupby/lists_tests.cpp new file mode 100644 index 00000000000..11b8ffa92b9 --- /dev/null +++ b/cpp/tests/groupby/lists_tests.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +namespace cudf { +namespace test { + +template +struct groupby_lists_test : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes); + +namespace { +// Checking with a single aggregation, and aggregation column. +// This test is orthogonal to the aggregation type; it focuses on testing the grouping +// with LISTS keys. 
+auto sum_agg() { return cudf::make_sum_aggregation(); } + +void test_sort_based_sum_agg(column_view const& keys, column_view const& values) +{ + test_single_agg( + keys, values, keys, values, sum_agg(), force_use_sort_impl::YES, null_policy::INCLUDE); +} + +void test_hash_based_sum_agg(column_view const& keys, column_view const& values) +{ + test_single_agg( + keys, values, keys, values, sum_agg(), force_use_sort_impl::NO, null_policy::INCLUDE); +} + +} // namespace + +TYPED_TEST(groupby_lists_test, top_level_lists_are_unsupported) +{ + // Test that grouping on LISTS columns fails visibly. + + // clang-format off + auto keys = lists_column_wrapper { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto values = fixed_width_column_wrapper { 0, 1, 2, 3, 4 }; + // clang-format on + + EXPECT_THROW(test_sort_based_sum_agg(keys, values), cudf::logic_error); + EXPECT_THROW(test_hash_based_sum_agg(keys, values), cudf::logic_error); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 613e1555b79..d390c8a1880 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -160,5 +160,57 @@ TEST_F(groupby_dictionary_mean_test, basic) keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMeanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + // clang-format on + + auto const 
expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals_min = fp_wrapper{{3, 4, 5}, scale}; + + auto agg = cudf::make_mean_aggregation(); + test_single_agg( + keys, vals, expect_keys, expect_vals_min, std::move(agg), force_use_sort_impl::YES); + } +} + +TYPED_TEST(FixedPointTestBothReps, GroupByHashMeanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using K = int32_t; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + // clang-format on + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals_min = fp_wrapper{{3, 4, 5}, scale}; + + auto agg = cudf::make_mean_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals_min, std::move(agg)); + } +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index 00126a4a5a0..3715ba8d17b 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include "cudf/aggregation.hpp" -#include "cudf/types.hpp" using namespace cudf::test::iterators; @@ -34,7 +32,7 @@ template struct groupby_structs_test : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(groupby_structs_test, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(groupby_structs_test, cudf::test::FixedWidthTypes); using V = int32_t; // Type of Aggregation Column. using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member. 
@@ -79,27 +77,43 @@ void print_agg_results(column_view const& keys, column_view const& vals) } } -void test_sum_agg(column_view const& keys, - column_view const& values, - column_view const& expected_keys, - column_view const& expected_values) +void test_sort_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) { test_single_agg(keys, values, expected_keys, expected_values, sum_agg(), - force_use_sort_impl::NO, + force_use_sort_impl::YES, null_policy::INCLUDE); +} + +void test_hash_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ test_single_agg(keys, values, expected_keys, expected_values, sum_agg(), - force_use_sort_impl::YES, + force_use_sort_impl::NO, null_policy::INCLUDE); } +void test_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ + test_sort_based_sum_agg(keys, values, expected_keys, expected_values); + test_hash_based_sum_agg(keys, values, expected_keys, expected_values); +} + } // namespace TYPED_TEST(groupby_structs_test, basic) @@ -312,7 +326,8 @@ TYPED_TEST(groupby_structs_test, lists_are_unsupported) // clang-format on auto keys = structs{{member_0, member_1}}; - EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error); + EXPECT_THROW(test_sort_based_sum_agg(keys, values, keys, values), cudf::logic_error); + EXPECT_THROW(test_hash_based_sum_agg(keys, values, keys, values), cudf::logic_error); } } // namespace test diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu new file mode 100644 index 00000000000..818999867c1 --- /dev/null +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow/util/tdigest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace test { + +using namespace cudf; + +typedef thrust::tuple expected_value; + +template +struct TDigestAllTypes : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(TDigestAllTypes, cudf::test::NumericTypes); + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +void tdigest_sample_compare(column_view const& result, + std::vector const& h_expected) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(result); + cudf::structs_column_view scv(result); + cudf::lists_column_view 
lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + cudf::structs_column_view tdigests(lcv.child()); + column_view result_mean = tdigests.child(cudf::detail::tdigest::mean_column_index); + column_view result_weight = tdigests.child(cudf::detail::tdigest::weight_column_index); + + auto expected_mean = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto expected_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto sampled_result_mean = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto sampled_result_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + + rmm::device_vector expected(h_expected.begin(), h_expected.end()); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each( + rmm::exec_policy(rmm::cuda_stream_default), + iter, + iter + expected.size(), + [expected = expected.data().get(), + expected_mean = expected_mean->mutable_view().begin(), + expected_weight = expected_weight->mutable_view().begin(), + result_mean = result_mean.begin(), + result_weight = result_weight.begin(), + sampled_result_mean = sampled_result_mean->mutable_view().begin(), + sampled_result_weight = + sampled_result_weight->mutable_view().begin()] __device__(size_type index) { + expected_mean[index] = thrust::get<1>(expected[index]); + expected_weight[index] = thrust::get<2>(expected[index]); + auto const src_index = thrust::get<0>(expected[index]); + sampled_result_mean[index] = result_mean[src_index]; + sampled_result_weight[index] = result_weight[src_index]; + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_mean, *sampled_result_mean); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_weight, *sampled_result_weight); +} + +template +std::unique_ptr make_expected_tdigest(column_view const& mean, + column_view 
const& weight, + T min, + T max) +{ + std::vector> inner_children; + inner_children.push_back(std::make_unique(mean)); + inner_children.push_back(std::make_unique(weight)); + // tdigest struct + auto tdigests = cudf::make_structs_column(mean.size(), std::move(inner_children), 0, {}); + + std::vector h_offsets{0, mean.size()}; + auto offsets = + cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); + cudaMemcpy(offsets->mutable_view().begin(), + h_offsets.data(), + sizeof(offset_type) * 2, + cudaMemcpyHostToDevice); + + auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); + + auto min_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + static_cast(min)); + auto max_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + static_cast(max)); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(1, std::move(children), 0, {}); +} + +TYPED_TEST(TDigestAllTypes, Simple) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. 
this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{126, 15, 1, 99, 67}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 15, 67, 99, 126}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 126; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, SimpleWithNulls) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 44, 67, 100, 122}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 122; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, AllNull) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. 
this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + // NOTE: an empty tdigest column still has 1 row. + auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, LargeGroups) +{ + auto _values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + int const delta = 1000; + + // generate a random set of keys + std::vector h_keys; + h_keys.reserve(_values->size()); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, iter + _values->size(), std::back_inserter(h_keys), [](int i) { + return static_cast(round(rand_range(0, 8))); + }); + cudf::test::fixed_width_column_wrapper _keys(h_keys.begin(), h_keys.end()); + + // group the input values together + cudf::table_view k({_keys}); + cudf::groupby::groupby setup_gb(k); + cudf::table_view v({*_values}); + auto groups = setup_gb.get_groups(v); + + // slice it all up so we have keys/columns for everything. 
+ std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + // generate a seperate tdigest for each group + std::vector> parts; + std::transform( + iter, iter + values.size(), std::back_inserter(parts), [&keys, &values, delta](int i) { + cudf::table_view t({keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + auto merged_parts = cudf::concatenate(part_views); + + // generate a tdigest on the whole input set + cudf::table_view t({_keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({*_values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + // verify that they end up the same. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *merged_parts); +} + +struct TDigestTest : public cudf::test::BaseFixture { +}; + +TEST_F(TDigestTest, LargeInputDouble) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. 
since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00040692343794663995, 7}, + {10, 0.16234555627091204477, 153}, + {59, 5.12764811246045937310, 858}, + {250, 62.54581814492237157310, 2356}, + {368, 87.85834376680742252574, 1735}, + {409, 94.07685720279611985006, 1272}, + {491, 99.94197663121231300920, 130}, + {500, 99.99969880795092080916, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07265722021410986331, 739}, + {7, 8.19766194442652640362, 10693}, + {16, 36.82277869518204482802, 20276}, + {29, 72.95424834129075009059, 22623}, + {38, 90.61229683516096145013, 15581}, + {46, 99.07283498858802772702, 5142}, + {50, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + 
cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15508346777729631327, 71618}, + {1, 33.04971680740474226923, 187499}, + {2, 62.50566666553867634093, 231762}, + {3, 83.46216572053654658703, 187500}, + {4, 96.42204425201593664951, 71620}, + {5, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputInt) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::INT32}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 7}, + {14, 0, 212}, + {26, 0.83247422680412408447, 388}, + {44, 2, 648}, + {45, 2.42598187311178170589, 662}, + {342, 82.75190258751908345403, 1971}, + {383, 90, 1577}, + {417, 94.88376068376066996279, 1170}, + {418, 95, 1157}, + {479, 99, 307}, + {500, 99, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto 
result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 739}, + {7, 7.71486018890863167741, 10693}, + {16, 36.32491615703294485229, 20276}, + {29, 72.44392874508245938614, 22623}, + {38, 90.14209614273795523332, 15581}, + {46, 98.64041229093737683797, 5142}, + {50, 99, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 6.66025300902007799664, 71618}, + {1, 32.54912826201739051157, 187499}, + {2, 62.00734805533262772315, 231762}, + {3, 82.96355733333332693746, 187500}, + {4, 95.91280368612116546956, 71620}, + {5, 99, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputDecimal) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. 
so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::DECIMAL32, -4}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00035714285714285709, 7}, + {10, 0.16229738562091505782, 153}, + {59, 5.12759696969697031932, 858}, + {250, 62.54576854838715860296, 2356}, + {368, 87.85829446685879418055, 1735}, + {409, 94.07680636792450457051, 1272}, + {491, 99.94192461538463589932, 130}, + {500, 99.99965000000000259206, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07260811907983763525, 739}, + {7, 8.19761183016926864298, 10693}, + {16, 36.82272891595975750079, 20276}, + {29, 72.95419827167043536065, 22623}, + {38, 90.61224673640975879607, 15581}, + {46, 99.07278498638662256326, 5142}, + {50, 99.99970000000000425189, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15503361864335740705, 71618}, + {1, 33.04966679715625588187, 187499}, + {2, 62.50561666407782013266, 231762}, + {3, 83.46211575573336460820, 187500}, + {4, 96.42199425300195514410, 71620}, + {5, 99.99970000000000425189, 
1}}; + + tdigest_sample_compare(*result, expected); + } +} + +struct TDigestMergeTest : public cudf::test::BaseFixture { +}; + +// Note: there is no need to test different types here as the internals of a tdigest are always +// the same regardless of input. +TEST_F(TDigestMergeTest, Simple) +{ + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + CUDF_EXPECTS(values->size() == 750000, "Unexpected distribution size"); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + auto split_values = cudf::split(*values, {250000, 500000}); + auto split_keys = cudf::split(*keys, {250000, 500000}); + + int const delta = 1000; + + // generate seperate digests + std::vector> parts; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + split_values.size(), + std::back_inserter(parts), + [&split_keys, &split_values, delta](int i) { + cudf::table_view t({split_keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({split_values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + + // merge delta = 1000 + { + int const merge_delta = 1000; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 0, 0}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby gb(key_table); + std::vector requests; + std::vector> aggregations; + 
aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + std::vector expected{{0, 0.00013945158577498588, 2}, + {10, 0.04804393446447510763, 50}, + {59, 1.68846964439246893797, 284}, + {250, 33.36323141295877547918, 1479}, + {368, 65.36307727957283475462, 2292}, + {409, 73.95399208218296394080, 1784}, + {490, 87.67566167909056673579, 1570}, + {491, 87.83119717763385381204, 1570}, + {500, 89.24891838334393412424, 1555}, + {578, 95.87182997389099625707, 583}, + {625, 98.20470345147104751504, 405}, + {700, 99.96818381983835877236, 56}, + {711, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result.second[0].results[0], expected); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp new file mode 100644 index 00000000000..39617c99690 --- /dev/null +++ b/cpp/tests/io/metadata_utilities.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta) +{ + std::function compare_names = + [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { + if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } + ASSERT_EQ(out_col.children.size(), in_col.num_children()); + for (size_t i = 0; i < out_col.children.size(); ++i) { + compare_names(out_col.children[i], in_col.child(i)); + } + }; + + ASSERT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); + + for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { + compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); + } +} + +} // namespace cudf::test diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index fbeba925f1b..cdf0a3b275b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -161,14 +162,10 @@ struct SkipRowTest { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper input_col( sequence, sequence + file_num_rows, validity); - - std::vector> input_cols; - input_cols.push_back(input_col.release()); - auto input_table = std::make_unique
(std::move(input_cols)); - EXPECT_EQ(1, input_table->num_columns()); + table_view input_table({input_col}); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table); cudf_io::write_orc(out_opts); auto begin_sequence = sequence, end_sequence = sequence; @@ -180,9 +177,7 @@ struct SkipRowTest { begin_sequence, end_sequence, validity); std::vector> output_cols; output_cols.push_back(output_col.release()); - auto expected = std::make_unique
(std::move(output_cols)); - EXPECT_EQ(1, expected->num_columns()); - return expected; + return std::make_unique
(std::move(output_cols)); } void test(int skip_rows, int file_num_rows, int read_num_rows) @@ -224,22 +219,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) @@ -250,22 +241,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) @@ -277,15 +264,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestamps.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -294,7 +277,7 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) @@ -307,15 +290,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestampsWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -324,12 +303,12 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, MultiColumn) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -351,29 +330,29 @@ TEST_F(OrcWriterTest, MultiColumn) column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; column_wrapper col6{col6_data, col6_data + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; + + auto child_col = + cudf::test::fixed_width_column_wrapper{48, 27, 25, 31, 351, 351, 29, 15, -1, -99}; + auto col8 = cudf::test::structs_column_wrapper{child_col}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -381,13 +360,13 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, MultiColumnWithNulls) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto 
col1_data = random_values(num_rows); @@ -402,14 +381,14 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 2); }); auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 4 && i <= 6); }); auto col5_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 8); }); auto col6_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 3); }); @@ -420,30 +399,28 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - 
cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, + col0_mask}; + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15, -1, -99}, {1, 0, 1, 1, 0, 1, 1, 1, 0, 1}}; + auto col8 = cudf::test::structs_column_wrapper{{ages_col}, {0, 1, 1, 0, 1, 1, 0, 1, 1, 0}}; + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -451,8 +428,8 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, ReadZeroRows) @@ -463,15 +440,11 @@ 
TEST_F(OrcWriterTest, ReadZeroRows) constexpr auto num_rows = 10; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -498,21 +471,16 @@ TEST_F(OrcWriterTest, Strings) column_wrapper col1{strings.begin(), strings.end()}; column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); + table_view expected({col0, col1, col2}); - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(3, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -520,8 +488,8 @@ TEST_F(OrcWriterTest, Strings) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, SlicedTable) @@ -545,21 +513,24 @@ TEST_F(OrcWriterTest, SlicedTable) column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; column_wrapper col3{seq_col3, seq_col3 + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); - expected_metadata.column_names.emplace_back("col_decimal"); + using lcw = cudf::test::lists_column_wrapper; + lcw col4{{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(4, expected->num_columns()); + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}}; + auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}}; - auto expected_slice = cudf::slice(expected->view(), {2, static_cast(num_rows)}); + table_view expected({col0, col1, col2, col3, col4, col5}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + expected_metadata.column_metadata[3].set_name("col_decimal"); + expected_metadata.column_metadata[4].set_name("lists"); + expected_metadata.column_metadata[5].set_name("structs"); + + auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows)}); auto filepath = temp_env->get_temp_filepath("SlicedTable.orc"); cudf_io::orc_writer_options out_opts = @@ -572,7 +543,7 @@ TEST_F(OrcWriterTest, SlicedTable) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, HostBuffer) @@ -583,17 +554,14 @@ TEST_F(OrcWriterTest, HostBuffer) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper col{seq_col.begin(), seq_col.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); + table_view expected{{col}}; - std::vector> cols; - cols.push_back(col.release()); - const auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); std::vector out_buffer; cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -602,8 +570,8 @@ TEST_F(OrcWriterTest, HostBuffer) .use_index(false); const auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, negTimestampsNano) @@ -618,15 +586,11 @@ TEST_F(OrcWriterTest, negTimestampsNano) -1530705634500000000, -1674638741932929000, }; - - std::vector> cols; - cols.push_back(timestamps_ns.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({timestamps_ns}); auto filepath = temp_env->get_temp_filepath("OrcNegTimestamp.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); @@ -634,10 +598,9 @@ TEST_F(OrcWriterTest, negTimestampsNano) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view().column(0), - result.tbl->view().column(0), - cudf::test::debug_output_level::ALL_ERRORS); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, Slice) @@ -747,21 +710,51 @@ TEST_F(OrcChunkedWriterTest, ManyTables) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } -TEST_F(OrcChunkedWriterTest, Strings) +TEST_F(OrcChunkedWriterTest, Metadata) { - std::vector> cols; + std::vector strings{ + "Monday", "Tuesday", "THURSDAY", "Wednesday", "Friday", "Sunday", "Saturday"}; + const auto num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + table_view expected({col0, col1, col2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + 
expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("ChunkedMetadata.orc"); + cudf_io::chunked_orc_writer_options opts = + cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath}) + .metadata(&expected_metadata); + cudf_io::orc_chunked_writer(opts).write(expected).write(expected); + + cudf_io::orc_reader_options read_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_orc(read_opts); + + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcChunkedWriterTest, Strings) +{ bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); - cols.push_back(strings1.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({strings1}); bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); - cols.push_back(strings2.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -864,7 +857,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; int num_els = 31; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -875,9 +867,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view 
tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -885,9 +875,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -911,7 +899,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; int num_els = 33; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -922,9 +909,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -932,9 +917,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -981,18 +964,12 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, valid_all); column_wrapper col5( sequence, sequence + num_rows, validity); - std::vector> cols; - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - auto expected = 
std::make_unique
(std::move(cols)); + table_view expected({col1, col2, col3, col4, col5}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); @@ -1056,17 +1033,14 @@ TEST_F(OrcWriterTest, SlicedValidMask) column_wrapper col{strings.begin(), strings.end(), validity}; - std::vector> cols; - cols.push_back(col.release()); - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_string"); - // Bug tested here is easiest to reproduce when column_offset % 32 is 31 std::vector indices{31, 34}; - std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + auto sliced_col = cudf::slice(static_cast(col), indices); cudf::table_view tbl{sliced_col}; + cudf_io::table_input_metadata expected_metadata(tbl); + expected_metadata.column_metadata[0].set_name("col_string"); + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) @@ -1078,7 +1052,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcReaderTest, SingleInputs) @@ -1087,9 +1061,9 @@ TEST_F(OrcReaderTest, SingleInputs) auto table1 = create_random_fixed_table(5, 5, true); auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - 
cudf_io::orc_chunked_writer(opts1).write(*table1); + cudf_io::orc_writer_options write_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(write_opts); cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1}}); @@ -1106,15 +1080,19 @@ TEST_F(OrcReaderTest, MultipleInputs) auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - cudf_io::orc_chunked_writer(opts1).write(*table1); + auto const filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(out_opts); + } - auto filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); - cudf_io::chunked_orc_writer_options opts2 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath2}); - cudf_io::orc_chunked_writer(opts2).write(*table2); + auto const filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath2}, table2->view()); + cudf_io::write_orc(out_opts); + } cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1, filepath2}}); @@ -1139,14 +1117,11 @@ TEST_P(OrcWriterTestDecimal, Decimal64) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto tbl = std::make_unique
(std::move(cols)); + cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl); cudf_io::write_orc(out_opts); @@ -1154,7 +1129,7 @@ TEST_P(OrcWriterTestDecimal, Decimal64) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl->view().column(0), result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl.column(0), result.tbl->view().column(0)); } INSTANTIATE_TEST_CASE_P(OrcWriterTest, @@ -1173,14 +1148,11 @@ TEST_F(OrcWriterTest, Decimal32) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); + cudf::table_view expected({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 7260aa9e686..0f59b0d5e15 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -184,25 +185,6 @@ std::unique_ptr make_parquet_list_col( offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); } -void compare_metadata_equality(cudf::io::table_input_metadata in_meta, - cudf::io::table_metadata out_meta) -{ - std::function compare_names = - [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { - if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } - EXPECT_EQ(out_col.children.size(), in_col.num_children()); - for (size_t i = 0; i < out_col.children.size(); ++i) { - compare_names(out_col.children[i], in_col.child(i)); - } - }; - - EXPECT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); - - for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { - compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); - } -} - // Base test fixture for tests struct ParquetWriterTest : public cudf::test::BaseFixture { }; @@ -444,7 +426,7 @@ TEST_F(ParquetWriterTest, MultiColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiColumnWithNulls) @@ -528,7 +510,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) 
// TODO: Need to be able to return metadata in tree form from reader so they can be compared. // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not // have any tests for it c++ or python. - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, Strings) @@ -568,7 +550,7 @@ TEST_F(ParquetWriterTest, Strings) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, SlicedTable) @@ -682,7 +664,7 @@ TEST_F(ParquetWriterTest, SlicedTable) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListColumn) @@ -780,7 +762,7 @@ TEST_F(ParquetWriterTest, ListColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiIndex) @@ -831,7 +813,7 @@ TEST_F(ParquetWriterTest, MultiIndex) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, HostBuffer) @@ -860,7 +842,7 @@ TEST_F(ParquetWriterTest, HostBuffer) const auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - 
compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, NonNullable) @@ -989,7 +971,7 @@ TEST_F(ParquetWriterTest, StructOfList) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListOfStruct) @@ -1044,7 +1026,7 @@ TEST_F(ParquetWriterTest, ListOfStruct) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } // custom data sink that supports device writes. uses plain file io. @@ -1433,7 +1415,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) @@ -1526,7 +1508,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. 
auto result_parent_list = result.tbl->get_column(0); @@ -1697,7 +1679,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ForcedNullability) @@ -1830,7 +1812,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ReadRowGroups) @@ -2552,7 +2534,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting a non-leaf and expecting all hierarchy from that node onwards @@ -2581,7 +2563,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting struct children out of order @@ -2616,7 +2598,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(1).set_name("human?"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } } diff --git 
a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp index 8aae523d12b..17d31c3e387 100644 --- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -72,7 +72,7 @@ struct ListConcatenateRowsTypedTest : public cudf::test::BaseFixture { using TypesForTest = cudf::test::Concat; -TYPED_TEST_CASE(ListConcatenateRowsTypedTest, TypesForTest); +TYPED_TEST_SUITE(ListConcatenateRowsTypedTest, TypesForTest); TYPED_TEST(ListConcatenateRowsTypedTest, ConcatenateEmptyColumns) { @@ -110,10 +110,12 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputNoNull) { using ListsCol = cudf::test::lists_column_wrapper; - auto const col1 = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); - auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); - auto const expected = ListsCol{{1, 2, 7, 8}, {3, 4, 9, 10}, {5, 6, 11, 12}}.release(); - auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + auto const col1 = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); + auto const empty_lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}.release(); + auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); + auto const expected = ListsCol{{1, 2, 7, 8}, {3, 4, 9, 10}, {5, 6, 11, 12}}.release(); + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), empty_lists->view(), col2->view()}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } @@ -121,11 +123,13 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNullableChild) { using ListsCol = cudf::test::lists_column_wrapper; - auto const col1 = ListsCol{{1, 2}, ListsCol{{null}, null_at(0)}, {5, 6}}.release(); - auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); + auto const col1 = ListsCol{{1, 2}, ListsCol{{null}, null_at(0)}, {5, 6}}.release(); + auto const empty_lists = ListsCol{{ListsCol{}, ListsCol{}, ListsCol{}}, null_at(2)}.release(); + 
auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); auto const expected = ListsCol{{1, 2, 7, 8}, ListsCol{{null, 9, 10}, null_at(0)}, {5, 6, 11, 12}}.release(); - auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), empty_lists->view(), col2->view()}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } @@ -466,3 +470,19 @@ TEST_F(ListConcatenateRowsTest, SlicedStringsColumnsInputWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } } + +TEST_F(ListConcatenateRowsTest, StringsColumnsWithEmptyListTest) +{ + auto const col1 = StrListsCol{{"1", "2", "3", "4"}}.release(); + auto const col2 = StrListsCol{{"a", "b", "c"}}.release(); + auto const col3 = StrListsCol{StrListsCol{}}.release(); + auto const col4 = StrListsCol{{"x", "y", "" /*NULL*/, "z"}, null_at(2)}.release(); + auto const col5 = StrListsCol{{StrListsCol{}}, null_at(0)}.release(); + auto const expected = + StrListsCol{{"1", "2", "3", "4", "a", "b", "c", "x", "y", "" /*NULL*/, "z"}, null_at(9)} + .release(); + auto const results = cudf::lists::concatenate_rows( + TView{{col1->view(), col2->view(), col3->view(), col4->view(), col5->view()}}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); +} diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index bc413fd220a..270e01075b9 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,61 +14,65 @@ * limitations under the License. 
*/ +#include + #include #include +#include #include #include -#include #include #include -using int_type = int32_t; -using float_type = float; - -using LIST_COL_FLT = cudf::test::lists_column_wrapper; -using LIST_COL_STR = cudf::test::lists_column_wrapper; +using namespace cudf::test::iterators; -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); +using float_type = float; +using FloatListsCol = cudf::test::lists_column_wrapper; +using StrListsCol = cudf::test::lists_column_wrapper; +using StringsCol = cudf::test::strings_column_wrapper; +using StructsCol = cudf::test::structs_column_wrapper; +using IntsCol = cudf::test::fixed_width_column_wrapper; +using FloatsCol = cudf::test::fixed_width_column_wrapper; -template -void test_once(cudf::column_view const& input, - LCW const& expected, - cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) -{ - auto const results = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (cudf::is_floating_point(input.type())) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - } else { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - } -} +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr verbosity = cudf::test::debug_output_level::FIRST_ERROR; struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // -0.0 and 0.0 should be considered equal - test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, - LIST_COL_FLT{0, 1, 2}); + // -0.0 and 0.0 should be considered equal. 
+ auto const lists = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}; + auto const expected = FloatListsCol{0, 1, 2}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { - // Lists contain inf - test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); - test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, - LIST_COL_FLT{neg_Inf, 0, Inf}); + // Lists contain inf. + { + auto const lists = FloatListsCol{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}; + auto const expected = FloatListsCol{0, 1, 2, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + { + auto const lists = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = FloatListsCol{neg_Inf, 0, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to -// check for validity +// check for validity. // We will not store NaN in the results_expected variable (an unordered_set) because we can't check // for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare // with the number of NaNs in the output. @@ -77,14 +81,14 @@ static void test_floating_point(std::vector const& h_input, cudf::nan_equality nans_equal) { // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list + // entry per list. std::size_t const num_NaNs = nans_equal == cudf::nan_equality::ALL_EQUAL ? 
std::size_t{1} : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::lists_column_view{FloatListsCol(h_input.begin(), h_input.end())}, cudf::null_equality::EQUAL, nans_equal); auto const results_arr = @@ -125,130 +129,479 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - // Trivial cases - test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); - test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); - - // One list column - test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, - LIST_COL_STR{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, - LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); + // Trivial cases - empty input. + { + auto const lists = StrListsCol{{}}; + auto const expected = StrListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // No duplicate entry. 
+ { + auto const lists = StrListsCol{"this", "is", "a", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column, input is a strings column with given non-default null_equality and + // nans_equality parameters. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Multiple lists column. 
+ { + auto const lists = + StrListsCol{StrListsCol{"this", "is", "a", "no duplicate", "string"}, + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}, + StrListsCol{"this", "is", "is", "is", "a", "two duplicates", "string"}, + StrListsCol{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; + auto const expected = StrListsCol{StrListsCol{"a", "is", "no duplicate", "string", "this"}, + StrListsCol{"a", "is", "one duplicate", "string", "this"}, + StrListsCol{"a", "is", "string", "this", "two duplicates"}, + StrListsCol{"a", "is", "string", "this", "three duplicates"}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TEST_F(DropListDuplicatesTest, StringTestsWithNulls) { auto const null = std::string(""); - // One list column with null entries - test_once( - LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - LIST_COL_STR{{"a", "is", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + // One list column with null entries. 
+ { + auto const lists = StrListsCol{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = StrListsCol{{"a", "is", "string", "this", null}, null_at(4)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } // Multiple lists column with null lists and null entries - test_once( - LIST_COL_STR{ - {LIST_COL_STR{ - {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, - LIST_COL_STR{}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i <= 4; })}, - LIST_COL_STR{}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); + { + auto const lists = StrListsCol{ + {StrListsCol{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + StrListsCol{}, /* NULL */ + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = + StrListsCol{{StrListsCol{{"a", "is", "no duplicate", "string", "this", null}, null_at(5)}, + StrListsCol{}, /* NULL */ + StrListsCol{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } template struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { }; -#define LIST_COL cudf::test::lists_column_wrapper using TypesForTest = cudf::test::Concat; 
-TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); +TYPED_TEST_SUITE(DropListDuplicatesTypedTest, TypesForTest); TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { - // Lists of nested types are not supported + using ListsCol = cudf::test::lists_column_wrapper; + + // Nested types (except struct) are not supported. EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::lists::drop_list_duplicates(cudf::lists_column_view{ListsCol{ListsCol{{1, 2}, {3}}}}), cudf::logic_error); } TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) { - // Empty input - test_once(LIST_COL{{}}, LIST_COL{{}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Empty input. + { + auto const lists = ListsCol{{}}; + auto const expected = ListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Trivial cases - test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); + // Trivial cases. + { + auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; + auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Multiple empty lists - test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // Multiple empty lists. 
+ { + auto const lists = ListsCol{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) { - // Adjacent lists containing the same entries - test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - LIST_COL{{1}, {1, 2}, {2, 3}}); - - // Sliced list column - auto const list0 = - LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const list1 = cudf::slice(list0, {0, 5})[0]; - auto const list2 = cudf::slice(list0, {1, 5})[0]; - auto const list3 = cudf::slice(list0, {1, 3})[0]; - auto const list4 = cudf::slice(list0, {0, 3})[0]; - - test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); - test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Adjacent lists containing the same entries. + { + auto const lists = + ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; + auto const expected = ListsCol{{1}, {1, 2}, {2, 3}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Sliced list column. 
+ auto const lists_original = + ListsCol{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const lists1 = cudf::slice(lists_original, {0, 5})[0]; + auto const lists2 = cudf::slice(lists_original, {1, 5})[0]; + auto const lists3 = cudf::slice(lists_original, {1, 3})[0]; + auto const lists4 = cudf::slice(lists_original, {0, 3})[0]; + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_original}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists4}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { + using ListsCol = cudf::test::lists_column_wrapper; auto constexpr null = TypeParam{0}; - // null lists - test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - 
cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); - - // null entries are equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); - - // nulls entries are not equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, - cudf::null_equality::UNEQUAL); + // null lists. + { + auto const lists = ListsCol{ + {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; + auto const expected = + ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // null entries are equal. + { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null}, null_at(5)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // nulls entries are not equal. 
+ { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + nulls_at({5, 6, 7, 8, 9})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}, + cudf::null_equality::UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const get_structs = [] { + auto child1 = ColWrapper{ + 1, 1, 1, 1, 1, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, 2, 2, // list2 + 2, 2, 2, 2, 3, 2, 3, 3 // list3 + }; + auto child2 = StringsCol{ + // begin list1 + "Banana", + "Mango", + "Apple", + "Cherry", + "Kiwi", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "XYZ", + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + auto const get_structs_expected = [] { + auto child1 = ColWrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; + auto child2 = StringsCol{ + // begin list1 + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + // Test full columns. 
+ { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level + auto constexpr null = int32_t{0}; // nulls at the children columns level + + auto const get_structs = [] { + auto child1 = ColWrapper{{ + 1, 1, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Mango", + "Apple", + "XXX", /*NULL*/ + "XXX", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + return StructsCol{{child1, 
child2}, nulls_at({3, 4})}; + }; + + auto const get_structs_expected = [] { + auto child1 = + ColWrapper{{1, 1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, + nulls_at({4, 5, 11, 18, 19})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Cherry", + "Kiwi", + "Mango", + "Apple", + "XXX" /*NULL*/, // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", + "" /*NULL*/, // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÉÉÉÉÉ" // end list3 + }, + nulls_at({5, 11, 17})}; + return StructsCol{{child1, child2}, null_at(5)}; + }; + + // Test full columns. + { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) +{ + auto const h_child = std::vector{ + 0, -1, 1, 0, 2, 0, 1, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + + auto const get_structs = [&] { + // Two children are just identical. 
+ auto child1 = FloatsCol(h_child.begin(), h_child.end()); + auto child2 = FloatsCol(h_child.begin(), h_child.end()); + return StructsCol{{child1, child2}}; + }; + + // The first list does not have any NaN or -NaN, while the second list has both. + // `drop_list_duplicates` is expected to operate properly on this second list. + auto const lists_original = + cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {}); + auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0]; // test on the second list + + // Contain expected values excluding NaN. + auto const results_children_expected = std::unordered_set{0, 1, 2}; + + // Test for cudf::nan_equality::UNEQUAL. + { + auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = + std::count_if(h_child.begin(), h_child.end(), [](auto x) { return std::isnan(x); }); + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } + + // Test for cudf::nan_equality::ALL_EQUAL. 
+ { + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists2}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = 1; + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } } diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu new file mode 100644 index 00000000000..39f7cc593d6 --- /dev/null +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -0,0 +1,435 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +using namespace cudf; + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +std::unique_ptr arrow_percentile_approx(column_view const& _values, + int delta, + std::vector const& 
percentages) +{ + // sort the incoming values using the same settings that groupby does. + // this is a little weak because null_order::AFTER is hardcoded internally to groupby. + table_view t({_values}); + auto sorted_t = cudf::sort(t, {}, {null_order::AFTER}); + auto sorted_values = sorted_t->get_column(0).view(); + + std::vector h_values(sorted_values.size()); + cudaMemcpy(h_values.data(), + sorted_values.data(), + sizeof(double) * sorted_values.size(), + cudaMemcpyDeviceToHost); + std::vector h_validity(sorted_values.size()); + if (sorted_values.null_mask() != nullptr) { + auto validity = cudf::mask_to_bools(sorted_values.null_mask(), 0, sorted_values.size()); + cudaMemcpy(h_validity.data(), + (validity->view().data()), + sizeof(char) * sorted_values.size(), + cudaMemcpyDeviceToHost); + } + + // generate the tdigest + arrow::internal::TDigest atd(delta, sorted_values.size() * 2); + for (size_t idx = 0; idx < h_values.size(); idx++) { + if (sorted_values.null_mask() == nullptr || h_validity[idx]) { atd.Add(h_values[idx]); } + } + + // generate the percentiles and stuff them into a list column + std::vector h_result; + h_result.reserve(percentages.size()); + std::transform( + percentages.begin(), percentages.end(), std::back_inserter(h_result), [&atd](double p) { + return atd.Quantile(p); + }); + cudf::test::fixed_width_column_wrapper result(h_result.begin(), h_result.end()); + cudf::test::fixed_width_column_wrapper offsets{ + 0, static_cast(percentages.size())}; + return cudf::make_lists_column(1, offsets.release(), result.release(), 0, {}); +} + +struct percentile_approx_dispatch { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + // arrow implementation. 
+ auto expected = [&]() { + // we're explicitly casting back to doubles here but this is ok because that is + // exactly what happens inside of the cudf implementation as values are processed as well. so + // this should not affect results. + auto as_doubles = cudf::cast(values, data_type{type_id::FLOAT64}); + return arrow_percentile_approx(*as_doubles, delta, percentages); + }(); + + // gpu + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + cudf::test::expect_columns_equivalent( + *expected, *result, cudf::test::debug_output_level::FIRST_ERROR, ulps); + + return result; + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + CUDF_FAIL("Invalid input type for percentile_approx test"); + } +}; + +void percentile_approx_test(column_view const& _keys, + column_view const& _values, + int delta, + std::vector const& percentages, + size_type ulps) +{ + // first pass: validate the actual percentages we get per group. + + // produce the groups + cudf::table_view k({_keys}); + cudf::groupby::groupby pass1_gb(k); + cudf::table_view v({_values}); + auto groups = pass1_gb.get_groups(v); + // slice it all up so we have keys/columns for everything. 
+ std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + std::vector> parts; + for (size_t idx = 0; idx < values.size(); idx++) { + // do any casting of the input + parts.push_back(cudf::type_dispatcher(values[idx].type(), + percentile_approx_dispatch{}, + keys[idx], + values[idx], + delta, + percentages, + ulps)); + } + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& c) { return c->view(); }); + auto expected = cudf::concatenate(part_views); + + // second pass. run the percentile_approx with all the keys in one pass and make sure we get the + // same results as the concatenated by-key results above + + cudf::groupby::groupby gb(k); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({_values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *result); +} + +void simple_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + 
std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +struct group_index { + __device__ int operator()(int i) { return i / 150000; } +}; + +void grouped_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + keys->mutable_view().template begin(), + group_index{}); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +std::pair make_null_mask(column_view const& col) +{ + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [] __device__(size_type i) { return i % 2 == 0; }); +} + +void simple_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +void 
grouped_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + keys->mutable_view().template begin(), + group_index{}); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +template +data_type get_appropriate_type() +{ + if constexpr (cudf::is_fixed_point()) { return data_type{cudf::type_to_id(), -7}; } + return data_type{cudf::type_to_id()}; +} + +using PercentileApproxTypes = + cudf::test::Concat; + +template +struct PercentileApproxInputTypesTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(PercentileApproxInputTypesTest, PercentileApproxTypes); + +TYPED_TEST(PercentileApproxInputTypesTest, Simple) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 4}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, Grouped) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 10}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, SimpleWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, 
cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, GroupedWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 6}}); +} + +struct PercentileApproxTest : public cudf::test::BaseFixture { +}; + +TEST_F(PercentileApproxTest, EmptyInput) +{ + auto empty_ = cudf::detail::tdigest::make_empty_tdigest_column(); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input); + + structs_column_view scv(*empty); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + std::vector nulls{0, 0, 0}; + auto expected = + cudf::make_lists_column(3, + offsets.release(), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + 3, + cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(PercentileApproxTest, EmptyPercentiles) +{ + auto const delta = 1000; + + cudf::test::fixed_width_column_wrapper values{0, 1, 2, 3, 4, 5}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 1, 1, 1}; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto tdigest_column = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper percentiles{}; + + structs_column_view scv(*tdigest_column.second[0].results[0]); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + auto expected = 
cudf::make_lists_column(2, + offsets.release(), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + 2, + cudf::detail::create_null_mask(2, mask_state::ALL_NULL)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(PercentileApproxTest, NullPercentiles) +{ + auto const delta = 1000; + + cudf::test::fixed_width_column_wrapper values{1, 1, 2, 3, 4, 5, 6, 7, 8}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 1, 1, 1, 1}; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto tdigest_column = gb.aggregate(requests); + + structs_column_view scv(*tdigest_column.second[0].results[0]); + + cudf::test::fixed_width_column_wrapper npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}}; + auto result = cudf::percentile_approx(scv, npercentiles); + + std::vector valids{0, 0, 1, 1}; + cudf::test::lists_column_wrapper expected{{{99, 99, 4, 4}, valids.begin()}, + {{99, 99, 8, 8}, valids.begin()}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); +} \ No newline at end of file diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index c26059ee09b..5631c910753 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -2168,34 +2168,45 @@ TEST_F(CollectSetTest, BasicRollingWindowWithNaNs) result_with_nan_equal->view()); } -TEST_F(CollectSetTest, ListTypeRollingWindow) +TEST_F(CollectSetTest, StructTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; - - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; + auto col2 = 
strings_column_wrapper{"a", "b", "c", "d", "e"}; + auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; - EXPECT_THROW(rolling_window(input_column, - prev_column, - foll_column, - 1, - *make_collect_set_aggregation()), - cudf::logic_error); + auto const expected = [] { + auto child1 = fixed_width_column_wrapper{1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5}; + auto child2 = + strings_column_wrapper{"a", "b", "a", "b", "c", "b", "c", "d", "c", "d", "e", "d", "e"}; + return cudf::make_lists_column( + 5, + fixed_width_column_wrapper{0, 2, 5, 8, 11, 13}.release(), + structs_column_wrapper{{child1, child2}}.release(), + 0, + {}); + }(); + auto const result = rolling_window(input_column, + prev_column, + foll_column, + 1, + *make_collect_set_aggregation()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected->view(), result->view()); } -TEST_F(CollectSetTest, StructTypeRollingWindow) +TEST_F(CollectSetTest, ListTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; - auto col2 = strings_column_wrapper{"a", "b", "c", "d", "e"}; - auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; + + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; EXPECT_THROW(rolling_window(input_column, prev_column, diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index f3002bc4b1a..0f10d6efe4a 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ 
-323,7 +323,8 @@ class corresponding_rows_unequal { corresponding_rows_unequal(table_device_view d_lhs, table_device_view d_rhs, column_device_view lhs_row_indices_, - column_device_view rhs_row_indices_) + column_device_view rhs_row_indices_, + size_type /*fp_ulps*/) : comp(d_lhs, d_rhs), lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_) { } @@ -347,16 +348,20 @@ class corresponding_rows_not_equivalent { column_device_view lhs_row_indices; column_device_view rhs_row_indices; + size_type const fp_ulps; + public: corresponding_rows_not_equivalent(table_device_view d_lhs, table_device_view d_rhs, column_device_view lhs_row_indices_, - column_device_view rhs_row_indices_) + column_device_view rhs_row_indices_, + size_type fp_ulps_) : d_lhs(d_lhs), d_rhs(d_rhs), comp(d_lhs, d_rhs), lhs_row_indices(lhs_row_indices_), - rhs_row_indices(rhs_row_indices_) + rhs_row_indices(rhs_row_indices_), + fp_ulps(fp_ulps_) { CUDF_EXPECTS(d_lhs.num_columns() == 1 and d_rhs.num_columns() == 1, "Unsupported number of columns"); @@ -368,7 +373,8 @@ class corresponding_rows_not_equivalent { column_device_view const& lhs, column_device_view const& rhs, size_type lhs_index, - size_type rhs_index) + size_type rhs_index, + size_type fp_ulps) { if (lhs.is_valid(lhs_index) and rhs.is_valid(rhs_index)) { T const x = lhs.element(lhs_index); @@ -380,10 +386,9 @@ class corresponding_rows_not_equivalent { } else if (std::isnan(x) || std::isnan(y)) { return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false } else { - constexpr int ulp = 4; // ulp = unit of least precision, value taken from google test T const abs_x_minus_y = std::abs(x - y); return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * ulp; + abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already @@ -409,8 +414,13 @@ class 
corresponding_rows_not_equivalent { if (not comp(lhs_index, rhs_index)) { auto lhs_col = this->d_lhs.column(0); auto rhs_col = this->d_rhs.column(0); - return type_dispatcher( - lhs_col.type(), typed_element_not_equivalent{}, lhs_col, rhs_col, lhs_index, rhs_index); + return type_dispatcher(lhs_col.type(), + typed_element_not_equivalent{}, + lhs_col, + rhs_col, + lhs_index, + rhs_index, + fp_ulps); } return false; } @@ -468,6 +478,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth) { auto d_lhs = cudf::table_device_view::create(table_view{{lhs}}); @@ -483,12 +494,12 @@ struct column_comparator_impl { auto differences = rmm::device_uvector( lhs.size(), rmm::cuda_stream_default); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); - auto diff_iter = - thrust::copy_if(rmm::exec_policy(), - input_iter, - input_iter + lhs_row_indices.size(), - differences.begin(), - ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices)); + auto diff_iter = thrust::copy_if( + rmm::exec_policy(), + input_iter, + input_iter + lhs_row_indices.size(), + differences.begin(), + ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps)); differences.resize(thrust::distance(differences.begin(), diff_iter), rmm::cuda_stream_default); // shrink back down @@ -519,6 +530,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth) { lists_column_view lhs_l(lhs); @@ -638,6 +650,7 @@ struct column_comparator_impl { *lhs_child_indices, *rhs_child_indices, verbosity, + fp_ulps, depth + 1); } @@ -652,6 +665,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth) { structs_column_view 
l_scv(lhs); @@ -667,6 +681,7 @@ struct column_comparator_impl { lhs_row_indices, rhs_row_indices, verbosity, + fp_ulps, depth + 1)) { return false; } @@ -683,6 +698,7 @@ struct column_comparator { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth = 0) { CUDF_EXPECTS(lhs_row_indices.size() == rhs_row_indices.size(), @@ -701,7 +717,7 @@ struct column_comparator { // compare values column_comparator_impl comparator{}; - return comparator(lhs, rhs, lhs_row_indices, rhs_row_indices, verbosity, depth); + return comparator(lhs, rhs, lhs_row_indices, rhs_row_indices, verbosity, fp_ulps, depth); } }; @@ -750,8 +766,14 @@ bool expect_columns_equal(cudf::column_view const& lhs, debug_output_level verbosity) { auto indices = generate_all_row_indices(lhs.size()); - return cudf::type_dispatcher( - lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity); + return cudf::type_dispatcher(lhs.type(), + column_comparator{}, + lhs, + rhs, + *indices, + *indices, + verbosity, + cudf::test::default_ulp); } /** @@ -759,11 +781,12 @@ bool expect_columns_equal(cudf::column_view const& lhs, */ bool expect_columns_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs, - debug_output_level verbosity) + debug_output_level verbosity, + size_type fp_ulps) { auto indices = generate_all_row_indices(lhs.size()); return cudf::type_dispatcher( - lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity); + lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity, fp_ulps); } /** diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c5f1233d022..4a7d115ae3b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '21.10' +version = '21.12' # The full version, including alpha/beta/rc tags. 
-release = '21.10.00' +release = '21.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/java/ci/README.md b/java/ci/README.md index ef3a329f7f6..5432dc8d0f1 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.12 ``` ### Build cuDF jar with devtoolset @@ -47,5 +47,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-21.12.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 1b4a31116d4..db79f94009b 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.10.0-SNAPSHOT + 21.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java index 238e0b61fd9..85443c3ae0f 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,8 +20,6 @@ public class ORCWriterOptions extends CompressedMetadataWriterOptions { - public static ORCWriterOptions DEFAULT = new ORCWriterOptions(new Builder()); - private ORCWriterOptions(Builder builder) { super(builder); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 2744728fb44..0af02d1c926 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1147,7 +1147,11 @@ public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferCo */ @Deprecated public void writeORC(File outputFile) { - writeORC(ORCWriterOptions.DEFAULT, outputFile); + // Need to specify the number of columns but leave all column names undefined + String[] names = new String[getNumberOfColumns()]; + Arrays.fill(names, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(names).build(); + writeORC(opts, outputFile); } /** @@ -1157,6 +1161,7 @@ public void writeORC(File outputFile) { */ @Deprecated public void writeORC(ORCWriterOptions options, File outputFile) { + assert options.getColumnNames().length == getNumberOfColumns() : "must specify names for all columns"; try (TableWriter writer = Table.writeORCChunked(options, outputFile)) { writer.write(this); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index fc74ee2a3a9..2c95c6eebac 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -29,7 +29,7 @@ if(DEFINED GPU_ARCHS) endif() 
rapids_cuda_init_architectures(CUDF_JNI) -project(CUDF_JNI VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF_JNI VERSION 21.12.00 LANGUAGES C CXX CUDA) ################################################################################################### # - build options --------------------------------------------------------------------------------- diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 96dd02e5f2a..ee75112a2ed 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -736,6 +736,29 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam } } +cudf::io::table_input_metadata createORCTableInputMetadata(JNIEnv *env, + jobjectArray const &j_col_names, + jbooleanArray const &j_col_nullability, + jobjectArray const &j_metadata_keys, + jobjectArray const &j_metadata_values) { + cudf::jni::native_jstringArray const col_names(env, j_col_names); + cudf::jni::native_jbooleanArray const col_nullability(env, j_col_nullability); + cudf::jni::native_jstringArray const meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray const meta_values(env, j_metadata_values); + + std::vector const cpp_names = col_names.as_cpp_vector(); + std::size_t const num_columns = cpp_names.size(); + cudf::io::table_input_metadata metadata; + metadata.column_metadata.resize(cpp_names.size()); + for (std::size_t i = 0; i < num_columns; i++) { + metadata.column_metadata[i].set_name(cpp_names[i]).set_nullability(col_nullability[i]); + } + for (int i = 0; i < meta_keys.size(); ++i) { + metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); + } + return metadata; +} + // Check that window parameters are valid. 
bool valid_window_parameters(native_jintArray const &values, native_jpointerArray const &ops, @@ -1500,19 +1523,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); - - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1542,20 +1554,10 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jstring output_path(env, j_output_path); - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = 
cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) @@ -1577,7 +1579,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_orig = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_orig); cudf::jni::native_orc_writer_handle *state = reinterpret_cast(j_state); @@ -1587,7 +1590,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index ad791747713..683651799e7 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -183,6 +183,10 @@ std::unique_ptr map_lookup(column_view const &map_column, string_scalar // Defensive checks. map_input_check(map_column, stream); + if (map_column.size() == 0) { + return make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } + lists_column_view lcv{map_column}; column_view structs_column = lcv.get_sliced_child(stream); // Two-pass plan: construct gather map, and then gather() on structs_column.child(1). Plan A. 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0643776a546..d1af0d9a2f6 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5412,6 +5412,17 @@ void testGetMapValue() { } } + @Test + void testGetMapValueEmptyInput() { + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, structType)); + ColumnVector res = cv.getMapValue(Scalar.fromString("a")); + ColumnVector expected = ColumnVector.fromStrings()) { + assertColumnsAreEqual(expected, res); + } + } + @Test void testGetMapKeyExistence() { List list1 = Arrays.asList(new HostColumnVector.StructData("a", "b")); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index cd1e433d07b..0e7ac15a79e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6669,8 +6669,9 @@ void testParquetWriteMap() throws IOException { HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), new HostColumnVector.BasicType(true, DType.STRING))); - try (Table t0 = new Table(ColumnVector.fromLists(new HostColumnVector.ListType(true, - structType), list1, list2, list3))) { + try (ColumnVector listColumn = ColumnVector.fromLists(new HostColumnVector.ListType(true, + structType), list1, list2, list3); + Table t0 = new Table(listColumn)) { try (TableWriter writer = Table.writeParquetChunked(options, f)) { writer.write(t0); } @@ -6875,7 +6876,10 @@ void testArrowIPCWriteToBufferChunked() { void testORCWriteToBufferChunked() { try (Table table0 = 
getExpectedFileTable(); MyBufferConsumer consumer = new MyBufferConsumer()) { - try (TableWriter writer = Table.writeORCChunked(ORCWriterOptions.DEFAULT, consumer)) { + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(colNames).build(); + try (TableWriter writer = Table.writeORCChunked(opts, consumer)) { writer.write(table0); writer.write(table0); writer.write(table0); @@ -6923,7 +6927,13 @@ void testORCWriteToFileWithColNames() throws IOException { void testORCWriteToFileUncompressed() throws IOException { File tempFileUncompressed = File.createTempFile("test-uncompressed", ".orc"); try (Table table0 = getExpectedFileTable()) { - table0.writeORC(ORCWriterOptions.builder().withCompressionType(CompressionType.NONE).build(), tempFileUncompressed.getAbsoluteFile()); + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder() + .withColumnNames(colNames) + .withCompressionType(CompressionType.NONE) + .build(); + table0.writeORC(opts, tempFileUncompressed.getAbsoluteFile()); try (Table table2 = Table.readORC(tempFileUncompressed.getAbsoluteFile())) { assertTablesAreEqual(table0, table2); } diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index d89af43028d..3036b000c5b 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -70,13 +70,13 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata *get_metadata() except+ + const cudf_io_types.table_input_metadata *get_metadata() except+ # setter void set_compression(cudf_io_types.compression_type comp) except+ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) 
except+ - void set_metadata(cudf_io_types.table_metadata* meta) except+ + void set_metadata(cudf_io_types.table_input_metadata* meta) except+ @staticmethod orc_writer_options_builder builder( @@ -94,7 +94,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ orc_writer_options build() except+ @@ -107,7 +107,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata_with_nullability *get_metadata( + const cudf_io_types.table_input_metadata *get_metadata( ) except+ # setter @@ -115,7 +115,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata( - cudf_io_types.table_metadata_with_nullability* meta + cudf_io_types.table_input_metadata* meta ) except+ @staticmethod @@ -133,7 +133,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ chunked_orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ chunked_orc_writer_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index e2053f8ce4f..81ca7e5836b 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -66,36 +66,17 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + - cdef cppclass column_in_metadata: - column_in_metadata& set_name(const string& name) - column_in_metadata& set_nullability(bool nullable) - column_in_metadata& set_list_column_as_map() - column_in_metadata& 
set_int96_timestamps(bool req) - column_in_metadata& set_decimal_precision(uint8_t precision) - column_in_metadata& child(size_type i) - - cdef cppclass table_input_metadata: - table_input_metadata() except + - table_input_metadata(const cudf_table_view.table_view& table) except + - table_input_metadata( - const cudf_table_view.table_view& table, - map[string, string] user_data - ) except + - - vector[column_in_metadata] column_metadata - map[string, string] user_data - cdef cppclass parquet_writer_options: parquet_writer_options() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + - const table_input_metadata get_metadata() except + + const cudf_io_types.table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -121,7 +102,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_ ) except + parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf @@ -147,11 +128,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info get_sink() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - table_input_metadata* get_metadata( + cudf_io_types.table_input_metadata* get_metadata( ) except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -171,7 +152,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: 
cudf_io_types.sink_info sink_, ) except + chunked_parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 9fb0e470950..4817cba9d74 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -8,7 +9,9 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile +cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.types cimport size_type cdef extern from "cudf/io/types.hpp" \ @@ -52,15 +55,29 @@ cdef extern from "cudf/io/types.hpp" \ map[string, string] user_data vector[column_name_info] schema_info - cdef cppclass table_metadata_with_nullability(table_metadata): - table_metadata_with_nullability() except + - - vector[bool] nullability - cdef cppclass table_with_metadata: unique_ptr[table] tbl table_metadata metadata + cdef cppclass column_in_metadata: + column_in_metadata& set_name(const string& name) + column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_list_column_as_map() + column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_decimal_precision(uint8_t precision) + column_in_metadata& child(size_type i) + + cdef cppclass table_input_metadata: + table_input_metadata() except + + table_input_metadata(const cudf_table_view.table_view& table) except + + table_input_metadata( + const cudf_table_view.table_view& table, + map[string, string] user_data + ) except + + + vector[column_in_metadata] column_metadata + 
map[string, string] user_data + cdef cppclass host_buffer: const char* data size_t size diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index bc4f4aee9cd..03d163b7638 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -23,13 +23,13 @@ from cudf._lib.cpp.io.orc_metadata cimport ( read_raw_orc_statistics as libcudf_read_raw_orc_statistics, ) from cudf._lib.cpp.io.types cimport ( + column_in_metadata, column_name_info, compression_type, data_sink, sink_info, source_info, - table_metadata, - table_metadata_with_nullability, + table_input_metadata, table_with_metadata, ) from cudf._lib.cpp.table.table_view cimport table_view @@ -50,7 +50,8 @@ import numpy as np from cudf._lib.utils cimport data_from_unique_ptr, get_column_names -from cudf._lib.utils import generate_pandas_metadata +from cudf._lib.utils import _index_level_name, generate_pandas_metadata +from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -144,19 +145,35 @@ cpdef write_orc(Table table, cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) - cdef table_metadata metadata_ = table_metadata() cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - - metadata_.column_names.reserve(len(table._column_names)) - - for col_name in table._column_names: - metadata_.column_names.push_back(str.encode(col_name)) + cdef unique_ptr[table_input_metadata] tbl_meta + + if not isinstance(table._index, cudf.RangeIndex): + tv = table_view_from_table(table) + tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + tbl_meta.get().column_metadata[level].set_name( + str.encode( + _index_level_name(idx_name, level, table._column_names) + ) + ) + num_index_cols_meta = len(table._index.names) + else: + tv = table_view_from_table(table, ignore_index=True) + tbl_meta = 
make_unique[table_input_metadata](tv) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, tbl_meta.get().column_metadata[i] + ) cdef orc_writer_options c_orc_writer_options = move( orc_writer_options.builder( sink_info_c, table_view_from_table(table, ignore_index=True) - ).metadata(&metadata_) + ).metadata(tbl_meta.get()) .compression(compression_) .enable_statistics( (True if enable_statistics else False)) .build() @@ -231,6 +248,7 @@ cdef class ORCWriter: cdef bool enable_stats cdef compression_type comp_type cdef object index + cdef unique_ptr[table_input_metadata] tbl_meta def __cinit__(self, object path, object index=None, object compression=None, bool enable_statistics=True): @@ -268,20 +286,46 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef unique_ptr[table_metadata_with_nullability] tbl_meta - tbl_meta = make_unique[table_metadata_with_nullability]() + cdef table_view tv # Set the table_metadata - tbl_meta.get().column_names = get_column_names(table, self.index) + num_index_cols_meta = 0 + self.tbl_meta = make_unique[table_input_metadata]( + table_view_from_table(table, ignore_index=True) + ) + if self.index is not False: + if isinstance(table._index, cudf.core.multiindex.MultiIndex): + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + self.tbl_meta.get().column_metadata[level].set_name( + (str.encode(idx_name)) + ) + num_index_cols_meta = len(table._index.names) + else: + if table._index.name is not None: + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + self.tbl_meta.get().column_metadata[0].set_name( + str.encode(table._index.name) + ) + num_index_cols_meta = 1 + + for i, name in 
enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, self.tbl_meta.get().column_metadata[i] + ) + pandas_metadata = generate_pandas_metadata(table, self.index) - tbl_meta.get().user_data[str.encode("pandas")] = \ + self.tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) cdef chunked_orc_writer_options args with nogil: args = move( chunked_orc_writer_options.builder(self.sink) - .metadata(tbl_meta.get()) + .metadata(self.tbl_meta.get()) .compression(self.comp_type) .enable_statistics(self.enable_stats) .build() @@ -289,3 +333,15 @@ cdef class ORCWriter: self.writer.reset(new orc_chunked_writer(args)) self.initialized = True + +cdef _set_col_children_names(Column col, column_in_metadata& col_meta): + if is_struct_dtype(col): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name.encode()) + _set_col_children_names(child_col, col_meta.child(i)) + elif is_list_dtype(col): + _set_col_children_names(col.children[1], col_meta.child(1)) + else: + return diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d9017c7d6f8..70bdb6e2e60 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -45,15 +45,14 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.io.parquet cimport ( chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - column_in_metadata, merge_rowgroup_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_reader_options, parquet_writer_options, read_parquet as parquet_reader, - table_input_metadata, write_parquet as parquet_writer, ) +from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from 
cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index dd12c92a15a..810cdd51df5 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import cudf @@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index): ): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(col.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if col.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(col.dtype)) # Indexes if index is not False: @@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index): elif is_list_dtype(idx): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(idx.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if idx.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + index_levels.append(idx) col_names.append(name) index_descriptors.append(descr) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1fe59d3dfd6..b2f3274faab 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -44,6 +44,11 @@ def _values(self) -> ColumnBase: def copy(self, deep: bool = True) -> BaseIndex: raise NotImplementedError + @property + def size(self): + # The size of an index is always its length irrespective of dimension.
+ return len(self) + @property def values(self): return self._values.values @@ -162,6 +167,38 @@ def _clean_nulls_from_index(self): else: return self + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. + + Returns + ------- + bool + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + """ + raise NotImplementedError + + @property + def is_monotonic_decreasing(self): + """Return boolean if values in the object are monotonically decreasing. + + Returns + ------- + bool + """ + raise NotImplementedError + @property def nlevels(self): """ diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8f18d83eb31..de278db919d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2089,10 +2089,7 @@ def as_column( data ) np_type = np.dtype(dtype).type - if np_type == np.bool_: - pa_type = pa.bool_() - else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) + pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c14cbd11714..c59081e4b59 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -352,7 +352,9 @@ def cat(self, others=None, sep=None, na_rep=None): if len(data) == 1 and data.null_count == 1: data = [""] - out = self._return_or_inplace(data) + # We only want to keep the index if we are adding something to each + # row, not if we are joining all the rows into a single string. 
+ out = self._return_or_inplace(data, retain_index=others is not None) if len(out) == 1 and others is None: if isinstance(out, cudf.Series): out = out.iloc[0] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1143f85a4e6..bdbd94ef754 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,12 +594,6 @@ def dtypes(self): data=[x.dtype for x in self._data.columns], index=self._data.names, ) - @property - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame. - """ - return self._num_rows, self._num_columns - @property def ndim(self): """Dimension of the data. DataFrame ndim is always 2. @@ -938,12 +932,6 @@ def memory_usage(self, index=True, deep=False): sizes.append(self.index.memory_usage(deep=deep)) return Series(sizes, index=ind) - def __len__(self): - """ - Returns the number of rows - """ - return len(self.index) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): import cudf @@ -4742,7 +4730,9 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) - def apply(self, func, axis=1): + def apply( + self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + ): """ Apply a function along an axis of the DataFrame. @@ -4756,12 +4746,17 @@ def apply(self, func, axis=1): ---------- func : function Function to apply to each row. - axis : {0 or 'index', 1 or 'columns'}, default 0 Axis along which the function is applied: * 0 or 'index': apply function to each column. Note: axis=0 is not yet supported. * 1 or 'columns': apply function to each row. 
+ raw: bool, default False + Not yet supported + result_type: {'expand', 'reduce', 'broadcast', None}, default None + Not yet supported + args: tuple + Not yet supported Examples -------- @@ -4910,6 +4905,12 @@ def apply(self, func, axis=1): raise ValueError( "DataFrame.apply currently only supports row wise ops" ) + if raw: + raise ValueError("The `raw` kwarg is not yet supported.") + if result_type is not None: + raise ValueError("The `result_type` kwarg is not yet supported.") + if args or kwargs: + raise ValueError("args and kwargs are not yet supported.") return cudf.Series(func(self)) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 09a6df67da5..28080cbc4c1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -166,6 +166,11 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + @property def _is_homogeneous(self): # make sure that the dataframe has columns @@ -4547,6 +4552,12 @@ def to_string(self): def __str__(self): return self.to_string() + def __deepcopy__(self, memo): + return self.copy(deep=True) + + def __copy__(self): + return self.copy(deep=False) + def head(self, n=5): """ Return the first `n` rows. @@ -4815,9 +4826,6 @@ def __iter__(self): """ cudf.utils.utils.raise_iteration_error(obj=self) - def __len__(self): - return len(self._column) - def __bool__(self): raise TypeError( f"The truth value of a {type(self)} is ambiguous. Use " @@ -5005,7 +5013,7 @@ def is_unique(self): @property def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. This property is an alias for :attr:`is_monotonic_increasing`. 
@@ -5017,7 +5025,7 @@ def is_monotonic(self): @property def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. Returns ------- @@ -5027,7 +5035,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonic_decreasing. + """Return boolean if values in the object are monotonically decreasing. Returns ------- diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3ac30143463..6414d4a7e84 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -349,17 +349,6 @@ def dtype(self): """ return cudf.dtype(np.int64) - @property - def is_contiguous(self): - """ - Returns if the index is contiguous. - """ - return self._step == 1 - - @property - def size(self): - return len(self) - def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -417,18 +406,10 @@ def is_unique(self): @property def is_monotonic_increasing(self): - """ - Return if the index is monotonic increasing - (only equal or increasing) values. - """ return self._step > 0 or len(self) <= 1 @property def is_monotonic_decreasing(self): - """ - Return if the index is monotonic decreasing - (only equal or decreasing) values. 
- """ return self._step < 0 or len(self) <= 1 def get_slice_bound(self, label, side, kind=None): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 84566b4627c..bc97c72db88 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,6 +7,7 @@ import pickle import warnings from collections.abc import Sequence +from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -17,6 +18,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.types import is_integer, is_list_like from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame @@ -33,8 +35,6 @@ class MultiIndex(Frame, BaseIndex): ---------- levels : sequence of arrays The unique labels for each level. - labels : sequence of arrays - labels is depreciated, please use levels codes: sequence of arrays Integers for each level designating which label at each location. 
sortorder : optional int @@ -68,7 +68,6 @@ def __init__( levels=None, codes=None, sortorder=None, - labels=None, names=None, dtype=None, copy=False, @@ -78,13 +77,16 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") - if name is not None: raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - - super().__init__() + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame) and not isinstance( + codes[0], (Sequence, np.ndarray) + ): + raise TypeError("Codes is not a Sequence of sequences") if copy: if isinstance(codes, cudf.DataFrame): @@ -92,58 +94,57 @@ def __init__( if len(levels) > 0 and isinstance(levels[0], cudf.Series): levels = [level.copy(deep=True) for level in levels] - self._name = None - - if labels: - warnings.warn( - "the 'labels' keyword is deprecated, use 'codes' " "instead", - FutureWarning, - ) - if labels and not codes: - codes = labels - - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame): + if len(levels) == len(codes): + codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) + else: + raise ValueError( + "MultiIndex has unequal number of levels and " + "codes and is inconsistent!" + ) - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") + levels = [cudf.Series(level) for level in levels] - if isinstance(codes, cudf.DataFrame): - self._codes = codes - elif len(levels) == len(codes): - self._codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: + if len(levels) != len(codes.columns): raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" 
) + if len(set(c.size for c in codes._data.columns)) != 1: + raise ValueError( + "MultiIndex length of codes does not match " + "and is inconsistent!" + ) + for level, code in zip(levels, codes._data.columns): + if code.max() > len(level) - 1: + raise ValueError( + "MultiIndex code %d contains value %d larger " + "than maximum level size at this position" + ) - self._levels = [cudf.Series(level) for level in levels] - self._validate_levels_and_codes(self._levels, self._codes) - - source_data = cudf.DataFrame() - for i, n in enumerate(self._codes.columns): - codes = as_index(self._codes[n]._column) - if -1 in self._codes[n].values: + source_data = {} + for i, (column_name, col) in enumerate(codes._data.items()): + if -1 in col.values: level = cudf.DataFrame( - {n: [None] + list(self._levels[i])}, - index=range(-1, len(self._levels[i])), + {column_name: [None] + list(levels[i])}, + index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({n: self._levels[i]}) + level = cudf.DataFrame({column_name: levels[i]}) - source_data[n] = libcudf.copying.gather( - level, codes._data.columns[0] - )[0][n] + source_data[column_name] = libcudf.copying.gather(level, col)[0][ + column_name + ] - self._data = source_data._data + super().__init__(source_data) + self._levels = levels + self._codes = codes + self._name = None self.names = names @property @@ -153,7 +154,6 @@ def names(self): @names.setter def names(self, value): value = [None] * self.nlevels if value is None else value - assert len(value) == self.nlevels if len(value) == len(set(value)): # IMPORTANT: if the provided names are unique, @@ -216,25 +216,20 @@ def rename(self, names, inplace=False): return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): - if ( - level is not None - and not cudf.api.types.is_list_like(level) - and cudf.api.types.is_list_like(names) - ): + names_is_list_like = is_list_like(names) + level_is_list_like = is_list_like(level) + + if 
level is not None and not level_is_list_like and names_is_list_like: raise TypeError( "Names must be a string when a single level is provided." ) - if ( - not cudf.api.types.is_list_like(names) - and level is None - and self.nlevels > 1 - ): + if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") - if not cudf.api.types.is_list_like(names): + if not names_is_list_like: names = [names] - if level is not None and not cudf.api.types.is_list_like(level): + if level is not None and not level_is_list_like: level = [level] if level is not None and len(names) != len(level): @@ -269,10 +264,6 @@ def _from_data( obj.name = name return obj - @property - def shape(self): - return (self._data.nrows, len(self._data.names)) - @property def name(self): return self._name @@ -281,26 +272,6 @@ def name(self): def name(self, value): self._name = value - def _validate_levels_and_codes(self, levels, codes): - if len(levels) != len(codes.columns): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - code_length = len(codes[codes.columns[0]]) - for index, code in enumerate(codes): - if code_length != len(codes[code]): - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" - ) - for index, code in enumerate(codes): - if codes[code].max() > len(levels[index]) - 1: - raise ValueError( - "MultiIndex code %d contains value %d larger " - "than maximum level size at this position" - ) - def copy( self, names=None, @@ -396,36 +367,9 @@ def copy( return mi - def deepcopy(self): - return self.copy(deep=True) - - def __copy__(self): - return self.copy(deep=True) - def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. - - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. 
- """ cudf.utils.utils.raise_iteration_error(obj=self) - def _popn(self, n): - """ Returns a copy of this index without the left-most n values. - - Removes n names, labels, and codes in order to build a new index - for results. - """ - result = MultiIndex( - levels=self.levels[n:], - codes=self.codes.iloc[:, n:], - names=self.names[n:], - ) - if self.names is not None: - result.names = self.names[n:] - return result - def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -534,9 +478,7 @@ def codes(self): @property def nlevels(self): - """ - Integer number of levels in this MultiIndex. - """ + """Integer number of levels in this MultiIndex.""" return len(self._data) @property @@ -576,23 +518,13 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property - def labels(self): - warnings.warn( - "This feature is deprecated in pandas and will be" - "dropped from cudf as well.", - FutureWarning, - ) - return self.codes - @property def ndim(self): - """Dimension of the data. For MultiIndex ndim is always 2. - """ + """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 def _get_level_label(self, level): - """ Get name of the level. + """Get name of the level. 
Parameters ---------- @@ -658,8 +590,6 @@ def isin(self, values, level=None): >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) """ - from cudf.api.types import is_list_like - if level is None: if isinstance(values, cudf.MultiIndex): values_idx = values @@ -708,11 +638,6 @@ def isin(self, values, level=None): return result - def mask(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".mask is not supported for MultiIndex operations" - ) - def where(self, cond, other=None, inplace=False): raise NotImplementedError( ".where is not supported for MultiIndex operations" @@ -795,9 +720,7 @@ def _index_and_downcast(self, result, index, index_key): ) or isinstance(index_key[0], slice): index_key = index_key[0] - slice_access = False - if isinstance(index_key, slice): - slice_access = True + slice_access = isinstance(index_key, slice) out_index = cudf.DataFrame() # Select the last n-k columns where n is the number of columns and k is # the length of the indexing tuple @@ -805,30 +728,24 @@ def _index_and_downcast(self, result, index, index_key): if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) for k in range(size, len(index._data)): - if index.names is None: - name = k - else: - name = index.names[k] out_index.insert( - len(out_index.columns), - name, + out_index._num_columns, + k if index.names is None else index.names[k], cudf.Series._from_data({None: index._data.columns[k]}), ) - if len(result) == 1 and size == 0 and slice_access is False: + if len(result) == 1 and size == 0 and not slice_access: # If the final result is one row and it was not mapped into # directly, return a Series with a tuple as name. 
result = result.T result = result[result._data.names[0]] - elif len(result) == 0 and slice_access is False: + elif len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column - series_name = [] - for col in index._data.columns: - series_name.append(col[0]) - result = cudf.Series([]) - result.name = tuple(series_name) - elif len(out_index.columns) == 1: + result = cudf.Series._from_data( + {}, name=tuple((col[0] for col in index._data.columns)) + ) + elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. @@ -836,11 +753,18 @@ def _index_and_downcast(self, result, index, index_key): out_index = as_index(last_column) out_index.name = index.names[-1] index = out_index - elif len(out_index.columns) > 1: + elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) - index = index._popn(size) + if index.names is not None: + result.names = index.names[size:] + index = MultiIndex( + levels=index.levels[size:], + codes=index.codes.iloc[:, size:], + names=index.names[size:], + ) + if isinstance(index_key, tuple): result = result.set_index(index) return result @@ -896,24 +820,6 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - def _split_tuples(self, tuples): - if len(tuples) == 1: - return tuples, slice(None) - elif isinstance(tuples[0], tuple): - row = tuples[0] - if len(tuples) == 1: - column = slice(None) - else: - column = tuples[1] - return row, column - elif isinstance(tuples[0], slice): - return tuples - else: - return tuples, slice(None) - - def __len__(self): - return self._data.nrows - def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -924,24 +830,16 @@ def __eq__(self, other): 
return self.names == other.names return NotImplemented - @property - def is_contiguous(self): - return True - @property def size(self): - return len(self) + # The size of a MultiIndex is only dependent on the number of rows. + return self._num_rows def take(self, indices): - from collections.abc import Sequence - from numbers import Integral - if isinstance(indices, (Integral, Sequence)): indices = np.array(indices) - elif isinstance(indices, cudf.Series): - if indices.has_nulls: - raise ValueError("Column must have no nulls.") - indices = indices + elif isinstance(indices, cudf.Series) and indices.has_nulls: + raise ValueError("Column must have no nulls.") elif isinstance(indices, slice): start, stop, step = indices.indices(len(self)) indices = column.arange(start, stop, step) @@ -977,21 +875,17 @@ def deserialize(cls, header, frames): ) df = cudf.DataFrame.deserialize(header["source_data"], frames) obj = cls.from_frame(df) - obj._set_names(names) - return obj + return obj._set_names(names) columns = column.deserialize_columns(header["columns"], frames) - return cls._from_data(dict(zip(names, columns))) + obj = cls._from_data(dict(zip(range(0, len(names)), columns))) + return obj._set_names(names) def __getitem__(self, index): - match = self.take(index) - if isinstance(index, slice): - return match if isinstance(index, int): # we are indexing into a single row of the MultiIndex, # return that row as a tuple: - return match.to_pandas()[0] - else: - return match + return self.take(index).to_pandas()[0] + return self.take(index) def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -1003,7 +897,7 @@ def to_frame(self, index=True, name=None): if name is not None: if len(name) != len(self.levels): raise ValueError( - "'name' should have th same length as " + "'name' should have the same length as " "number of levels on index." 
) df.columns = name @@ -1095,8 +989,7 @@ def from_tuples(cls, tuples, names=None): """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) - result = cls.from_pandas(pdi) - return result + return cls.from_pandas(pdi) @property def values_host(self): @@ -1426,18 +1319,6 @@ def from_pandas(cls, multiindex, nan_as_null=None): def is_unique(self): return len(self) == len(self.unique()) - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """ @@ -1539,13 +1420,9 @@ def memory_usage(self, deep=False): return n def difference(self, other, sort=None): - temp_self = self - temp_other = other - if hasattr(self, "to_pandas"): - temp_self = self.to_pandas() if hasattr(other, "to_pandas"): - temp_other = self.to_pandas() - return temp_self.difference(temp_other, sort) + other = other.to_pandas() + return self.to_pandas().difference(other, sort) def append(self, other): """ @@ -1609,12 +1486,6 @@ def append(self, other): return MultiIndex._concat(to_concat) - def nan_to_num(*args, **kwargs): - return args[0] - - def array_equal(*args, **kwargs): - return args[0] == args[1] - def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1650,8 +1521,8 @@ def _level_index_from_level(self, level): try: return self.names.index(level) except ValueError: - if not pd.api.types.is_integer(level): - raise KeyError(f"Level {level} not found") from None + if not is_integer(level): + raise KeyError(f"Level {level} not found") if level < 0: level += self.nlevels if level >= self.nlevels: @@ -1661,9 +1532,6 @@ def _level_index_from_level(self, level): ) from None return level - def _level_name_from_level(self, level): - return self.names[self._level_index_from_level(level)] - def 
get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ad2c325eeb..594f9fc42d0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3637,6 +3637,12 @@ def label_encoding(self, cats, dtype=None, na_sentinel=-1): dtype: int8 """ + warnings.warn( + "Series.label_encoding is deprecated and will be removed in the future.\ + Consider using cuML's LabelEncoder instead", + DeprecationWarning, + ) + def _return_sentinel_series(): return Series( cudf.core.column.full( diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 73fbd50c824..3aa672223c9 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -41,37 +41,90 @@ def _parse_column_statistics(cs, column_statistics_blob): column_statistics["number_of_values"] = cs.numberOfValues if cs.HasField("hasNull"): column_statistics["has_null"] = cs.hasNull + if cs.HasField("intStatistics"): - column_statistics["minimum"] = cs.intStatistics.minimum - column_statistics["maximum"] = cs.intStatistics.maximum - column_statistics["sum"] = cs.intStatistics.sum + column_statistics["minimum"] = ( + cs.intStatistics.minimum + if cs.intStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.intStatistics.maximum + if cs.intStatistics.HasField("maximum") + else None + ) + column_statistics["sum"] = ( + cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None + ) + elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = cs.doubleStatistics.minimum - column_statistics["maximum"] = cs.doubleStatistics.maximum - column_statistics["sum"] = cs.doubleStatistics.sum + column_statistics["minimum"] = ( + cs.doubleStatistics.minimum + if cs.doubleStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.doubleStatistics.maximum + if 
cs.doubleStatistics.HasField("maximum") + else None + ) + column_statistics["sum"] = ( + cs.doubleStatistics.sum + if cs.doubleStatistics.HasField("sum") + else None + ) + elif cs.HasField("stringStatistics"): - column_statistics["minimum"] = cs.stringStatistics.minimum - column_statistics["maximum"] = cs.stringStatistics.maximum + column_statistics["minimum"] = ( + cs.stringStatistics.minimum + if cs.stringStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.stringStatistics.maximum + if cs.stringStatistics.HasField("maximum") + else None + ) column_statistics["sum"] = cs.stringStatistics.sum + elif cs.HasField("bucketStatistics"): column_statistics["true_count"] = cs.bucketStatistics.count[0] column_statistics["false_count"] = ( column_statistics["number_of_values"] - column_statistics["true_count"] ) + elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = cs.decimalStatistics.minimum - column_statistics["maximum"] = cs.decimalStatistics.maximum + column_statistics["minimum"] = ( + cs.decimalStatistics.minimum + if cs.decimalStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.decimalStatistics.maximum + if cs.decimalStatistics.HasField("maximum") + else None + ) column_statistics["sum"] = cs.decimalStatistics.sum + elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, + column_statistics["minimum"] = ( + datetime.datetime.fromtimestamp( + datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), + datetime.timezone.utc, + ) + if cs.dateStatistics.HasField("minimum") + else None ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, + column_statistics["maximum"] = ( + datetime.datetime.fromtimestamp( + 
datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), + datetime.timezone.utc, + ) + if cs.dateStatistics.HasField("maximum") + else None ) + elif cs.HasField("timestampStatistics"): # Before ORC-135, the local timezone offset was included and they were # stored as minimum and maximum. After ORC-135, the timestamp is @@ -87,6 +140,7 @@ def _parse_column_statistics(cs, column_statistics_blob): column_statistics["maximum"] = datetime.datetime.fromtimestamp( cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc ) + elif cs.HasField("binaryStatistics"): column_statistics["sum"] = cs.binaryStatistics.sum @@ -338,11 +392,11 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): for col in df._data.columns: if isinstance(col, cudf.core.column.StructColumn): - raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Struct columns." + warnings.warn( + "Support for writing tables with struct columns is " + "currently experimental." ) - elif isinstance(col, cudf.core.column.CategoricalColumn): + if isinstance(col, cudf.core.column.CategoricalColumn): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index d98ab0504cc..877cec24afa 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect): def test_dtype_raise(in_dtype): with pytest.raises(TypeError): cudf.dtype(in_dtype) + + +def test_dtype_np_bool_to_pa_bool(): + """This test case captures that utility np_to_pa_dtype + should map np.bool_ to pa.bool_, nuances on bit width + difference should be handled elsewhere. 
+ """ + + assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_() diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 465cf36e1f3..e2b1d72c63e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -5,7 +5,9 @@ """ import itertools import operator +import pickle import re +from io import BytesIO import cupy as cp import numpy as np @@ -1553,3 +1555,49 @@ def test_multiIndex_duplicate_names(): ) assert_eq(gi, pi) + + +def test_difference(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + midx2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], + names=["x", "y"], + ) + + expected = midx2.to_pandas().difference(midx.to_pandas()) + actual = midx2.difference(midx) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "names", + [ + ["a", "b", "c"], + [None, None, None], + ["aa", "aa", "aa"], + ["bb", "aa", "aa"], + None, + ], +) +def test_pickle_roundtrip_multiIndex(names): + df = cudf.DataFrame( + { + "one": [1, 2, 3], + "two": [True, False, True], + "three": ["ab", "cd", "ef"], + "four": [0.2, 0.1, -10.2], + } + ) + expected_df = df.set_index(["one", "two", "three"]) + expected_df.index.names = names + local_file = BytesIO() + + pickle.dump(expected_df, local_file) + local_file.seek(0) + actual_df = pickle.load(local_file) + assert_eq(expected_df, actual_df) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 2d4dc55bd28..1230b4b35f3 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -5,6 +5,7 @@ import os import random from io import BytesIO +from string import ascii_lowercase import numpy as np import pandas as pd @@ -58,7 +59,6 @@ def _make_path_or_buf(src): @pytest.mark.filterwarnings("ignore:Using CPU") 
-@pytest.mark.filterwarnings("ignore:Strings are not yet supported") @pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) @pytest.mark.parametrize("use_index", [False, True]) @pytest.mark.parametrize( @@ -221,6 +221,7 @@ def test_orc_read_statistics(datadir): assert_eq(file_statistics[0]["string1"]["minimum"], "one") +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) @pytest.mark.parametrize( "predicate,expected_len", @@ -244,6 +245,7 @@ def test_orc_read_filtered(datadir, engine, predicate, expected_len): assert len(df_filtered) == expected_len +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_read_stripes(datadir, engine): path = datadir / "TestOrcFile.testDate1900.orc" @@ -558,7 +560,6 @@ def test_orc_reader_boolean_type(datadir, orc_file): assert_eq(pdf, df) -@pytest.mark.filterwarnings("ignore:Using CPU") def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times @@ -954,7 +955,9 @@ def generate_list_struct_buff(size=100_000): return buff -list_struct_buff = generate_list_struct_buff() +@pytest.fixture(scope="module") +def list_struct_buff(): + return generate_list_struct_buff() @pytest.mark.parametrize( @@ -967,9 +970,7 @@ def generate_list_struct_buff(size=100_000): ) @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) -def test_lists_struct_nests( - columns, num_rows, use_index, -): +def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): gdf = cudf.read_orc( list_struct_buff, @@ -993,7 +994,7 @@ def test_lists_struct_nests( @pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) -def test_skip_rows_for_nested_types(columns): +def test_skip_rows_for_nested_types(columns, list_struct_buff): with pytest.raises( RuntimeError, 
match="skip_rows is not supported by nested column" ): @@ -1379,3 +1380,115 @@ def test_names_in_struct_dtype_nesting(datadir): edf = cudf.DataFrame(expect.to_pandas()) # test schema assert edf.dtypes.equals(got.dtypes) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +def test_writer_lists_structs(list_struct_buff): + df_in = cudf.read_orc(list_struct_buff) + + buff = BytesIO() + df_in.to_orc(buff) + + pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + + assert pyarrow_tbl.equals(df_in.to_arrow()) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +@pytest.mark.parametrize( + "data", + [ + { + "with_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else None + for i in range(21000) + ], + "no_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else [] + for i in range(21000) + ], + }, + ], +) +def test_orc_writer_lists_empty_rg(data): + pdf_in = pd.DataFrame(data) + buffer = BytesIO() + cudf_in = cudf.from_pandas(pdf_in) + + cudf_in.to_orc(buffer) + + df = cudf.read_orc(buffer) + assert_eq(df, cudf_in) + + pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + assert_eq(pdf_in, pdf_out) + + +def test_statistics_sum_overflow(): + maxint64 = np.iinfo(np.int64).max + minint64 = np.iinfo(np.int64).min + + buff = BytesIO() + with po.Writer( + buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt()) + ) as writer: + writer.write((maxint64, minint64, minint64)) + writer.write((1, -1, 1)) + + file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) + assert file_stats[0]["a"].get("sum") is None + assert file_stats[0]["b"].get("sum") is None + assert file_stats[0]["c"].get("sum") == minint64 + 1 + + assert stripe_stats[0]["a"].get("sum") is None + assert stripe_stats[0]["b"].get("sum") is None + assert stripe_stats[0]["c"].get("sum") == minint64 + 1 + + +def test_empty_statistics(): + buff = BytesIO() + orc_schema = po.Struct( + a=po.BigInt(), + b=po.Double(), + c=po.String(), + d=po.Decimal(11, 2), + e=po.Date(), + 
f=po.Timestamp(), + g=po.Boolean(), + h=po.Binary(), + i=po.BigInt(), + # One column with non null value, else cudf/pyorc readers crash + ) + data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) + with po.Writer(buff, orc_schema) as writer: + writer.write(data) + + got = cudf.io.orc.read_orc_statistics([buff]) + + # Check for both file and stripe stats + for stats in got: + # Similar expected stats for the first 6 columns in this case + for col_name in ascii_lowercase[:6]: + assert stats[0][col_name].get("number_of_values") == 0 + assert stats[0][col_name].get("has_null") is True + assert stats[0][col_name].get("minimum") is None + assert stats[0][col_name].get("maximum") is None + for col_name in ascii_lowercase[:3]: + assert stats[0][col_name].get("sum") == 0 + # Sum for decimal column is a string + assert stats[0]["d"].get("sum") == "0" + + assert stats[0]["g"].get("number_of_values") == 0 + assert stats[0]["g"].get("has_null") is True + assert stats[0]["g"].get("true_count") == 0 + assert stats[0]["g"].get("false_count") == 0 + + assert stats[0]["h"].get("number_of_values") == 0 + assert stats[0]["h"].get("has_null") is True + assert stats[0]["h"].get("sum") == 0 + + assert stats[0]["i"].get("number_of_values") == 1 + assert stats[0]["i"].get("has_null") is False + assert stats[0]["i"].get("minimum") == 1 + assert stats[0]["i"].get("maximum") == 1 + assert stats[0]["i"].get("sum") == 1 diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 133597b8f19..11ed68056b6 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -122,19 +122,41 @@ def pdf(scope="module"): return df -def test_read_csv(s3_base, s3so, pdf): +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader.csv" bname = "csv" buffer = pdf.to_csv(index=False) with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): 
got = cudf.read_csv( - "s3://{}/{}".format(bname, fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, ) assert_eq(pdf, got) +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): + # Write to buffer + fname = "test_csv_reader_byte_range.csv" + bname = "csv" + buffer = pdf.to_csv(index=False) + with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + got = cudf.read_csv( + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread, + header=False, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + ) + + assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) + + @pytest.mark.parametrize("chunksize", [None, 3]) def test_write_csv(s3_base, s3so, pdf, chunksize): # Write to buffer @@ -156,7 +178,9 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): assert_eq(pdf, got) -def test_read_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +@pytest.mark.parametrize("columns", [None, ["Float", "String"]]) +def test_read_parquet(s3_base, s3so, pdf, bytes_per_thread, columns): fname = "test_parquet_reader.parquet" bname = "parquet" buffer = BytesIO() @@ -164,10 +188,32 @@ def test_read_parquet(s3_base, s3so, pdf): buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_parquet( - "s3://{}/{}".format(bname, fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, ) - assert_eq(pdf, got) + expect = pdf[columns] if columns else pdf + assert_eq(expect, got) + + +def test_read_parquet_filters(s3_base, s3so, pdf): + fname = "test_parquet_reader_filters.parquet" + bname = "parquet" + buffer = BytesIO() + pdf.to_parquet(path=buffer) + buffer.seek(0) + filters = [("String", "==", "Omega")] 
+ with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + got = cudf.read_parquet( + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + filters=filters, + ) + + # All row-groups should be filtered out + assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) def test_write_parquet(s3_base, s3so, pdf): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 5100f1a9c49..bdaf5e144a5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,11 @@ from cudf.core._compat import PANDAS_GE_120 _NA_REP = "" + +"""Map numpy dtype to pyarrow types. +Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special +handling is required when converting a Boolean column into arrow. +""" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), @@ -22,7 +27,7 @@ np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), - np.bool_: pa.int8(), + np.bool_: pa.bool_(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index c2d55daabdf..a7105f3f35b 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -392,6 +392,12 @@ enable_statistics: boolean, default True Enable writing column statistics. + +Notes +----- +Support for writing tables with struct columns is currently experimental, +the output may not be as reliable as writing for other datatypes. 
+ See Also -------- cudf.read_orc diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 3abfdec74b8..6d210eff071 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,13 +6,24 @@ import subprocess import sys import sysconfig + +# Must import in this order: +# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext +# Otherwise, setuptools.command.build_ext ends up inheriting from +# Cython.Distutils.old_build_ext which we do not want +import setuptools + +try: + from Cython.Distutils.build_ext import new_build_ext as _build_ext +except ImportError: + from setuptools.command.build_ext import build_ext as _build_ext + from distutils.spawn import find_executable from distutils.sysconfig import get_python_lib import numpy as np import pyarrow as pa -from Cython.Build import cythonize -from Cython.Distutils import build_ext +import setuptools.command.build_ext from setuptools import find_packages, setup from setuptools.extension import Extension @@ -105,22 +116,46 @@ def get_cuda_version_from_header(cuda_include_dir, delimeter=""): ), ) -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 -cmdclass = versioneer.get_cmdclass() +class build_ext_and_proto_no_debug(_build_ext): + def build_extensions(self): + def remove_flags(compiler, *flags): + for flag in flags: + try: + compiler.compiler_so = list( + filter((flag).__ne__, compiler.compiler_so) + ) + except Exception: + pass + # Full optimization + self.compiler.compiler_so.append("-O3") + # Silence '-Wunknown-pragmas' warning + self.compiler.compiler_so.append("-Wno-unknown-pragmas") + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning + remove_flags( + self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" + ) + super().build_extensions() -class build_ext_and_proto(build_ext): - def build_extensions(self): - try: - # Silence the '-Wstrict-prototypes' warning - 
self.compiler.compiler_so.remove("-Wstrict-prototypes") - except Exception: - pass - build_ext.build_extensions(self) + def finalize_options(self): + if self.distribution.ext_modules: + # Delay import this to allow for Cython-less installs + from Cython.Build.Dependencies import cythonize + + nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + self.distribution.ext_modules = cythonize( + self.distribution.ext_modules, + nthreads=nthreads, + force=self.force, + gdb_debug=False, + compiler_directives=dict( + profile=False, language_level=3, embedsignature=True + ), + ) + # Skip calling super() and jump straight to setuptools + setuptools.command.build_ext.build_ext.finalize_options(self) def run(self): # Get protoc @@ -158,11 +193,9 @@ def run(self): src.write(new_src_content) # Run original Cython build_ext command - build_ext.run(self) + _build_ext.run(self) -cmdclass["build_ext"] = build_ext_and_proto - extensions = [ Extension( "*", @@ -196,6 +229,10 @@ def run(self): ) ] +cmdclass = versioneer.get_cmdclass() +cmdclass["build_ext"] = build_ext_and_proto_no_debug + + setup( name="cudf", version=versioneer.get_version(), @@ -214,13 +251,7 @@ def run(self): ], # Include the separately-compiled shared library setup_requires=["cython", "protobuf"], - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), + ext_modules=extensions, packages=find_packages(include=["cudf", "cudf.*"]), package_data=dict.fromkeys( find_packages(include=["cudf._lib*"]), ["*.pxd"], diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index 61e4817b1c2..2f2a45dbe05 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.6.4 -dask>=2021.6.0 -distributed>=2021.6.0 +dask==2021.09.1 +distributed==2021.09.1 
streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 77973ee34ff..1c21fca51c8 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -37,6 +37,32 @@ def field(self, key): meta=self.d_series._meta._constructor([], dtype=typ), ) + def explode(self): + """ + Creates a dataframe view of the struct column, one column per field. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf, dask_cudf + >>> ds = dask_cudf.from_cudf(cudf.Series( + ... [{'a': 42, 'b': 'str1', 'c': [-1]}, + ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, + ... {'a': 7, 'b': '', 'c': []}]), npartitions=2) + >>> ds.struct.explode().compute() + a b c + 0 42 str1 [-1] + 1 0 str2 [400, 500] + 2 7 [] + """ + return self.d_series.map_partitions( + lambda s: s.struct.explode(), + meta=self.d_series._meta.struct.explode(), + ) + class ListMethods: def __init__(self, d_series): diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 805927dd474..1521ce41806 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -499,3 +499,18 @@ def test_dask_struct_field_Int_Error(data): with pytest.raises(IndexError): got.struct.field(1000).compute() + + +@pytest.mark.parametrize( + "data", + [ + [{}, {}, {}], + [{"a": 100, "b": "abc"}, {"a": 42, "b": "def"}, {"a": -87, "b": ""}], + [{"a": [1, 2, 3], "b": {"c": 101}}, {"a": [4, 5], "b": {"c": 102}}], + ], +) +def test_struct_explode(data): + expect = Series(data).struct.explode() + got = dgd.from_cudf(Series(data), 2).struct.explode() + + assert_eq(expect, got.compute()) diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index 0b601180711..7d41184feae 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 
@@ # Copyright (c) 2021, NVIDIA CORPORATION. -dask>=2021.6.0 -distributed>=2021.6.0 +dask==2021.09.1 +distributed==2021.09.1 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index c4cb57ff89a..515469f8b6c 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask==2021.09.1", + "distributed==2021.09.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0", @@ -23,8 +23,8 @@ "pandas>=1.0,<1.4.0dev0", "pytest", "numba>=0.53.1", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask==2021.09.1", + "distributed==2021.09.1", ] }