diff --git a/build.sh b/build.sh index 45074a6645f..c2eba134c35 100755 --- a/build.sh +++ b/build.sh @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then fi echo "$MSG" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f23296038f2..6f19f174da0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Copying build metrics results" mkdir -p "$WORKSPACE/build-metrics" cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log" fi gpuci_logger "Build conda pkg for libcudf_kafka" diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 57592de59af..13ef02efc99 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index bb4a0ce72d8..e15513275ee 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -88,13 +88,13 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const use_np_dtypes = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))}; + // skip_rows is not supported on nested types auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), + int32_t(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Need an API to get the number of stripes to enable row_selection::STRIPES here BENCHMARK_DEFINE_F(OrcRead, row_selection) (::benchmark::State& state) { BM_orc_read_varying_options(state); } BENCHMARK_REGISTER_F(OrcRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 8}, {0b11},
// defaults {int32_t(cudf::type_id::EMPTY)}}) diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index d7a3a668bd1..09194931498 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -88,13 +88,13 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))}; + // No nested types here, because of https://github.com/rapidsai/cudf/issues/9970 auto const data_types = dtypes_for_column_selection( get_type_or_group({static_cast<int32_t>(type_group_id::INTEGRAL), static_cast<int32_t>(type_group_id::FLOATING_POINT), static_cast<int32_t>(type_group_id::FIXED_POINT), static_cast<int32_t>(type_group_id::TIMESTAMP), - static_cast<int32_t>(cudf::type_id::STRING), - static_cast<int32_t>(cudf::type_id::LIST)}), + static_cast<int32_t>(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -// Disabled until we add an API to read metadata from a parquet file and determine num row groups. -// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 -/* +// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and +// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 4}, {0b01}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 8039d7d065f..317db92ae8b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -15,64 +15,102 @@ */ #include -#include +#include #include #include #include -#include -#include + +#include + +#include #include #include -class Compaction : public cudf::benchmark { -}; +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template <typename Type, cudf::duplicate_keep_option Keep> +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list<Type, nvbench::enum_type<Keep>>) +{ + if constexpr (not std::is_same_v<Type, int32_t> and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator<long> rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); + cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +}
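Each NVBENCH_BENCH_TYPES registration at the end of this file expands into the cross product of its axes, so drop_duplicates is measured once per (Type, KeepOption, NumRows) combination; the if-constexpr guard above skips the combinations the old Google Benchmark macros never instantiated (only int32_t exercised KEEP_LAST and KEEP_NONE). Assuming nvbench's standard command line, a single combination of the resulting STREAM_COMPACTION_BENCH binary can then be selected with something like `./STREAM_COMPACTION_BENCH -b drop_duplicates -a "NumRows=1000000"` (the flag spellings here are illustrative, not taken from this patch).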
 template <typename Type> -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list<Type>) { - auto const n_rows = static_cast<cudf::size_type>(state.range(0)); + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); cudf::test::UniformRandomGenerator<long> rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 100 == 0 ? false : true; }); - cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + n_rows, valids); + cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +using data_type = nvbench::type_list<bool, int8_t, int32_t, cudf::timestamp_ms, float>; +using keep_option = nvbench::enum_type_list<cudf::duplicate_keep_option::KEEP_FIRST, + cudf::duplicate_keep_option::KEEP_LAST, + cudf::duplicate_keep_option::KEEP_NONE>; + +NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index c964c85156c..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,6 +75,18 @@ std::unique_ptr<table> drop_duplicates( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::unordered_drop_duplicates + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr<table>
unordered_drop_duplicates( + table_view const& input, + std::vector<size_type> const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -94,5 +106,24 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6a85428d8f0..a7112ae415d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -387,9 +387,9 @@ class fixed_point { /** * @brief operator + (for adding two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are added - * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are added + * If `_scale`s are equal, `_value`s are added. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are added. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this`
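A worked example of the rescaling rule these doc fixes now describe (values are illustrative, base-10 fixed_point):

    // a = {_value = 105, _scale = -2}   // represents 1.05
    // b = {_value =  21, _scale = -1}   // represents 2.1
    // operator+ shifts b, the number with the larger _scale, down to -2 (21 -> 210)
    // and then adds the _values: 105 + 210 = 315
    // a + b == {_value = 315, _scale = -2}   // represents 3.15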
@@ -402,9 +402,9 @@ class fixed_point { /** * @brief operator - (for subtracting two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are subtracted
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are subtracted + * If `_scale`s are equal, `_value`s are subtracted. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are subtracted. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -417,7 +417,7 @@ class fixed_point { /** * @brief operator * (for multiplying two `fixed_point` numbers) * - * `_scale`s are added and `_value`s are multiplied + * `_scale`s are added and `_value`s are multiplied. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -430,7 +430,7 @@ class fixed_point { /** * @brief operator / (for dividing two `fixed_point` numbers) * - * `_scale`s are subtracted and `_value`s are divided + * `_scale`s are subtracted and `_value`s are divided. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -443,9 +443,9 @@ class fixed_point { /** * @brief operator == (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -458,9 +458,9 @@ class fixed_point { /** * @brief operator != (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -473,9 +473,9 @@ class fixed_point { /** * @brief operator <= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -488,9 +488,9 @@ class fixed_point { /** * @brief operator >= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -503,9 +503,9 @@ class fixed_point { /** * @brief operator < (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -518,9 +518,9 @@ class fixed_point { /** * @brief operator > (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -534,7 +534,7 @@ class fixed_point { * @brief Method for creating a `fixed_point` number with a new `scale` * * The `fixed_point` number returned will have the same value, underlying representation and - * radix as `this`, the only thing changed is the scale + * radix as `this`, the only thing changed is the scale. * * @param scale The `scale` of the returned `fixed_point` number * @return `fixed_point` number with a new `scale` diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 7551511d281..94039d81f31 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ std::unique_ptr<table>
drop_nans( * @note if @p input.num_rows() is zero, there is no error, and an empty table * is returned. * - * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches. + * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`. * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type. * * @param[in] input The input table_view to filter @@ -214,7 +214,10 @@ enum class duplicate_keep_option { }; /** - * @brief Create a new table without duplicate rows + * @brief Create a new table without duplicate rows. + * + * The output table is sorted according to the lexicographic ordering of the data in the columns + * indexed by `keys`. * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * - * @return Table with unique rows as per specified `keep`. + * @return Table with sorted unique rows as specified by `keep`. */ std::unique_ptr<table> drop_duplicates( table_view const& input, @@ -244,37 +247,95 @@ std::unique_ptr<table> drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Count the unique elements in the column_view + * @brief Create a new table without duplicate rows with hash-based algorithms. + * + * Given an `input` table_view, each row is copied to output table if the corresponding + * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which + * row is copied. * - * Given an input column_view, number of unique elements in this column_view is returned + * The order of elements in the output table is not specified. + * + * @param[in] input input table_view to copy only unique rows + * @param[in] keys vector of indices representing key columns from `input` + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] mr Device memory resource used to allocate the returned table's device + * memory + * + * @return Table with unique rows in an unspecified order. + */ +std::unique_ptr<table> unordered_drop_duplicates( + table_view const& input, + std::vector<size_type> const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
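A quick contrast of the two entry points declared above (sketch only, with a single key column at index 0):

    // input keys (column 0): [3, 1, 3, 2]
    auto sorted_unique = cudf::drop_duplicates(
      input, {0}, cudf::duplicate_keep_option::KEEP_FIRST);
    // -> key column [1, 2, 3]: duplicates removed and rows sorted by key

    auto unsorted_unique = cudf::unordered_drop_duplicates(input, {0});
    // -> keys {1, 2, 3}, in an unspecified order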
 + +/** + * @brief Count the number of consecutive groups of equivalent elements in a column. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and - * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique - * count. + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count. + * + * `null`s are handled as equal. * - * @param[in] input The column_view whose unique elements will be counted. + * @param[in] input The column_view whose number of distinct consecutive groups will be counted * @param[in] null_handling flag to include or ignore `null` while counting - * @param[in] nan_handling flag to consider `NaN==null` or not. + * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of unique elements + * @return number of distinct consecutive groups in the column */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling); /** - * @brief Count the unique rows in a table. - * + * @brief Count the number of consecutive groups of equivalent elements in a table. * - * @param[in] input Table whose unique rows will be counted. - * @param[in] nulls_equal flag to denote if null elements should be considered equal - * nulls are not equal if null_equality::UNEQUAL + * @param[in] input Table whose number of distinct consecutive groups will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. * - * @return number of unique rows in the table + * @return number of distinct consecutive groups in the table */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); +/** + * @brief Count the unique elements in the column_view. + * + * If `nulls_equal == null_equality::UNEQUAL`, all `null`s are unique. + * + * Given an input column_view, number of unique elements in this column_view is returned. + * + * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both + * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique + * count. + * + * `null`s are handled as equal. + * + * @param[in] input The column_view whose unique elements will be counted + * @param[in] null_handling flag to include or ignore `null` while counting + * @param[in] nan_handling flag to consider `NaN==null` or not + * + * @return number of unique elements + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); + +/** + * @brief Count the unique rows in a table. + * + * @param[in] input Table whose unique rows will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL.
+ * + * @return number of unique rows in the table + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); + /** @} */ } // namespace cudf diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index bac6697da82..33c369b254f 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # import argparse import os @@ -34,49 +34,63 @@ # build a map of the log entries entries = {} with open(log_file, "r") as log: + last = 0 + files = {} for line in log: entry = line.split() if len(entry) > 4: - elapsed = int(entry[1]) - int(entry[0]) obj_file = entry[3] file_size = ( os.path.getsize(os.path.join(log_path, obj_file)) if os.path.exists(obj_file) else 0 ) - entries[entry[3]] = (elapsed, file_size) + start = int(entry[0]) + end = int(entry[1]) + # logic based on ninjatracing + if end < last: + files = {} + last = end + files.setdefault(entry[4], (entry[3], start, end, file_size)) -# check file could be loaded + # build entries from files dict + for entry in files.values(): + entries[entry[0]] = (entry[1], entry[2], entry[3]) + +# check file could be loaded and we have entries to report if len(entries) == 0: print("Could not parse", log_file) exit() -# sort the keys by build time (descending order) -keys = list(entries.keys()) -sl = sorted(keys, key=lambda k: entries[k][0], reverse=True) +# sort the entries by build-time (descending order) +sorted_list = sorted( + list(entries.keys()), + key=lambda k: entries[k][1] - entries[k][0], + reverse=True, ) -if output_fmt == "xml": - # output results in XML format +# output results in XML format +def output_xml(entries, sorted_list, args): root = ET.Element("testsuites") testsuite = ET.Element( "testsuite", attrib={ "name": "build-time", - "tests": str(len(keys)), + "tests": str(len(sorted_list)), "failures": str(0), "errors": str(0), }, ) root.append(testsuite) - for key in sl: - entry = entries[key] - elapsed = float(entry[0]) / 1000 + for name in sorted_list: + entry = entries[name] + build_time = float(entry[1] - entry[0]) / 1000 item = ET.Element( "testcase", attrib={ "classname": "BuildTime", - "name": key, - "time": str(elapsed), + "name": name, + "time": str(build_time), }, ) testsuite.append(item) @@ -85,62 +99,219 @@ xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") print(xmlstr) -elif output_fmt == "html": - # output results in HTML format - print("Sorted Ninja Build Times") - # Note: Jenkins does not support style defined in the html + +# utility converts a millisecond value to a column width in pixels +def time_to_width(value, end): + # map a value from (0,end) to (0,1000) + r = (float(value) / float(end)) * 1000.0 + return int(r) + + +# assign each entry to a thread by analyzing the start/end times and +# slotting them into thread buckets where they fit +def assign_entries_to_threads(entries): + # first sort the entries' keys by end timestamp + sorted_keys = sorted( + list(entries.keys()), key=lambda k: entries[k][1], reverse=True + ) + + # build the chart data by assigning entries to threads + results = {} + threads = [] + for name in sorted_keys: + entry = entries[name] + + # assign this entry by finding the first available thread identified + # by the thread's current start time greater than the entry's end time + tid = -1 + for t in range(len(threads)): + if threads[t] >= entry[1]: + threads[t] = entry[0]
+ tid = t + break + + # if no current thread found, create a new one with this entry + if tid < 0: + threads.append(entry[0]) + tid = len(threads) - 1 + + # add entry name to the array associated with this tid + if tid not in results.keys(): + results[tid] = [] + results[tid].append(name) + + # first entry has the last end time + end_time = entries[sorted_keys[0]][1] + + # return the threaded entries and the last end time + return (results, end_time) + + +# output chart results in HTML format +def output_html(entries, sorted_list, args): + print("Build Metrics Report") + # Note: Jenkins does not support javascript nor style defined in the html # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: print("

", args.msg, "

") - print("
") - print( - "", - "", - "", - sep="", - ) - summary = {"red": 0, "yellow": 0, "green": 0} + + # map entries to threads + # the end_time is used to scale all the entries to a fixed output width + threads, end_time = assign_entries_to_threads(entries) + + # color ranges for build times + summary = {"red": 0, "yellow": 0, "green": 0, "white": 0} red = "bgcolor='#FFBBD0'" yellow = "bgcolor='#FFFF80'" green = "bgcolor='#AAFFBD'" - for key in sl: - result = entries[key] - elapsed = result[0] - color = green - if elapsed > 300000: # 5 minutes - color = red - summary["red"] += 1 - elif elapsed > 120000: # 2 minutes - color = yellow - summary["yellow"] += 1 - else: - summary["green"] += 1 + white = "bgcolor='#FFFFFF'" + + # create the build-time chart + print("
FileCompile time
(ms)
Size
(bytes)
") + for tid in range(len(threads)): + names = threads[tid] + # sort the names for this thread by start time + names = sorted(names, key=lambda k: entries[k][0]) + + # use the last entry's end time as the total row size + # (this is an estimate and does not have to be exact) + last_entry = entries[names[len(names) - 1]] + last_time = time_to_width(last_entry[1], end_time) print( - "", + "") + + # done with the chart + print("
", - key, - "", - result[0], - "", - result[1], - "
", sep="", ) - print("

") + + prev_end = 0 # used for spacing between entries + + # write out each entry for this thread as a column for a single row + for name in names: + entry = entries[name] + start = entry[0] + end = entry[1] + + # this handles minor gaps between end of the + # previous entry and the start of the next + if prev_end > 0 and start > prev_end: + size = time_to_width(start - prev_end, end_time) + print("") + # adjust for the cellspacing + prev_end = end + int(end_time / 500) + + # format the build-time + build_time = end - start + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + + # assign color and accumulate legend values + color = white + if build_time > 300000: # 5 minutes + color = red + summary["red"] += 1 + elif build_time > 120000: # 2 minutes + color = yellow + summary["yellow"] += 1 + elif build_time > 1000: # 1 second + color = green + summary["green"] += 1 + else: + summary["white"] += 1 + + # compute the pixel width based on build-time + size = max(time_to_width(build_time, end_time), 2) + # output the column for this entry + print("") + # update the entry with just the computed output info + entries[name] = (build_time_str, color, entry[2]) + + # add a filler column at the end of each row + print("
", end="") + # use a slightly smaller, fixed-width font + print("", end="") + + # add the file-name if it fits, otherwise, truncate the name + file_name = os.path.basename(name) + if len(file_name) + 3 > size / 7: + abbr_size = int(size / 7) - 3 + if abbr_size > 1: + print(file_name[:abbr_size], "...", sep="", end="") + else: + print(file_name, end="") + # done with this entry + print("

") + + # output detail table in build-time descending order + print("") + print( + "", + "", + "", + sep="", + ) + for name in sorted_list: + entry = entries[name] + build_time_str = entry[0] + color = entry[1] + file_size = entry[2] + + # format file size + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + + # output entry row + print("", sep="", end="") + print("", sep="", end="") + print("", sep="") + + print("
FileCompile timeSize
", name, "", build_time_str, "", file_size_str, "

") + # include summary table with color legend + print("") print("time > 5 minutes") print("") print("2 minutes < time < 5 minutes") print("") - print("time < 2 minutes") + print("1 second < time < 2 minutes") print("") + print("time < 1 second") + print("") print("
", summary["red"], "
", summary["yellow"], "
", summary["green"], "
", summary["white"], "
") -else: - # output results in CSV format + +# output results in CSV format +def output_csv(entries, sorted_list, args): print("time,size,file") - for key in sl: - result = entries[key] - print(result[0], result[1], key, sep=",") + for name in sorted_list: + entry = entries[name] + build_time = entry[1] - entry[0] + file_size = entry[2] + print(build_time, file_size, name, sep=",") + + +if output_fmt == "xml": + output_xml(entries, sorted_list, args) +elif output_fmt == "html": + output_html(entries, sorted_list, args) +else: + output_csv(entries, sorted_list, args) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -57,26 +58,29 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + + // Drop duplicates from the combined keys, then sort the result. + // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fd86d8ec7d4..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,15 +217,15 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -120,16 +121,17 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
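The three dictionary call sites in this patch (add_keys, concatenate, set_keys) now share the same two-step recipe in place of the sort-based drop_duplicates; distilled from the set_keys hunk just above:

    // 1. de-duplicate with the hash-based API (output order unspecified)
    auto unique_keys = cudf::detail::unordered_drop_duplicates(
      table_view{{new_keys}}, std::vector<size_type>{0}, null_equality::EQUAL, stream, mr);
    // 2. restore the sorted order that dictionary keys require
    auto sorted_keys = cudf::detail::sort(unique_keys->view(),
                                          std::vector<order>{order::ASCENDING},
                                          std::vector<null_order>{null_order::BEFORE},
                                          stream,
                                          mr);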
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include -#include #include #include @@ -219,20 +219,18 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); + auto const stream = rmm::cuda_stream_default; + auto grouped_keys = helper().sorted_keys(stream, mr); - auto const& group_offsets = helper().group_offsets(rmm::cuda_stream_default); - std::vector<size_type> group_offsets_vector(group_offsets.size()); - thrust::copy(thrust::device_pointer_cast(group_offsets.begin()), - thrust::device_pointer_cast(group_offsets.end()), - group_offsets_vector.begin()); + auto const& group_offsets = helper().group_offsets(stream); + auto const group_offsets_vector = cudf::detail::make_std_vector_sync(group_offsets, stream); - if (values.num_columns()) { + if (not values.is_empty()) { auto grouped_values = cudf::detail::gather(values, - helper().key_sort_order(rmm::cuda_stream_default), + helper().key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - rmm::cuda_stream_default, + stream, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index c2115c3caa4..21bfd8120f7 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -68,7 +68,7 @@ class make_pair_function { { // Compute the hash value of row `i` auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(std::move(row_hash_value), std::move(i)); + return cuco::make_pair(row_hash_value, i); } private: diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index f38e653c4a6..e492968b8a6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,7 +45,7 @@ struct make_pair_function_semi { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast<size_type>(i), 0); } }; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 8563a2a3bd3..39fe0b60c8c 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -49,7 +49,7 @@ struct make_pair_function { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast<size_type>(i), 0); } }; diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast<nunique_aggregation*>(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr);
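This one-line call-site change is what keeps the NUNIQUE reduction's meaning intact: distinct_count now counts consecutive equivalent groups in input order (see the reworked implementation below), while the global number-of-unique-values that NUNIQUE needs comes from the hash-based unordered_distinct_count.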
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,18 @@ * limitations under the License. */ +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -30,39 +34,19 @@ #include #include +#include + +#include +#include +#include +#include #include namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector<order>{}, - std::vector<null_order>{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data<cudf::size_type>(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator<cudf::size_type>(0), - thrust::counting_iterator<cudf::size_type>(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], sorted_row_index[i - 1])); - }); -} - +namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. * * @tparam T The type of `column_device_view` */ template <typename T> struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data<T>()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ - template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr> + template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator<cudf::size_type>(0), - thrust::counting_iterator<cudf::size_type>(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator<cudf::size_type>(0), + thrust::counting_iterator<cudf::size_type>(input.size()), + check_for_nan(device_view)); } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ - template <typename T, std::enable_if_t<not std::is_floating_point<T>::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) + { + return false; + } +}; + +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. + */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data<T>()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. + template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } }; +} // namespace + +cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator<cudf::size_type>(0), + thrust::counting_iterator<cudf::size_type>(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +}
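The behavioral split between this count and the hash-based one that follows, on a concrete (hypothetical) single-column input:

    // keys: [1, 1, 2, 2, 1]
    // distinct_count            -> 3   (consecutive runs: [1 1], [2 2], [1])
    // unordered_distinct_count  -> 2   (distinct values: {1, 2})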
cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator<char>{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator<size_type> stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast<size_type>((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); +}
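The effect of nulls_equal in the hash-based count above, again sketched on a single key column:

    // keys: [1, null, 1, null]
    // null_equality::EQUAL   -> 2   (non-null rows dedupe to {1}, +1 for the null group)
    // null_equality::UNEQUAL -> 3   (null rows never compare equal, so both survive)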
+ auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + return count; +} } // namespace detail cudf::size_type distinct_count(column_view const& input, @@ -183,4 +269,18 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa return detail::distinct_count(input, nulls_equal); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, nulls_equal); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. */ -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include #include #include #include +#include #include #include #include @@ -37,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -106,7 +110,7 @@ std::unique_ptr drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -130,6 +134,62 @@ std::unique_ptr
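The two counting paths above differ only in how row identity is established: distinct_count assumes pre-sorted input and counts the boundaries between adjacent runs, while unordered_distinct_count hashes whole rows and is order-insensitive. A minimal host-side sketch of both strategies (toy std:: code for illustration, not part of this patch):

#include <iostream>
#include <unordered_set>
#include <vector>

int main()
{
  // Sorted path: a value starts a new group when it differs from its predecessor,
  // mirroring the (i == 0 or not comp(i, i - 1)) predicate above.
  std::vector<int> sorted_in{0, 0, 1, 4, 4, 4, 8};
  int sorted_count = 0;
  for (std::size_t i = 0; i < sorted_in.size(); ++i) {
    if (i == 0 || sorted_in[i] != sorted_in[i - 1]) { ++sorted_count; }
  }

  // Hash path: one set probe per element, no ordering requirement.
  std::vector<int> unsorted_in{4, 0, 8, 4, 1, 0, 4};
  std::unordered_set<int> seen(unsorted_in.begin(), unsorted_in.end());

  std::cout << sorted_count << ' ' << seen.size() << '\n';  // both print 4
}

The sorted variant does no allocation at all, which is why the patch keeps both: callers with already-sorted keys get the cheap path, everyone else gets the hash path.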
diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu
index abc34663aee..2fd1f530b6d 100644
--- a/cpp/src/stream_compaction/drop_duplicates.cu
+++ b/cpp/src/stream_compaction/drop_duplicates.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,13 +14,16 @@
  * limitations under the License.
  */
 
-#include
+#include "drop_duplicates.cuh"
+#include "stream_compaction_common.cuh"
+#include "stream_compaction_common.hpp"
 
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -37,6 +40,7 @@
 #include
 #include
+#include
 #include
 
 namespace cudf {
@@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys,
   auto comp = row_equality_comparator(
     nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal);
 
-  auto result_end = unique_copy(sorted_indices->view().begin<cudf::size_type>(),
-                                sorted_indices->view().end<cudf::size_type>(),
-                                unique_indices.begin<cudf::size_type>(),
-                                comp,
-                                keep,
-                                stream);
+  auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin<cudf::size_type>(),
+                                              sorted_indices->view().end<cudf::size_type>(),
+                                              unique_indices.begin<cudf::size_type>(),
+                                              comp,
+                                              keep,
+                                              stream);
 
   return cudf::detail::slice(column_view(unique_indices),
                              0,
@@ -106,7 +110,7 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
-  if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) {
+  if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
     return empty_like(input);
   }
@@ -130,6 +134,62 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
                                        mr);
 }
 
+std::unique_ptr<table> unordered_drop_duplicates(table_view const& input,
+                                                 std::vector<size_type> const& keys,
+                                                 null_equality nulls_equal,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
+    return empty_like(input);
+  }
+
+  auto keys_view = input.select(keys);
+  auto table_ptr = cudf::table_device_view::create(keys_view, stream);
+  auto has_null  = nullate::DYNAMIC{cudf::has_nulls(keys_view)};
+  auto const num_rows{table_ptr->num_rows()};
+
+  hash_map_type key_map{compute_hash_table_size(num_rows),
+                        COMPACTION_EMPTY_KEY_SENTINEL,
+                        COMPACTION_EMPTY_VALUE_SENTINEL,
+                        detail::hash_table_allocator_type{default_allocator<char>{}, stream},
+                        stream.value()};
+
+  compaction_hash hash_key{has_null, *table_ptr};
+  row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal);
+
+  auto iter = cudf::detail::make_counting_transform_iterator(
+    0, [] __device__(size_type i) { return cuco::make_pair(i, i); });
+  // insert unique indices into the map.
+  key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
+
+  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
+  rmm::device_uvector<bool> index_exists_in_map(num_rows, stream, mr);
+  // enumerate all indices to check if they are present in the map.
+  key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key);
+
+  auto const output_size{key_map.get_size()};
+
+  // write unique indices to a numeric column
+  auto unique_indices = cudf::make_numeric_column(
+    data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);
+  auto mutable_view = mutable_column_device_view::create(*unique_indices, stream);
+  thrust::copy_if(rmm::exec_policy(stream),
+                  counting_iter,
+                  counting_iter + num_rows,
+                  index_exists_in_map.begin(),
+                  mutable_view->begin<size_type>(),
+                  thrust::identity{});
+
+  // run gather operation to establish new order
+  return detail::gather(input,
+                        unique_indices->view(),
+                        out_of_bounds_policy::DONT_CHECK,
+                        detail::negative_index_policy::NOT_ALLOWED,
+                        stream,
+                        mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<table> drop_duplicates(table_view const& input,
@@ -144,4 +204,13 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
     input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr);
 }
 
+std::unique_ptr<table> unordered_drop_duplicates(table_view const& input,
+                                                 std::vector<size_type> const& keys,
+                                                 null_equality nulls_equal,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace cudf
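unordered_drop_duplicates keeps one representative index per distinct row by inserting row indices into a hash map whose hash and equality functions look through to the row contents (compaction_hash plus row_equality_comparator above), then gathering the surviving indices. A host-side analogue of that pattern, sketched with std::unordered_set over a toy row-major table (illustrative only, not cudf code):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_set>
#include <vector>

int main()
{
  // Toy "table": each row is a vector of ints. We deduplicate row *indices*,
  // hashing and comparing row contents, the same shape as the device-side key_map.
  std::vector<std::vector<int>> table{{1, 2}, {3, 4}, {1, 2}, {5, 6}};

  auto row_hash = [&](int i) {
    std::size_t h = 0;
    for (int v : table[i]) { h = h * 31 + std::hash<int>{}(v); }
    return h;
  };
  auto row_equal = [&](int lhs, int rhs) { return table[lhs] == table[rhs]; };

  std::unordered_set<int, decltype(row_hash), decltype(row_equal)> key_set(
    table.size(), row_hash, row_equal);
  for (int i = 0; i < static_cast<int>(table.size()); ++i) { key_set.insert(i); }

  // "Gather" one representative index per distinct row, like the copy_if +
  // gather steps above. Row 2 loses to row 0 because they compare equal.
  std::vector<int> unique_indices(key_set.begin(), key_set.end());
  std::sort(unique_indices.begin(), unique_indices.end());

  std::cout << unique_indices.size() << '\n';  // prints 3
}

As with the device version, the output order is whatever the hash structure yields, which is why the new tests sort results before comparing against expectations.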
diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh
new file mode 100644
index 00000000000..8ba9223a1bc
--- /dev/null
+++ b/cpp/src/stream_compaction/stream_compaction_common.cuh
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "stream_compaction_common.hpp"
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Device callable to hash a given row.
+ */
+template <typename Nullate>
+class compaction_hash {
+ public:
+  compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {}
+
+  __device__ inline auto operator()(size_type i) const noexcept
+  {
+    auto hash = _hash(i);
+    return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash;
+  }
+
+ private:
+  row_hash _hash;
+};
+
+/**
+ * @brief Device functor to determine if a row is valid.
+ */
+class row_validity {
+ public:
+  row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {}
+
+  __device__ inline bool operator()(const size_type& i) const noexcept
+  {
+    return cudf::bit_is_set(_row_bitmask, i);
+  }
+
+ private:
+  bitmask_type const* _row_bitmask;
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
new file mode 100644
index 00000000000..1d743eccdbe
--- /dev/null
+++ b/cpp/src/stream_compaction/stream_compaction_common.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+constexpr auto COMPACTION_EMPTY_KEY_SENTINEL   = std::numeric_limits<size_type>::max();
+constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits<size_type>::min();
+
+using hash_type = cuco::detail::MurmurHash3_32<size_type>;
+
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
+
+using hash_map_type =
+  cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;
+
+using row_hash = cudf::row_hasher<default_hash, cudf::nullate::DYNAMIC>;
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu
index dadeaf7d1e0..405c83ab872 100644
--- a/cpp/src/transform/encode.cu
+++ b/cpp/src/transform/encode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,7 +30,10 @@
 #include
 #include
+#include
 #include
+#include
+#include
 
 namespace cudf {
 namespace detail {
@@ -38,29 +41,23 @@ namespace detail {
 std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(
   table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
 {
-  std::vector<size_type> drop_keys(input_table.num_columns());
+  auto const num_cols = input_table.num_columns();
+
+  std::vector<size_type> drop_keys(num_cols);
   std::iota(drop_keys.begin(), drop_keys.end(), 0);
 
-  // side effects of this function we are now dependent on:
-  // - resulting column elements are sorted ascending
-  // - nulls are sorted to the beginning
-  auto keys_table = cudf::detail::drop_duplicates(input_table,
-                                                  drop_keys,
-                                                  duplicate_keep_option::KEEP_FIRST,
-                                                  null_equality::EQUAL,
-                                                  null_order::AFTER,
-                                                  stream,
-                                                  mr);
+  auto unique_keys = cudf::detail::unordered_drop_duplicates(
+    input_table, drop_keys, null_equality::EQUAL, stream, mr);
+
+  std::vector<order> column_order(num_cols, order::ASCENDING);
+  std::vector<null_order> null_precedence(num_cols, null_order::AFTER);
+  auto sorted_unique_keys =
+    cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr);
 
-  auto indices_column =
-    cudf::detail::lower_bound(keys_table->view(),
-                              input_table,
-                              std::vector<order>(input_table.num_columns(), order::ASCENDING),
-                              std::vector<null_order>(input_table.num_columns(), null_order::AFTER),
-                              stream,
-                              mr);
+  auto indices_column = cudf::detail::lower_bound(
+    sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr);
 
-  return std::make_pair(std::move(keys_table), std::move(indices_column));
+  return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column));
 }
 
 }  // namespace detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e23403e68e4..6b5670630ec 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License.
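An aside on the encode() change above: the rewrite builds the dictionary with an unordered dedup, sorts the unique keys once, then maps every input row to its key's position with lower_bound. The same three steps on host data (a toy single-column sketch in plain std::, not the cudf API):

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
  std::vector<int> input{30, 10, 30, 20, 10};

  // 1. Collect the distinct keys; ordering is imposed explicitly afterwards,
  //    no longer as a side effect of the dedup step.
  std::vector<int> keys(input);
  std::sort(keys.begin(), keys.end());
  keys.erase(std::unique(keys.begin(), keys.end()), keys.end());

  // 2. Map each input value to the index of its key via binary search.
  std::vector<int> indices;
  for (int v : input) {
    indices.push_back(std::lower_bound(keys.begin(), keys.end(), v) - keys.begin());
  }

  for (int idx : indices) { std::cout << idx << ' '; }  // prints: 2 0 2 1 0
}

The design point of the patch is step 1: the old code depended on drop_duplicates happening to return sorted keys with nulls first, whereas the new code sorts explicitly, so the dedup is free to use the faster hash path.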
You may obtain a copy of the License at @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..78b52db5255 --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ(expected, 
cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + // Removing 2 from expected to remove count for `XXX` and `YYY` + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected + 1, // +1 since `NAN` is not in std::set + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + 
std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type const expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,254 +14,98 @@ * limitations under the License. 
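The distinct-count tests above pin down how unordered_distinct_count folds nulls and NaNs together: start from the table-level count, where a null row and a NaN row each count once, then decrement per the policies. A hypothetical host-side restatement of that bookkeeping (the helper name and signature are invented for illustration):

#include <cassert>

// raw_count: distinct count with nulls treated as equal (null and NaN each count once).
int adjusted_count(int raw_count, bool has_null, bool has_nan,
                   bool nan_is_null, bool exclude_nulls)
{
  int count = raw_count;
  // EXCLUDE drops the single null entry from the count.
  if (exclude_nulls && has_null) { --count; }
  // When NaN is treated as null it must not be counted separately: it collapses
  // into the existing null entry, or is dropped outright under EXCLUDE.
  if (nan_is_null && has_nan && (has_null || exclude_nulls)) { --count; }
  return count;
}

int main()
{
  // Mirrors NansAsNullWithNull: {1, 3, NAN, null, 31} -> 4 under INCLUDE.
  assert(adjusted_count(5, true, true, true, false) == 4);
  // Mirrors NansAsNullWithIgnoreNull: same input -> 3 under EXCLUDE.
  assert(adjusted_count(5, true, true, true, true) == 3);
  return 0;
}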
*/ -#include -#include -#include #include +#include #include #include #include #include + #include #include #include #include #include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); -} - -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) -{ - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansAndNull) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = std::set(input.begin(), 
input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansOnly) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNull) -{ - using T = float; + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, 
StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, col2}}; - - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicatesCommon, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; 
cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -271,7 +115,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -284,7 +128,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -296,7 +140,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -309,88 +153,145 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, 
null_equality::UNEQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(DropDuplicates, KeepLastWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(DropDuplicates, KeepNoneWithNull) { - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, 
cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { +}; + +TEST_F(UnorderedDropDuplicates, NonNullTable) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyKeys) +TEST_F(UnorderedDropDuplicates, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 
1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a4dd5ff9c8..d705504cc0c 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -22,7 +22,6 @@ Properties :toctree: api/ Index.empty - Index.gpu_values Index.has_duplicates Index.is_monotonic Index.is_monotonic_increasing @@ -93,9 +92,9 @@ Conversion :toctree: api/ Index.astype - Index.to_array Index.to_arrow Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 891bb3a1e61..cf5dd4a2a1d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -408,13 +408,13 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - Series.to_array Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame - Series.to_gpu_array Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index a7e959a05a7..0034584a6f7 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -4550,7 +4550,7 @@ } ], "source": [ - "df['a'].to_array()" + "df['a'].to_numpy()" ] }, { @@ -4571,7 +4571,7 @@ } ], "source": [ - "ddf['a'].compute().to_array()" + "ddf['a'].compute().to_numpy()" ] }, { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 61981b34615..cb3234bf706 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -45,7 +45,6 @@ public final class ColumnVector extends ColumnView { NativeDepsLoader.loadNativeDeps(); } - private final OffHeapState offHeap; private Optional nullCount = Optional.empty(); private int refCount; @@ -56,14 +55,23 @@ public final class ColumnVector extends ColumnView { * owned by this instance. */ public ColumnVector(long nativePointer) { - super(getColumnViewFromColumn(nativePointer)); + super(new OffHeapState(nativePointer)); assert nativePointer != 0; - offHeap = new OffHeapState(nativePointer); MemoryCleaner.register(this, offHeap); this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer) { + long viewHandle = initViewHandle( + type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, + offsetBuffer, null, viewHandle); + } + /** * Create a new column vector based off of data already on the device. 
* @param type the type of the vector @@ -81,24 +89,29 @@ public ColumnVector(long nativePointer) { public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer) { - super(ColumnVector.initViewHandle( - type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, offsetBuffer, null)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer)); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (!type.equals(DType.STRING)) { assert offsetBuffer == null : "offsets are only supported for STRING"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, - offsetBuffer, null, viewHandle); MemoryCleaner.register(this, offHeap); this.nullCount = nullCount; - this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { + long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, + offsetBuffer, childHandles); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, + toClose, viewHandle); + } + /** * Create a new column vector based off of data already on the device with child columns. * @param type the type of the vector, typically a nested type @@ -118,16 +131,12 @@ public ColumnVector(DType type, long rows, Optional nullCount, public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { - super(initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, - offsetBuffer, childHandles)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer, toClose, childHandles)); if (!type.equals(DType.STRING) && !type.equals(DType.LIST)) { assert offsetBuffer == null : "offsets are only supported for STRING, LISTS"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, - toClose, viewHandle); MemoryCleaner.register(this, offHeap); this.refCount = 0; @@ -143,8 +152,7 @@ public ColumnVector(DType type, long rows, Optional nullCount, * @param contiguousBuffer the buffer that this is based off of. */ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { - super(viewAddress); - offHeap = new OffHeapState(viewAddress, contiguousBuffer); + super(new OffHeapState(viewAddress, contiguousBuffer)); MemoryCleaner.register(this, offHeap); // TODO we may want to ask for the null count anyways... 
this.nullCount = Optional.empty(); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 422311fc8e0..8155fe79080 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -40,6 +40,7 @@ public class ColumnView implements AutoCloseable, BinaryOperable { protected final DType type; protected final long rows; protected final long nullCount; + protected final ColumnVector.OffHeapState offHeap; /** * Constructs a Column View given a native view address @@ -50,6 +51,22 @@ public class ColumnView implements AutoCloseable, BinaryOperable { this.type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); this.rows = ColumnView.getNativeRowCount(viewHandle); this.nullCount = ColumnView.getNativeNullCount(viewHandle); + this.offHeap = null; + } + + + /** + * Intended to be called from ColumnVector when it is being constructed. Because state creates a + * cudf::column_view instance and will close it in all cases, we don't want to have to double + * close it. + * @param state the state this view is based off of. + */ + protected ColumnView(ColumnVector.OffHeapState state) { + offHeap = state; + viewHandle = state.getViewHandle(); + type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); + rows = ColumnView.getNativeRowCount(viewHandle); + nullCount = ColumnView.getNativeNullCount(viewHandle); } /** @@ -265,7 +282,10 @@ public long getDeviceMemorySize() { @Override public void close() { - ColumnView.deleteColumnView(viewHandle); + // close the view handle so long as offHeap is not going to do it for us. + if (offHeap == null) { + ColumnView.deleteColumnView(viewHandle); + } viewHandle = 0; } @@ -3248,7 +3268,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. - * @param key ColumnView of search keys. + * @param keys ColumnView of search keys. * @param findOption Whether to find the first index of the key, or the last. * @return The resultant column of int32 indices */ @@ -3284,6 +3304,17 @@ public final Scalar getScalarElement(int index) { return new Scalar(getType(), getElement(getNativeView(), index)); } + /** + * Get the number of bytes needed to allocate a validity buffer for the given number of rows. + * According to cudf::bitmask_allocation_size_bytes, the padding boundary for null mask is 64 bytes. + */ + static long getValidityBufferSize(int numRows) { + // number of bytes required = Math.ceil(number of bits / 8) + long actualBytes = ((long) numRows + 7) >> 3; + // padding to the multiplies of the padding boundary(64 bytes) + return ((actualBytes + 63) >> 6) << 6; + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -3701,7 +3732,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat * Native method to find the first (or last) index of each search key in the specified column, * in each row of a list column. * @param nativeView the column view handle of the list - * @param scalarColumnHandle handle to the search key column + * @param keyColumnHandle handle to the search key column * @param isFindFirst Whether to find the first index of the key, or the last. 
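The getValidityBufferSize helper above replaces the native getNativeValidPointerSize call with pure bit arithmetic: one validity bit per row, rounded up to whole bytes, then padded to the 64-byte null-mask boundary that cudf::bitmask_allocation_size_bytes prescribes. A standalone restatement of the same formula (hypothetical names, for illustration):

#include <cstdint>
#include <iostream>

std::int64_t validity_buffer_size(std::int32_t num_rows)
{
  // ceil(num_rows / 8) bytes of validity bits...
  std::int64_t const actual_bytes = (static_cast<std::int64_t>(num_rows) + 7) >> 3;
  // ...rounded up to a multiple of the 64-byte padding boundary.
  return ((actual_bytes + 63) >> 6) << 6;
}

int main()
{
  std::cout << validity_buffer_size(1) << '\n';    // 64
  std::cout << validity_buffer_size(512) << '\n';  // 64 (512 rows fit in 64 bytes)
  std::cout << validity_buffer_size(513) << '\n';  // 128
}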
* @return column handle of the resultant column of int32 indices */ @@ -3881,11 +3912,6 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; - /** - * Get the number of bytes needed to allocate a validity buffer for the given number of rows. - */ - static native long getNativeValidPointerSize(int size); - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. @@ -3975,7 +4001,7 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da DeviceMemoryBuffer mainValidDevBuff = null; DeviceMemoryBuffer mainOffsetsDevBuff = null; if (mainColValid != null) { - long validLen = getNativeValidPointerSize(mainColRows); + long validLen = getValidityBufferSize(mainColRows); mainValidDevBuff = DeviceMemoryBuffer.allocate(validLen); mainValidDevBuff.copyFromHostBuffer(mainColValid, 0, validLen); } @@ -4084,7 +4110,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows data.copyFromHostBuffer(dataBuffer, 0, dataLen); } if (validityBuffer != null) { - long validLen = getNativeValidPointerSize((int)rows); + long validLen = getValidityBufferSize((int)rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(validityBuffer, 0, validLen); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 742501be375..2e5b0202dc5 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,7 +307,7 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); } /** diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 0fe7d7a5df8..3abc6db385d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -199,7 +199,7 @@ public ColumnVector copyToDevice() { } HostMemoryBuffer hvalid = this.offHeap.valid; if (hvalid != null) { - long validLen = ColumnView.getNativeValidPointerSize((int) rows); + long validLen = ColumnView.getValidityBufferSize((int) rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(hvalid, 0, validLen); } @@ -858,7 +858,7 @@ public static HostColumnVector timestampNanoSecondsFromBoxedLongs(Long... 
values * Build */ - public static final class ColumnBuilder implements AutoCloseable { + public static final class ColumnBuilder implements AutoCloseable { private DType type; private HostMemoryBuffer data; @@ -869,28 +869,78 @@ public static final class ColumnBuilder implements AutoCloseable { private boolean nullable; private long rows; private long estimatedRows; + private long rowCapacity = 0L; + private long validCapacity = 0L; private boolean built = false; private List childBuilders = new ArrayList<>(); + private Runnable nullHandler; - private int currentIndex = 0; - private int currentByteIndex = 0; - + // The value of currentIndex can't exceed Integer.MAX_VALUE. It is stored as a long so it can + // be passed directly to HostMemoryBuffer.setXXX, which requires a long offset. + private long currentIndex = 0; + // Only for strings: the current write position in the byte (data) buffer + private int currentStringByteIndex = 0; + // Use a bit shift instead of a multiply to convert a row offset to a byte offset + private int bitShiftBySize = 0; + private static final int bitShiftByOffset = (int)(Math.log(OFFSET_SIZE) / Math.log(2)); public ColumnBuilder(HostColumnVector.DataType type, long estimatedRows) { this.type = type.getType(); this.nullable = type.isNullable(); this.rows = 0; - this.estimatedRows = estimatedRows; + this.estimatedRows = Math.max(estimatedRows, 1L); + this.bitShiftBySize = (int)(Math.log(this.type.getSizeInBytes()) / Math.log(2)); + + // initialize the null handler according to the data type + this.setupNullHandler(); + for (int i = 0; i < type.getNumChildren(); i++) { childBuilders.add(new ColumnBuilder(type.getChild(i), estimatedRows)); } } + private void setupNullHandler() { + if (this.type == DType.LIST) { + this.nullHandler = () -> { + this.growListBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); + }; + } else if (this.type == DType.STRING) { + this.nullHandler = () -> { + this.growStringBuffersAndRows(0); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, currentStringByteIndex); + }; + } else if (this.type == DType.STRUCT) { + this.nullHandler = () -> { + this.growStructBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + for (ColumnBuilder childBuilder : childBuilders) { + childBuilder.appendNull(); + } + }; + } else { + this.nullHandler = () -> { + this.growFixedWidthBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + }; + } + } + public HostColumnVector build() { List hostColumnVectorCoreList = new ArrayList<>(); for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row capacity, because it grows lazily. + if (valid != null) { + growValidBuffer(); + } HostColumnVector hostColumnVector = new HostColumnVector(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); built = true; @@ -902,6 +952,10 @@ private HostColumnVectorCore buildNestedInternal() { for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row capacity, because it grows lazily.
+ if (valid != null) { + growValidBuffer(); + } return new HostColumnVectorCore(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); } @@ -929,71 +983,113 @@ public ColumnBuilder appendStructValues(StructData... inputList) { } /** - * A method that is responsible for growing the buffers as needed - * and incrementing the row counts when we append values or nulls. - * @param hasNull indicates whether the validity buffer needs to be considered, as the - * nullcount may not have been fully calculated yet - * @param length used for strings + * Grows the valid buffer lazily: it won't be materialized until the first null + * value is appended. This method reuses rowCapacity to track the size of the column, + * so please call the type-specific grow method to update rowCapacity before calling + * this method. + */ + private void growValidBuffer() { + if (valid == null) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + valid = HostMemoryBuffer.allocate(maskBytes); + valid.setMemory(0, valid.length, (byte) 0xFF); + validCapacity = rowCapacity; + return; + } + if (validCapacity < rowCapacity) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + HostMemoryBuffer newValid = HostMemoryBuffer.allocate(maskBytes); + newValid.setMemory(0, newValid.length, (byte) 0xFF); + valid = copyBuffer(newValid, valid); + validCapacity = rowCapacity; + } + } + + /** + * Automatically grows the data buffer for fixed-width columns as needed and increments + * the row count. Please call this method before appending any value or null. */ - private void growBuffersAndRows(boolean hasNull, int length) { + private void growFixedWidthBuffersAndRows() { assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; rows++; - long targetDataSize = 0; - if (!type.isNestedType()) { - if (type.equals(DType.STRING)) { - targetDataSize = data == null ? length : currentByteIndex + length; - } else { - targetDataSize = data == null ? estimatedRows * type.getSizeInBytes() : rows * type.getSizeInBytes(); - } + if (data == null) { + data = HostMemoryBuffer.allocate(estimatedRows << bitShiftBySize); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); + data = copyBuffer(HostMemoryBuffer.allocate(newCap << bitShiftBySize), data); + rowCapacity = newCap; } + } - if (targetDataSize > 0) { - if (data == null) { - data = HostMemoryBuffer.allocate(targetDataSize); - } else { - long maxLen; - if (type.equals(DType.STRING)) { - maxLen = Integer.MAX_VALUE; - } else { - maxLen = Integer.MAX_VALUE * (long) type.getSizeInBytes(); - } - long oldLen = data.getLength(); - long newDataLen = Math.max(1, oldLen); - while (targetDataSize > newDataLen) { - newDataLen = newDataLen * 2; - } - if (newDataLen != oldLen) { - newDataLen = Math.min(newDataLen, maxLen); - if (newDataLen < targetDataSize) { - throw new IllegalStateException("A data buffer for strings is not supported over 2GB in size"); - } - HostMemoryBuffer newData = HostMemoryBuffer.allocate(newDataLen); - data = copyBuffer(newData, data); - } - } + /** + * Automatically grows the offsets buffer for list columns as needed and increments + * the row count. Please call this method before appending any value or null.
+ */ + private void growListBuffersAndRows() { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; } - if (type.equals(DType.LIST) || type.equals(DType.STRING)) { - if (offsets == null) { - offsets = HostMemoryBuffer.allocate((estimatedRows + 1) * OFFSET_SIZE); - offsets.setInt(0, 0); - } else if ((rows +1) * OFFSET_SIZE > offsets.length) { - long newOffsetLen = offsets.length * 2; - HostMemoryBuffer newOffsets = HostMemoryBuffer.allocate(newOffsetLen); - offsets = copyBuffer(newOffsets, offsets); - } + } + + /** + * Automatically grows the offsets and data buffers for string columns as needed and + * increments the row count. Please call this method before appending any value or null. + * + * @param stringLength number of bytes required by the next row + */ + private void growStringBuffersAndRows(int stringLength) { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + // Initialize data buffer with at least 1 byte in case the first appended value is null. + data = HostMemoryBuffer.allocate(Math.max(1, stringLength)); + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + return; } - if (hasNull || nullCount > 0) { - if (valid == null) { - long targetValidSize = ColumnView.getNativeValidPointerSize((int)estimatedRows); - valid = HostMemoryBuffer.allocate(targetValidSize); - valid.setMemory(0, targetValidSize, (byte) 0xFF); - } else if (valid.length < ColumnView.getNativeValidPointerSize((int)rows)) { - long newValidLen = valid.length * 2; - HostMemoryBuffer newValid = HostMemoryBuffer.allocate(newValidLen); - newValid.setMemory(0, newValidLen, (byte) 0xFF); - valid = copyBuffer(newValid, valid); - } + + if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; + } + + long currentLength = currentStringByteIndex + stringLength; + if (currentLength > data.length) { + long requiredLength = data.length; + do { + requiredLength = requiredLength * 2; + } while (currentLength > requiredLength); + data = copyBuffer(HostMemoryBuffer.allocate(requiredLength), data); + } + } + + /** + * For struct columns, we only need to update rows and rowCapacity (needed to grow the + * valid buffer), because a struct column holds no buffers of its own. + * Please call this method before appending any value or null.
+ */ + private void growStructBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (rowCapacity == 0) { + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + rowCapacity = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); } } @@ -1015,29 +1111,13 @@ private HostMemoryBuffer copyBuffer(HostMemoryBuffer targetBuffer, HostMemoryBuf * Method that sets the null bit in the validity vector * @param index the row index at which the null is marked */ - private void setNullAt(int index) { + private void setNullAt(long index) { assert index < rows : "Index for null value should fit the column with " + rows + " rows"; nullCount += BitVectorHelper.setNullAt(valid, index); } public final ColumnBuilder appendNull() { - growBuffersAndRows(true, 0); - setNullAt(currentIndex); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); - if (type.hasOffsets()) { - if (type.equals(DType.LIST)) { - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); - } else { - // It is a String - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); - } - } else if (type.equals(DType.STRUCT)) { - // structs propagate nulls to children and even further down if needed - for (ColumnBuilder childBuilder : childBuilders) { - childBuilder.appendNull(); - } - } + nullHandler.run(); return this; } @@ -1081,7 +1161,7 @@ public ColumnBuilder endStruct() { assert type.equals(DType.STRUCT) : "This only works for structs"; assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " + childBuilders + " should all have the same currentIndex " + type; - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growStructBuffersAndRows(); currentIndex++; return this; } @@ -1095,9 +1175,8 @@ assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " */ public ColumnBuilder endList() { assert type.equals(DType.LIST); - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); - currentIndex++; - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); + growListBuffersAndRows(); + offsets.setInt(++currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); return this; } @@ -1155,80 +1234,67 @@ public void incrCurrentIndex() { } public int getCurrentIndex() { - return currentIndex; + return (int) currentIndex; } + @Deprecated public int getCurrentByteIndex() { - return currentByteIndex; + return currentStringByteIndex; } public final ColumnBuilder append(byte value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByByte(); assert currentIndex < rows; - data.setByte(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setByte(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(short value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByShort(); assert currentIndex < rows; - data.setShort(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setShort(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(int value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + 
type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByInt(); assert currentIndex < rows; - data.setInt(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setInt(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(long value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByLong(); assert currentIndex < rows; - data.setLong(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setLong(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(float value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT32); assert currentIndex < rows; - data.setFloat(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setFloat(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(double value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT64); assert currentIndex < rows; - data.setDouble(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setDouble(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(boolean value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.BOOL8); assert currentIndex < rows; - data.setBoolean(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setBoolean(currentIndex++ << bitShiftBySize, value); return this; } @@ -1237,22 +1303,19 @@ public ColumnBuilder append(BigDecimal value) { } public ColumnBuilder append(BigInteger unscaledVal) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert currentIndex < rows; if (type.typeId == DType.DTypeEnum.DECIMAL32) { - data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); + data.setInt(currentIndex++ << bitShiftBySize, unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { - data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); + data.setLong(currentIndex++ << bitShiftBySize, unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { - assert currentIndex < rows; byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); - data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); - } else { + data.setBytes(currentIndex++ << bitShiftBySize, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } - currentIndex++; - currentByteIndex += type.getSizeInBytes(); return this; } @@ -1271,14 +1334,13 @@ public ColumnBuilder appendUTF8String(byte[] value, int srcOffset, int length) { assert length >= 0; assert value.length + srcOffset <= length; assert type.equals(DType.STRING) : " type " + type + " is not 
String"; - currentIndex++; - growBuffersAndRows(false, length); - assert currentIndex < rows + 1; + growStringBuffersAndRows(length); + assert currentIndex < rows; if (length > 0) { - data.setBytes(currentByteIndex, value, srcOffset, length); + data.setBytes(currentStringByteIndex, value, srcOffset, length); } - currentByteIndex += length; - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); + currentStringByteIndex += length; + offsets.setInt(++currentIndex << bitShiftByOffset, currentStringByteIndex); return this; } @@ -1822,7 +1884,7 @@ public final Builder append(HostColumnVector columnVector) { } private void allocateBitmaskAndSetDefaultValues() { - long bitmaskSize = ColumnView.getNativeValidPointerSize((int) rows); + long bitmaskSize = ColumnView.getValidityBufferSize((int) rows); valid = HostMemoryBuffer.allocate(bitmaskSize); valid.setMemory(0, bitmaskSize, (byte) 0xFF); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0fce27bc130..63247eb0066 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1790,16 +1790,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidPointerSize(JNIEnv *env, - jobject j_object, - jint size) { - try { - cudf::jni::auto_set_device(env); - return static_cast(cudf::bitmask_allocation_size_bytes(size)); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index a26dbec4907..7b476c31b95 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -39,21 +40,34 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertEquals(byteColumnVector.getByte(0), 2); - assertEquals(byteColumnVector.getByte(1), 3); - assertEquals(byteColumnVector.getByte(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getByte(0), 2); + assertEquals(cv.getByte(1), 3); + assertEquals(cv.getByte(2), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { - assertFalse(v.hasNulls()); - assertEquals(0xff, Byte.toUnsignedInt(v.getByte(0)), 0xff); - assertEquals(128, Byte.toUnsignedInt(v.getByte(1)), 128); - assertEquals(5, Byte.toUnsignedInt(v.getByte(2)), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xff, Byte.toUnsignedInt(cv.getByte(0)), 0xff); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(1)), 128); + assertEquals(5, Byte.toUnsignedInt(cv.getByte(2)), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(false, + new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); } } @@ -70,47 +84,73 @@ public void testAppendRepeatingValues() { @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(3)); - assertFalse(byteColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getByte(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getByte(-1)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedBytes( - new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i 
= 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedBytes( + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(true, + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedUnsignedBytes( - new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertEquals(128, Byte.toUnsignedInt(byteColumnVector.getByte(4))); - assertEquals(254, Byte.toUnsignedInt(byteColumnVector.getByte(5))); - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(4))); + assertEquals(254, Byte.toUnsignedInt(cv.getByte(5))); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedUnsignedBytes( + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(false, + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java new file mode 100644 index 00000000000..263244b2413 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * ColumnBuilderHelper helps to test ColumnBuilder with existing ColumnVector tests.
+ */ +public class ColumnBuilderHelper { + + public static HostColumnVector build( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.build(); + } + } + + public static ColumnVector buildOnDevice( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.buildAndPutOnDevice(); + } + } + + public static HostColumnVector fromBoxedBytes(boolean signed, Byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Byte v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedDoubles(Double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.FLOAT64), + values.length, + (b) -> { + for (Double v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedInts(boolean signed, Integer... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Integer v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedLongs(boolean signed, Long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Long v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBytes(boolean signed, byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (byte v : values) b.append(v); + }); + } + + public static HostColumnVector fromDecimals(BigDecimal... values) { + // Simply copied from HostColumnVector.fromDecimals + BigDecimal maxDec = Arrays.stream(values).filter(Objects::nonNull) + .max(Comparator.comparingInt(BigDecimal::precision)) + .orElse(BigDecimal.ZERO); + int maxScale = Arrays.stream(values).filter(Objects::nonNull) + .map(decimal -> decimal.scale()) + .max(Comparator.naturalOrder()) + .orElse(0); + maxDec = maxDec.setScale(maxScale, RoundingMode.UNNECESSARY); + + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.fromJavaBigDecimal(maxDec)), + values.length, + (b) -> { + for (BigDecimal v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromDoubles(double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, DType.FLOAT64), + values.length, + (b) -> { + for (double v : values) b.append(v); + }); + } + + public static HostColumnVector fromInts(boolean signed, int... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (int v : values) b.append(v); + }); + } + + public static HostColumnVector fromLongs(boolean signed, long... values) { + DType dt = signed ?
DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (long v : values) b.append(v); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2dbec454eb2..8f39c3c51ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3559,7 +3559,6 @@ void testCastDecimal64ToString() { for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { for (int i = 0; i < strDecimalValues.length; i++) { strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); - System.out.println(strDecimalValues[i]); } testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index c2772520f57..994066c5df0 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,10 +22,12 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.Objects; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -33,9 +35,11 @@ public class DecimalColumnVectorTest extends CudfTestBase { private static final Random rdSeed = new Random(1234); private static final int dec32Scale = 4; private static final int dec64Scale = 10; + private static final int dec128Scale = 30; private static final BigDecimal[] decimal32Zoo = new BigDecimal[20]; private static final BigDecimal[] decimal64Zoo = new BigDecimal[20]; + private static final BigDecimal[] decimal128Zoo = new BigDecimal[20]; private static final int[] unscaledDec32Zoo = new int[decimal32Zoo.length]; private static final long[] unscaledDec64Zoo = new long[decimal64Zoo.length]; @@ -45,6 +49,9 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] boundaryDecimal64 = new BigDecimal[]{ new BigDecimal("999999999999999999"), new BigDecimal("-999999999999999999")}; + private final BigDecimal[] boundaryDecimal128 = new BigDecimal[]{ + new BigDecimal("99999999999999999999999999999999999999"), new BigDecimal("-99999999999999999999999999999999999999")}; + private final BigDecimal[] overflowDecimal32 = new BigDecimal[]{ BigDecimal.valueOf(Integer.MAX_VALUE), BigDecimal.valueOf(Integer.MIN_VALUE)}; @@ -72,6 +79,12 @@ public static void setup() { } else { decimal64Zoo[i] = null; } + if (rdSeed.nextBoolean()) { + BigInteger unscaledVal = BigInteger.valueOf(rdSeed.nextLong()).multiply(BigInteger.valueOf(rdSeed.nextLong())); + decimal128Zoo[i] = new BigDecimal(unscaledVal, dec128Scale); + } else { + decimal128Zoo[i] = null; + } } } @@ -190,27 +203,44 @@ public void testDecimalGeneral() { @Test public void testDecimalFromDecimals() { - DecimalColumnVectorTest.testDecimalImpl(false, dec32Scale, decimal32Zoo); - DecimalColumnVectorTest.testDecimalImpl(true, dec64Scale, decimal64Zoo); - DecimalColumnVectorTest.testDecimalImpl(false, 0, boundaryDecimal32); - 
DecimalColumnVectorTest.testDecimalImpl(true, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, dec32Scale, decimal32Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, dec64Scale, decimal64Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, dec128Scale, decimal128Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, 0, boundaryDecimal32); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, 0, boundaryDecimal128); } - private static void testDecimalImpl(boolean isInt64, int scale, BigDecimal[] decimalZoo) { - try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { - try (HostColumnVector hcv = cv.copyToHost()) { - assertEquals(-scale, hcv.getType().getScale()); - assertEquals(isInt64, hcv.getType().typeId == DType.DTypeEnum.DECIMAL64); - assertEquals(decimalZoo.length, hcv.rows); - for (int i = 0; i < decimalZoo.length; i++) { - assertEquals(decimalZoo[i] == null, hcv.isNull(i)); - if (decimalZoo[i] != null) { - assertEquals(decimalZoo[i].floatValue(), hcv.getBigDecimal(i).floatValue()); - long backValue = isInt64 ? hcv.getLong(i) : hcv.getInt(i); - assertEquals(decimalZoo[i].setScale(scale, RoundingMode.UNNECESSARY), BigDecimal.valueOf(backValue, scale)); + private static void testDecimalImpl(DType.DTypeEnum decimalType, int scale, BigDecimal[] decimalZoo) { + Consumer assertions = (hcv) -> { + assertEquals(-scale, hcv.getType().getScale()); + assertEquals(hcv.getType().typeId, decimalType); + assertEquals(decimalZoo.length, hcv.rows); + for (int i = 0; i < decimalZoo.length; i++) { + assertEquals(decimalZoo[i] == null, hcv.isNull(i)); + if (decimalZoo[i] != null) { + BigDecimal actual; + switch (decimalType) { + case DECIMAL32: + actual = BigDecimal.valueOf(hcv.getInt(i), scale); + break; + case DECIMAL64: + actual = BigDecimal.valueOf(hcv.getLong(i), scale); + break; + default: + actual = hcv.getBigDecimal(i); } + assertEquals(decimalZoo[i].subtract(actual).longValueExact(), 0L); } } + }; + try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { + try (HostColumnVector hcv = cv.copyToHost()) { + assertions.accept(hcv); + } + } + try (HostColumnVector hcv = ColumnBuilderHelper.fromDecimals(decimalZoo)) { + assertions.accept(hcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java index d82565e1d2d..fa34429685e 100644 --- a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
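The decimal boundary values above (18 nines for DECIMAL64, 38 nines for DECIMAL128) line up with the maximum precisions behind the DType.fromJavaBigDecimal message fix earlier in this patch. A hedged sketch of that precision check, where the constant values are assumed from those boundary tests and the full method (per the hunk above) first returns the narrowest decimal type that fits:

import java.math.BigDecimal;

class DecimalPrecisionSketch {
  // Assumed limits, consistent with the boundary tests above. 2^127 - 1 has
  // 39 digits, so every 38-digit unscaled value fits in a 128-bit integer.
  static final int DECIMAL64_MAX_PRECISION = 18;
  static final int DECIMAL128_MAX_PRECISION = 38;

  static void requireSupportedPrecision(BigDecimal dec) {
    if (dec.precision() > DECIMAL128_MAX_PRECISION) {
      // The DType.java fix: report the DECIMAL128 limit (38), not the DECIMAL64 limit (18).
      throw new IllegalArgumentException("Precision " + dec.precision() +
          " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION);
    }
  }
}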
@@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -40,34 +41,51 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(0), 2.1, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(1), 3.02, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(2), 5.003, 0.001); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEqualsWithinPercentage(cv.getDouble(0), 2.1, 0.01); + assertEqualsWithinPercentage(cv.getDouble(1), 3.02, 0.01); + assertEqualsWithinPercentage(cv.getDouble(2), 5.003, 0.001); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(3)); - assertFalse(doubleColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getDouble(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getDouble(-1)); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = - HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -75,6 +93,14 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector dcv = + HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromBoxedDoubles( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index 2fb8164534b..7d6311fb24c 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -34,47 +35,75 @@ public void testCreateColumnVectorBuilder() { try (ColumnVector intColumnVector = ColumnVector.build(DType.INT32, 3, (b) -> b.append(1))) { assertFalse(intColumnVector.hasNulls()); } + try (ColumnVector intColumnVector = ColumnBuilderHelper.buildOnDevice( + new HostColumnVector.BasicType(true, DType.INT32), 3, (b) -> b.append(1))) { + assertFalse(intColumnVector.hasNulls()); + } } @Test public void testArrayAllocation() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertEquals(intColumnVector.getInt(0), 2); - assertEquals(intColumnVector.getInt(1), 3); - assertEquals(intColumnVector.getInt(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getInt(0), 2); + assertEquals(cv.getInt(1), 3); + assertEquals(cv.getInt(2), 5); + }; + try (HostColumnVector cv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(cv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { - assertFalse(v.hasNulls()); - assertEquals(0xfedcba98L, Integer.toUnsignedLong(v.getInt(0))); - assertEquals(0x80000000L, Integer.toUnsignedLong(v.getInt(1))); - assertEquals(5, Integer.toUnsignedLong(v.getInt(2))); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xfedcba98L, Integer.toUnsignedLong(cv.getInt(0))); + assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(1))); + assertEquals(5, Integer.toUnsignedLong(cv.getInt(2))); + }; + try (HostColumnVector cv = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(false, 0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertThrows(AssertionError.class, () -> intColumnVector.getInt(3)); - assertFalse(intColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getInt(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> intColumnVector.getInt(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getInt(-1)); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, 
null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -82,13 +111,18 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(true, 2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( - 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -98,6 +132,14 @@ public void testAddingUnsignedNullValues() { assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(false, + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); } } diff --git a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java index 43c2b5a99c2..193992f5304 100644 --- a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
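These test refactors run the same assertions against both the HostColumnVector factories and the rewritten ColumnBuilder (via ColumnBuilderHelper). One behavior worth noting: with the lazy validity buffer, an all-non-null build never allocates a null mask at all; the mask is materialized only when the first null arrives. A small usage sketch assembled from APIs shown in this patch (illustrative, not an added test):

try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(
    new HostColumnVector.BasicType(true, DType.INT32), 3)) {
  b.append(2).append(3); // fixed-width grow path runs; no valid buffer exists yet
  b.appendNull();        // first null: the null handler materializes the valid buffer
  try (HostColumnVector cv = b.build()) {
    assert cv.hasNulls();
    assert cv.getNullCount() == 1;
    assert cv.isNull(2);
  }
}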
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -38,46 +39,71 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertEquals(longColumnVector.getLong(0), 2); - assertEquals(longColumnVector.getLong(1), 3); - assertEquals(longColumnVector.getLong(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getLong(0), 2); + assertEquals(cv.getLong(1), 3); + assertEquals(cv.getLong(2), 5); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true,2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromUnsignedLongs( - 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); assertEquals(Long.toUnsignedString(0xfedcba9876543210L), - Long.toUnsignedString(longColumnVector.getLong(0))); + Long.toUnsignedString(cv.getLong(0))); assertEquals(Long.toUnsignedString(0x8000000000000000L), - Long.toUnsignedString(longColumnVector.getLong(1))); - assertEquals(5L, longColumnVector.getLong(2)); + Long.toUnsignedString(cv.getLong(1))); + assertEquals(5L, cv.getLong(2)); + }; + try (HostColumnVector lcv = HostColumnVector.fromUnsignedLongs( + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(false, + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertThrows(AssertionError.class, () -> longColumnVector.getLong(3)); - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getLong(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> longColumnVector.getLong(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getLong(-1)); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -85,13 +111,19 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = 
HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(true, + 2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedLongs( - 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -103,6 +135,14 @@ public void testAddingUnsignedNullValues() { Long.toUnsignedString(cv.getLong(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedUnsignedLongs( + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(false, + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); } } diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import pandas as pd @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -190,7 +190,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2866c1c003c..19313dd3fe2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -986,7 +986,7 @@ def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed. To explicitly construct a host array, consider using " - ".to_array()" + ".to_numpy()" ) @property @@ -1271,6 +1271,12 @@ def column_empty( column_empty(row_count, field_dtype) for field_dtype in dtype.fields.values() ) + elif is_list_dtype(dtype): + data = None + children = ( + full(row_count + 1, 0, dtype="int32"), + column_empty(row_count, dtype=dtype.element_type), + ) elif is_categorical_dtype(dtype): data = None children = ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 323a5ad088a..3735a949277 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6027,6 +6027,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError("axis parameter is not supported yet.") + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bc666430189..2e01a29b961 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import copy import pickle import warnings @@ -5982,12 +5983,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.eq(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.eq(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True False False 2 True True False False @@ -6057,12 +6058,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... 
) >>> left.ne(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.ne(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False True True 2 False False True True @@ -6132,12 +6133,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.lt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.lt(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False False True 2 False False False True @@ -6207,12 +6208,12 @@ def le(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.le(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.le(right, fill_value=7) - a b c d + a b c d 0 True True True True 1 True True False True 2 True True False True @@ -6282,12 +6283,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.gt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.gt(right, fill_value=7) - a b c d + a b c d 0 False False False False 1 False False True False 2 False False True False @@ -6357,12 +6358,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ge(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.ge(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True True False 2 True True True False @@ -6402,6 +6403,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a mapping of each column name to the number of unique values + in that column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by the underlying distinct-count implementation + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Mapping of column name to its count of unique values. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 66194f0f877..12a2538b776 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2756,7 +2756,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 2623569afac..ef479f19363 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import builtins from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -325,3 +326,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Return the count of unique values in the column.
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 889662c8a1c..ba2caf7c6c8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9081,6 +9081,32 @@ def test_dataframe_assign_cp_np_array():
     assert_eq(pdf, gdf)


+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}],
+)
+def test_dataframe_nunique(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.nunique()
+    expected = pdf.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
+)
+def test_dataframe_nunique_index(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.index.nunique()
+    expected = pdf.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 def test_dataframe_rename_duplicate_column():
     gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     with pytest.raises(
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 44749103b54..fc9ad9711d1 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import functools
 import operator

@@ -586,3 +587,18 @@ def test_listcol_setitem_error_cases(data, item, error):
     sr = cudf.Series(data)
    with pytest.raises(BaseException, match=error):
         sr[1] = item
+
+
+def test_listcol_setitem_retain_dtype():
+    df = cudf.DataFrame(
+        {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]}
+    )
+    df1 = df.head(0)
+    # Performing a setitem on `b` triggers a `column.column_empty_like` call,
+    # which tries to create an empty ListColumn.
+    df1["b"] = df1["c"]
+    # Performing a copy triggers a dtype copy, which is obtained by accessing
+    # `ListColumn.children`; these would have been corrupted by the previous
+    # call prior to this fix: https://github.com/rapidsai/cudf/pull/10151/
+    df2 = df1.copy()
+    assert df2["a"].dtype == df["a"].dtype
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 1c80fe80f2d..358484d79b9 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1521,6 +1521,32 @@ def test_series_transpose(data):
     assert_eq(cudf_transposed, csr)


+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.nunique()
+    expected = pd_s.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique_index(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.index.nunique()
+    expected = pd_s.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "fill_value,data",
     [
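The Series tests above only cover non-null data, so the null-handling contract is worth spelling out: `Series.nunique` returns 0 for an all-null series and excludes nulls by default, while `dropna=False` counts the null as one extra distinct value. A small sketch of the expected semantics, derived from the code in this diff rather than copied from a live session:

    import cudf

    s = cudf.Series([1, 2, None, 2])
    print(s.nunique())              # 2: nulls excluded by default
    print(s.nunique(dropna=False))  # 3: the null counts as a distinct value
    # All-null series short-circuits to 0 before reaching distinct_count.
    print(cudf.Series([None, None], dtype="float64").nunique())  # 0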
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 149d98ebfb9..1bc270a5b9f 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import math
 from operator import getitem
 from typing import Set
@@ -42,19 +43,11 @@ def __init__(self, *args, **kwargs):
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
-                self.obj,
-                by=self.index,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
+                self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna,
             )
         else:
             g = CudfSeriesGroupBy(
-                self.obj,
-                by=self.index,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
+                self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna,
             )

         g._meta = g._meta[key]
@@ -63,8 +56,8 @@ def __getitem__(self, key):
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
-            {c: "mean" for c in self.obj.columns if c not in self.index},
+            self.by,
+            {c: "mean" for c in self.obj.columns if c not in self.by},
             split_every=split_every,
             split_out=split_out,
             dropna=self.dropna,
@@ -76,8 +69,8 @@ def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
-            {c: "collect" for c in self.obj.columns if c not in self.index},
+            self.by,
+            {c: "collect" for c in self.obj.columns if c not in self.by},
             split_every=split_every,
             split_out=split_out,
             dropna=self.dropna,
@@ -94,10 +87,10 @@ def aggregate(self, arg, split_every=None, split_out=1):
         if (
             isinstance(self.obj, DaskDataFrame)
             and (
-                isinstance(self.index, str)
+                isinstance(self.by, str)
                 or (
-                    isinstance(self.index, list)
-                    and all(isinstance(x, str) for x in self.index)
+                    isinstance(self.by, list)
+                    and all(isinstance(x, str) for x in self.by)
                 )
             )
             and _is_supported(arg, SUPPORTED_AGGS)
@@ -133,7 +126,7 @@ def __init__(self, *args, **kwargs):
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "mean"},
             split_every=split_every,
             split_out=split_out,
@@ -146,7 +139,7 @@ def mean(self, split_every=None, split_out=1):
     def std(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "std"},
             split_every=split_every,
             split_out=split_out,
@@ -159,7 +152,7 @@ def std(self, split_every=None, split_out=1):
     def var(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "var"},
             split_every=split_every,
             split_out=split_out,
@@ -172,7 +165,7 @@ def var(self, split_every=None, split_out=1):
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "collect"},
             split_every=split_every,
             split_out=split_out,
@@ -192,12 +185,12 @@ def aggregate(self, arg, split_every=None, split_out=1):

         if (
             isinstance(self.obj, DaskDataFrame)
-            and isinstance(self.index, (str, list))
+            and isinstance(self.by, (str, list))
             and _is_supported(arg, SUPPORTED_AGGS)
         ):
             return groupby_agg(
                 self.obj,
-                self.index,
+                self.by,
                 arg,
                 split_every=split_every,
                 split_out=split_out,
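The `self.index` to `self.by` renames track the corresponding attribute rename on groupby objects in upstream Dask; the wrapped aggregation logic is otherwise unchanged. A small end-to-end sketch of the affected path, assuming a working dask_cudf installation and a CUDA-capable GPU:

    import cudf
    import dask_cudf

    gdf = cudf.DataFrame(
        {"key": [0, 1, 1, 0, 0, 1], "val": [1.0, 8.0, 3.0, 9.0, -3.0, 8.0]}
    )
    ddf = dask_cudf.from_cudf(gdf, npartitions=2)
    # mean() and aggregate() now read the grouping column(s) from `.by`
    print(ddf.groupby("key").mean().compute())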