diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5f0be6d797a..59e2ea224f6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -14,9 +14,9 @@ python/dask_cudf/ @rapidsai/cudf-dask-codeowners java/ @rapidsai/cudf-java-codeowners #build/ops code owners -.github/ @rapidsai/ops-codeowners -/ci/ @rapidsai/ops-codeowners +.github/ @rapidsai/ops-codeowners +/ci/ @rapidsai/ops-codeowners conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners -docker/ @rapidsai/ops-codeowners +/Dockerfile @rapidsai/ops-codeowners +/.dockerignore @rapidsai/ops-codeowners +/docker/ @rapidsai/ops-codeowners diff --git a/README.md b/README.md index c0fa500ad77..687d25c200b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,17 @@ **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. +## Resources + +- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides. +- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference. +- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF. +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. +- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code. +- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features. + +## Overview + Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. 
diff --git a/build.sh b/build.sh index d75053f8849..bc49b76d44e 100755 --- a/build.sh +++ b/build.sh @@ -192,6 +192,7 @@ fi # Build libcudf_kafka library if hasArg libcudf_kafka; then cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \ + ${CUDF_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 5657d21889f..1be8a6b450a 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -58,6 +58,7 @@ test: - test -f $PREFIX/include/cudf/ast/linearizer.hpp - test -f $PREFIX/include/cudf/ast/operators.hpp - test -f $PREFIX/include/cudf/binaryop.hpp + - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/column/column_factories.hpp - test -f $PREFIX/include/cudf/column/column.hpp - test -f $PREFIX/include/cudf/column/column_view.hpp @@ -66,6 +67,7 @@ test: - test -f $PREFIX/include/cudf/datetime.hpp - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp + - test -f $PREFIX/include/cudf/detail/label_bins.hpp - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp @@ -132,6 +134,7 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3e875b71ca6..fc439ebfa7f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= 
-# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -156,6 +156,7 @@ add_library(cudf src/binaryop/jit/code/kernel.cpp src/binaryop/jit/code/operation.cpp src/binaryop/jit/code/traits.cpp + src/labeling/label_bins.cu src/bitmask/null_mask.cu src/column/column.cu src/column/column_device_view.cu @@ -194,7 +195,7 @@ add_library(cudf src/groupby/hash/groupby.cu src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu - src/groupby/sort/groupby.cu + src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu src/groupby/sort/group_count.cu src/groupby/sort/group_max.cu @@ -204,6 +205,11 @@ add_library(cudf src/groupby/sort/group_quantiles.cu src/groupby/sort/group_std.cu src/groupby/sort/group_sum.cu + src/groupby/sort/scan.cpp + src/groupby/sort/group_count_scan.cu + src/groupby/sort/group_max_scan.cu + src/groupby/sort/group_min_scan.cu + src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/interop/dlpack.cpp @@ -410,7 +416,7 @@ target_compile_options(cudf target_compile_definitions(cudf PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" - "$<$:${CUDF_CUDA_DEFINITIONS}>" + "$:${CUDF_CUDA_DEFINITIONS}>>" ) # Disable Jitify log printing. 
See https://github.com/NVIDIA/jitify/issues/79 @@ -505,10 +511,11 @@ add_library(cudftestutil STATIC target_compile_options(cudftestutil PUBLIC "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>" + "$:${CUDF_CUDA_FLAGS}>>" ) -target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14) +target_compile_features(cudftestutil + PUBLIC cxx_std_14 $) target_link_libraries(cudftestutil PUBLIC GTest::gmock @@ -522,7 +529,7 @@ target_include_directories(cudftestutil install(TARGETS cudftestutil DESTINATION lib - EXPORT cudf-targets) + EXPORT cudf-testing-targets) add_library(cudf::cudftestutil ALIAS cudftestutil) @@ -600,6 +607,11 @@ install(EXPORT cudf-targets NAMESPACE cudf:: DESTINATION "${INSTALL_CONFIGDIR}") +install(EXPORT cudf-testing-targets + FILE cudf-testing-targets.cmake + NAMESPACE cudf:: + DESTINATION "${INSTALL_CONFIGDIR}") + ################################################################################################ # - build export ------------------------------------------------------------------------------- configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR}/cudf-config.cmake @@ -628,6 +640,10 @@ export(EXPORT cudf-targets FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake NAMESPACE cudf::) +export(EXPORT cudf-testing-targets + FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake + NAMESPACE cudf::) + ################################################################################################### # - make documentation ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ded5a4bb596..7fd84b508ac 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -173,8 +173,12 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) ################################################################################################### -# - 
subword tokenizer benchmark ------------------------------------------------------------------- -ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp) +# - nvtext benchmark ------------------------------------------------------------------- +ConfigureBench(TEXT_BENCH + text/normalize_benchmark.cpp + text/normalize_spaces_benchmark.cpp + text/tokenize_benchmark.cpp + text/subword_benchmark.cpp) ################################################################################################### # - strings benchmark ------------------------------------------------------------------- @@ -191,6 +195,8 @@ ConfigureBench(STRINGS_BENCH string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp + string/replace_re_benchmark.cpp string/split_benchmark.cpp string/substring_benchmark.cpp + string/translate_benchmark.cpp string/url_decode_benchmark.cpp) diff --git a/cpp/benchmarks/string/replace_re_benchmark.cpp b/cpp/benchmarks/string/replace_re_benchmark.cpp new file mode 100644 index 00000000000..616e2c0f22c --- /dev/null +++ b/cpp/benchmarks/string/replace_re_benchmark.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +class StringReplace : public cudf::benchmark { +}; + +enum replace_type { replace_re, replace_re_multi, replace_backref }; + +static void BM_replace(benchmark::State& state, replace_type rt) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + cudf::test::strings_column_wrapper repls({"#", ""}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (rt) { + case replace_type::replace_re: // single pattern; no replacement string passed + cudf::strings::replace_re(input, "\\d+"); + break; + case replace_type::replace_re_multi: // multiple patterns with a column of replacement strings + cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls)); + break; + case replace_type::replace_backref: // replacement template uses a back-reference to group 1 + cudf::strings::replace_with_backrefs(input, "(\\d+)", "#\\1X"); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringReplace, name) \ + (::benchmark::State & st) { BM_replace(st, name); } \ + BENCHMARK_REGISTER_F(StringReplace, name) \ + 
->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(replace_re) +STRINGS_BENCHMARK_DEFINE(replace_re_multi) +STRINGS_BENCHMARK_DEFINE(replace_backref) diff --git a/cpp/benchmarks/string/translate_benchmark.cpp b/cpp/benchmarks/string/translate_benchmark.cpp new file mode 100644 index 00000000000..c49a986d744 --- /dev/null +++ b/cpp/benchmarks/string/translate_benchmark.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +class StringTranslate : public cudf::benchmark { +}; + +using entry_type = std::pair; + +static void BM_translate(benchmark::State& state, int entry_count) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + std::vector entries(entry_count); + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(entry_count), + entries.begin(), + [](auto idx) -> entry_type { + return entry_type{'!' + idx, '~' - idx}; + }); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::strings::translate(input, entries); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name, entries) \ + BENCHMARK_DEFINE_F(StringTranslate, name) \ + (::benchmark::State & st) { BM_translate(st, entries); } \ + BENCHMARK_REGISTER_F(StringTranslate, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(translate_small, 5) +STRINGS_BENCHMARK_DEFINE(translate_medium, 25) +STRINGS_BENCHMARK_DEFINE(translate_large, 50) diff --git 
a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp new file mode 100644 index 00000000000..32c4fb7dcde --- /dev/null +++ b/cpp/benchmarks/text/normalize_benchmark.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +class TextNormalize : public cudf::benchmark { +}; + +static void BM_normalize(benchmark::State& state, bool to_lower) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + nvtext::normalize_characters(input, to_lower); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + for (int row_count = min_rows; row_count <= max_rows; row_count 
*= row_mult) { + for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { + // avoid generating combinations that exceed the cudf column limit + size_t total_chars = static_cast(row_count) * rowlen * 4; + if (total_chars < std::numeric_limits::max()) { + b->Args({row_count, rowlen}); + } + } + } +} + +#define NVTEXT_BENCHMARK_DEFINE(name, lower) \ + BENCHMARK_DEFINE_F(TextNormalize, name) \ + (::benchmark::State & st) { BM_normalize(st, lower); } \ + BENCHMARK_REGISTER_F(TextNormalize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(characters, false) +NVTEXT_BENCHMARK_DEFINE(to_lower, true) diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp new file mode 100644 index 00000000000..dcabb0c225c --- /dev/null +++ b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +class TextNormalize : public cudf::benchmark { +}; + +static void BM_normalize(benchmark::State& state) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + nvtext::normalize_spaces(input); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextNormalize, name) \ + (::benchmark::State & st) { BM_normalize(st); } \ + BENCHMARK_REGISTER_F(TextNormalize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(spaces) diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize_benchmark.cpp new file mode 100644 index 00000000000..f9e742f0f31 --- /dev/null +++ b/cpp/benchmarks/text/tokenize_benchmark.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +class TextTokenize : public cudf::benchmark { +}; + +enum class tokenize_type { single, multi, count, count_multi, ngrams }; + +static void BM_tokenize(benchmark::State& state, tokenize_type tt) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (tt) { + case tokenize_type::single: nvtext::tokenize(input); break; + case tokenize_type::multi: + nvtext::tokenize(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::count: nvtext::count_tokens(input); break; + case tokenize_type::count_multi: + nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::ngrams: + // default is bigrams + nvtext::ngrams_tokenize(input); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; 
+ int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextTokenize, name) \ + (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \ + BENCHMARK_REGISTER_F(TextTokenize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(single) +NVTEXT_BENCHMARK_DEFINE(multi) +NVTEXT_BENCHMARK_DEFINE(count) +NVTEXT_BENCHMARK_DEFINE(count_multi) +NVTEXT_BENCHMARK_DEFINE(ngrams) diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index d4be6e65021..b0d048c6294 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -18,7 +18,7 @@ find_package(CUDAToolkit REQUIRED) # Auto-detect available GPU compute architectures -include(${CUDF_SOURCE_DIR}/cmake/Modules/SetGPUArchs.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/SetGPUArchs.cmake) message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") # Must come after find_package(CUDAToolkit) because we symlink @@ -29,10 +29,6 @@ enable_language(CUDA) if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) - if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS) - # Suppress parentheses warning which causes gmock to fail - list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-parentheses) - endif() endif() list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) @@ -46,6 +42,9 @@ if(DISABLE_DEPRECATION_WARNING) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() +# make sure we produce smallest binary size +list(APPEND CUDF_CUDA_FLAGS -Xfatbin=-compress-all) + # Option to enable line info in CUDA device compilation to allow introspection when profiling / 
memchecking if(CUDA_ENABLE_LINEINFO) list(APPEND CUDF_CUDA_FLAGS -lineinfo) diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake index 61e4e6bc198..f09d5ead8e2 100644 --- a/cpp/cmake/Modules/SetGPUArchs.cmake +++ b/cpp/cmake/Modules/SetGPUArchs.cmake @@ -58,7 +58,7 @@ if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS) list(APPEND CMAKE_CUDA_ARCHITECTURES ${latest_arch}) elseif(${PROJECT_NAME}_BUILD_FOR_DETECTED_ARCHS) - include(${PROJECT_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/EvalGPUArchs.cmake) evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES) list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index 3f4d2e5586e..d0c5a608e45 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -50,6 +50,11 @@ if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") endif() include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake") + +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") +endif() + include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake") check_required_components(cudf) diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index 0a478516f18..14f8a661c2f 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -1,7 +1,70 @@ @PACKAGE_INIT@ + +#[=======================================================================[ + +Provide targets for the cudf library. + +Built based on the Apache Arrow columnar memory format, cuDF is a GPU DataFrame +library for loading, joining, aggregating, filtering, and otherwise +manipulating data. 
+ +cuDF provides a pandas-like API that will be familiar to data engineers & +data scientists, so they can use it to easily accelerate their workflows +without going into the details of CUDA programming. + + +Imported Targets +^^^^^^^^^^^^^^^^ + +If cudf is found, this module defines the following IMPORTED GLOBAL +targets: + + cudf::cudf - The main cudf library. + +This module offers an optional testing component which defines the +following IMPORTED GLOBAL targets: + + cudf::cudftestutil - The main cudf testing library + cudf::gmock + cudf::gmock_main + cudf::gtest + cudf::gtest_main + + +Result Variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project:: + + CUDF_FOUND + CUDF_VERSION + CUDF_VERSION_MAJOR + CUDF_VERSION_MINOR + +#]=======================================================================] + + cmake_minimum_required(VERSION 3.18) +set(_possible_targets_to_promote + cudf::cudf + cudf::benchmark + cudf::benchmark_main + cudf::gmock + cudf::gtest + cudf::gmock_main + cudf::gtest_main + cudf::cudftestutil + rmm::rmm + arrow_shared + arrow_cuda_shared ) +foreach(t IN LISTS _possible_targets_to_promote) + if(NOT TARGET ${t}) + list(APPEND _targets_to_promote ${t}) + endif() +endforeach() + set(CUDF_VERSION @CUDF_VERSION@) set(CUDF_VERSION_MAJOR @CUDF_VERSION_MAJOR@) set(CUDF_VERSION_MINOR @CUDF_VERSION_MINOR@) @@ -26,7 +89,6 @@ set(ArrowCUDA_DIR "${Arrow_DIR}") find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@) find_dependency(rmm @CUDF_MIN_VERSION_rmm@) -find_dependency(GTest @CUDF_MIN_VERSION_GTest@) set(Thrust_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust") find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@) @@ -35,10 +97,23 @@ thrust_create_target(cudf::Thrust FROM_OPTIONS) list(POP_FRONT CMAKE_MODULE_PATH) include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake") + +if(testing IN_LIST cudf_FIND_COMPONENTS) + enable_language(CUDA) + + find_dependency(GTest @CUDF_MIN_VERSION_GTest@) + 
include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") +endif() + include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake") check_required_components(cudf) +foreach(t IN LISTS _targets_to_promote) + if(TARGET ${t}) + set_target_properties(${t} PROPERTIES IMPORTED_GLOBAL TRUE) + endif() +endforeach() set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") include(FindPackageHandleStandardArgs) diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake index 5162aaf6ce7..19c07933d42 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake @@ -23,7 +23,8 @@ include(${CPM_DOWNLOAD_LOCATION}) function(fix_cmake_global_defaults target) if(TARGET ${target}) get_target_property(_is_imported ${target} IMPORTED) - if(_is_imported) + get_target_property(_already_global ${target} IMPORTED_GLOBAL) + if(_is_imported AND NOT _already_global) set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE) endif() endif() diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake index e346dce1730..666ba0fbb2c 100644 --- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake @@ -15,6 +15,11 @@ #============================================================================= function(find_and_configure_gtest VERSION) + + if(TARGET GTest::gtest) + return() + endif() + # Find or install GoogleTest CPMFindPackage(NAME GTest VERSION ${VERSION} @@ -44,7 +49,7 @@ function(find_and_configure_gtest VERSION) gmock_main gtest_main DESTINATION lib - EXPORT cudf-targets) + EXPORT cudf-testing-targets) endif() endfunction() diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake index 54e0a8620c5..e5d1f2f07a9 100644 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake @@ -28,6 +28,11 @@ function(cudf_restore_if_enabled var) endfunction() 
function(find_and_configure_rmm VERSION) + + if(TARGET rmm::rmm) + return() + endif() + # Consumers have two options for local source builds: # 1. Pass `-D CPM_rmm_SOURCE=/path/to/rmm` to build a local RMM source tree # 2. Pass `-D CMAKE_PREFIX_PATH=/path/to/rmm/build` to use an existing local diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a81b6ebc8a1..3c454c85720 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,7 +74,8 @@ class aggregation { NUNIQUE, ///< count number of unique elements NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - COLLECT, ///< collect values into a list + COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction @@ -205,18 +206,35 @@ std::unique_ptr make_nth_element_aggregation( std::unique_ptr make_row_number_aggregation(); /** - * @brief Factory to create a COLLECT aggregation + * @brief Factory to create a COLLECT_LIST aggregation * - * `COLLECT` returns a list column of all included elements in the group/series. + * `COLLECT_LIST` returns a list column of all included elements in the group/series. * * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each * of the list rows. * * @param null_handling Indicates whether to include/exclude nulls in list elements. 
*/ -std::unique_ptr make_collect_aggregation( +std::unique_ptr make_collect_list_aggregation( null_policy null_handling = null_policy::INCLUDE); +/** + * @brief Factory to create a COLLECT_SET aggregation + * + * `COLLECT_SET` returns a lists column of all included elements in the group/series. Within each + * list, the duplicated entries are dropped out such that each entry appears only once. + * + * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each + * of the list rows. + * + * @param null_handling Indicates whether to include/exclude nulls during collection + * @param null_equal Flag to specify whether null entries within each list should be considered + * equal + */ +std::unique_ptr make_collect_set_aggregation( + null_policy null_handling = null_policy::INCLUDE, + null_equality null_equal = null_equality::EQUAL); + /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 8ec26cf5eb7..27bcb0d320b 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -187,7 +187,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); #else - release_assert(false && "Invalid operator."); + cudf_assert(false && "Invalid operator."); #endif break; } @@ -784,7 +784,7 @@ struct double_dispatch_binary_operator_types { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation."); #else - release_assert(false && "Invalid binary operation."); + cudf_assert(false && "Invalid binary operation."); #endif } }; @@ -819,7 +819,7 @@ struct single_dispatch_binary_operator_types { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation."); #else - release_assert(false && "Invalid binary operation."); + cudf_assert(false && "Invalid binary operation."); #endif } }; @@ -924,7 +924,7 @@ struct 
dispatch_unary_operator_types { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation."); #else - release_assert(false && "Invalid unary operation."); + cudf_assert(false && "Invalid unary operation."); #endif } }; @@ -996,7 +996,7 @@ struct return_type_functor { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); #else - release_assert(false && "Invalid binary operation. Return type cannot be determined."); + cudf_assert(false && "Invalid binary operation. Return type cannot be determined."); #endif } @@ -1024,7 +1024,7 @@ struct return_type_functor { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); #else - release_assert(false && "Invalid unary operation. Return type cannot be determined."); + cudf_assert(false && "Invalid unary operation. Return type cannot be determined."); #endif } }; diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index ee08742d871..2719a8b5077 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -87,7 +87,7 @@ struct unary_row_output : public row_output { Input input, detail::device_data_reference output) const { - release_assert(false && "Invalid unary dispatch operator for the provided input."); + cudf_assert(false && "Invalid unary dispatch operator for the provided input."); } }; @@ -116,7 +116,7 @@ struct binary_row_output : public row_output { RHS rhs, detail::device_data_reference output) const { - release_assert(false && "Invalid binary dispatch operator for the provided input."); + cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } }; @@ -239,7 +239,7 @@ struct row_evaluator { detail::device_data_reference rhs, detail::device_data_reference output) const { - release_assert(false && "Invalid binary dispatch operator for the provided input."); + cudf_assert(false && "Invalid binary dispatch operator for the 
provided input."); } private: @@ -311,7 +311,7 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, output, op); } else { - release_assert(false && "Invalid operator arity."); + cudf_assert(false && "Invalid operator arity."); } } } diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index b2f152180b0..5a02f5bbe55 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -774,7 +774,7 @@ struct index_element_fn { std::is_unsigned::value)>* = nullptr> __device__ size_type operator()(Args&&... args) { - release_assert(false and "dictionary indices must be an unsigned integral type"); + cudf_assert(false and "dictionary indices must be an unsigned integral type"); return 0; } }; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 3d006449044..3f5f5a91632 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -19,8 +19,8 @@ #include #include #include +#include #include -#include #include #include @@ -103,7 +103,7 @@ struct update_target_element { column_device_view source, size_type source_index) const noexcept { - release_assert(false and "Invalid source type and aggregation combination."); + cudf_assert(false and "Invalid source type and aggregation combination."); } }; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 1cafad25c9c..18bef301e03 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include #include @@ -320,11 +320,11 @@ struct udf_aggregation final : derived_aggregation { }; /** - * @brief Derived aggregation class for specifying COLLECT aggregation + * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ struct collect_list_aggregation final : derived_aggregation { explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) - : derived_aggregation{COLLECT}, _null_handling{null_handling} + : derived_aggregation{COLLECT_LIST}, _null_handling{null_handling} { } null_policy _null_handling; ///< include or exclude nulls @@ -340,6 +340,32 @@ struct collect_list_aggregation final : derived_aggregation size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } }; +/** + * @brief Derived aggregation class for specifying COLLECT_SET aggregation + */ +struct collect_set_aggregation final : derived_aggregation { + explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, + null_equality null_equal = null_equality::EQUAL) + : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal) + { + } + null_policy _null_handling; ///< include or exclude nulls + null_equality _null_equal; ///< whether to consider nulls as equal values + + protected: + friend class derived_aggregation; + + bool operator==(collect_set_aggregation const& other) const + { + return _null_handling == other._null_handling && _null_equal == other._null_equal; + } + + size_t hash_impl() const + { + return std::hash{}(static_cast(_null_handling) ^ static_cast(_null_equal)); + } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. 
* @@ -514,9 +540,15 @@ struct target_type_impl { using type = cudf::size_type; }; -// Always use list for COLLECT +// Always use list for COLLECT_LIST +template +struct target_type_impl { + using type = cudf::list_view; +}; + +// Always use list for COLLECT_SET template -struct target_type_impl { +struct target_type_impl { using type = cudf::list_view; }; @@ -617,8 +649,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::ROW_NUMBER: return f.template operator()(std::forward(args)...); - case aggregation::COLLECT: - return f.template operator()(std::forward(args)...); + case aggregation::COLLECT_LIST: + return f.template operator()(std::forward(args)...); + case aggregation::COLLECT_SET: + return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: @@ -627,7 +661,7 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); #else - release_assert(false && "Unsupported aggregation."); + cudf_assert(false && "Unsupported aggregation."); // The following code will never be reached, but the compiler generates a // warning if there isn't a return value. 
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 87f5c9251c7..73647ac2292 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -18,8 +18,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index cadcb1265c4..a68d649b8c8 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -63,8 +63,8 @@ struct sort_groupby_helper { sorted keys_pre_sorted = sorted::NO) : _keys(keys), _num_keys(-1), - _include_null_keys(include_null_keys), - _keys_pre_sorted(keys_pre_sorted) + _keys_pre_sorted(keys_pre_sorted), + _include_null_keys(include_null_keys) { if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and has_nulls(keys)) { diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 8568bd68bfd..8bbd0d1aada 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -268,7 +268,7 @@ struct input_indexalator : base_indexalator { template ()>* = nullptr> __device__ size_type operator()(void const* tp) { - release_assert(false and "only index types are supported"); + cudf_assert(false and "only index types are supported"); return 0; } }; @@ -366,7 +366,7 @@ struct output_indexalator : base_indexalator { template ()>* = nullptr> __device__ void operator()(void* tp, size_type const value) { - release_assert(false and "only index types are supported"); + cudf_assert(false and "only index types are supported"); } }; diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp new file mode 100644 index 00000000000..b4da6d888fa --- /dev/null +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf { + +namespace detail { + +/** + * @addtogroup label_bins + * @{ + * @file + * @brief Internal APIs for labeling values by bin. + */ + +/** + * @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive + * left_inclusive, column_view const& right_edges, inclusive right_inclusive, null_order + * edge_null_precedence null_order::BEFORE, rmm::mr::device_memory_resource* mr) + * + * @param stream Stream view on which to allocate resources and queue execution. 
+ */ +std::unique_ptr label_bins( + column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 93f54cff588..08dae998944 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -23,8 +23,6 @@ #include -using cudf::device_span; - namespace cudf { namespace detail { /** diff --git a/cpp/include/cudf/detail/utilities/release_assert.cuh b/cpp/include/cudf/detail/utilities/assert.cuh similarity index 87% rename from cpp/include/cudf/detail/utilities/release_assert.cuh rename to cpp/include/cudf/detail/utilities/assert.cuh index e0db88d8fcb..69f9e2d3791 100644 --- a/cpp/include/cudf/detail/utilities/release_assert.cuh +++ b/cpp/include/cudf/detail/utilities/assert.cuh @@ -27,11 +27,11 @@ * * Relies on the `__PRETTY_FUNCTION__` macro which is specific to GCC and Clang. */ -#if defined(__CUDA_ARCH__) && (defined(__clang__) || defined(__GNUC__)) +#if !defined(NDEBUG) && defined(__CUDA_ARCH__) && (defined(__clang__) || defined(__GNUC__)) #define __ASSERT_STR_HELPER(x) #x -#define release_assert(e) \ +#define cudf_assert(e) \ ((e) ? 
static_cast(0) \ : __assert_fail(__ASSERT_STR_HELPER(e), __FILE__, __LINE__, __PRETTY_FUNCTION__)) #else -#define release_assert(e) (static_cast(0)) +#define cudf_assert(e) (static_cast(0)) #endif diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 8b04651e1e6..31533a69487 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include #include @@ -155,7 +155,7 @@ struct MD5ListHasher { size_type offset_end, md5_intermediate_data* hash_state) const { - release_assert(false && "MD5 Unsupported chrono type column"); + cudf_assert(false && "MD5 Unsupported chrono type column"); } template ()>* = nullptr> @@ -164,7 +164,7 @@ struct MD5ListHasher { size_type offset_end, md5_intermediate_data* hash_state) const { - release_assert(false && "MD5 Unsupported non-fixed-width type column"); + cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); } template ()>* = nullptr> @@ -274,7 +274,7 @@ struct MD5Hash { size_type row_index, md5_intermediate_data* hash_state) const { - release_assert(false && "MD5 Unsupported chrono type column"); + cudf_assert(false && "MD5 Unsupported chrono type column"); } template ()>* = nullptr> @@ -282,7 +282,7 @@ struct MD5Hash { size_type row_index, md5_intermediate_data* hash_state) const { - release_assert(false && "MD5 Unsupported non-fixed-width type column"); + cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); } template ()>* = nullptr> @@ -345,7 +345,7 @@ void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, column_device_view offsets = col.child(offsets_column_index); column_device_view data = col.child(data_column_index); - if (data.type().id() == type_id::LIST) release_assert(false && "Nested list unsupported"); + if (data.type().id() == type_id::LIST) cudf_assert(false && 
"Nested list unsupported"); cudf::type_dispatcher(data.type(), MD5ListHasher{}, @@ -765,7 +765,7 @@ struct IdentityHash { CUDA_HOST_DEVICE_CALLABLE std::enable_if_t::value, return_type> operator()(Key const& key) const { - release_assert(false && "IdentityHash does not support this data type"); + cudf_assert(false && "IdentityHash does not support this data type"); return 0; } diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 8f8e2b7394c..eb752a8a0ea 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include // Note: The versions are used in order for Jitify to work with our fixed_point type. @@ -91,7 +91,7 @@ template ())>* = nullptr> CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) { - release_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); + cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); if (exponent == 0) return static_cast(1); auto extra = static_cast(1); auto square = static_cast(Base); diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index f7f7f51479d..1dfacd53e0d 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,61 @@ class groupby { std::vector const& requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Performs grouped scans on the specified values. + * + * The values to aggregate and the aggregations to perform are specifed in an + * `aggregation_request`. 
Each request contains a `column_view` of values to + * aggregate and a set of `aggregation`s to perform on those elements. + * + * For each `aggregation` in a request, `values[i]` is scan aggregated with + * all previous `values[j]` where rows `i` and `j` in `keys` are equivalent. + * + * The `size()` of the request column must equal `keys.num_rows()`. + * + * For every `aggregation_request` an `aggregation_result` will be returned. + * The `aggregation_result` holds the resulting column(s) for each requested + * aggregation on the `request`s values. The order of the columns in each + * result is the same order as was specified in the request. + * + * The returned `table` contains the group labels for each row, i.e., the + * `keys` given to groupby object. Element `i` across all aggregation results + * belongs to the group at row `i` in the group labels table. + * + * The order of the rows in the group labels is arbitrary. Furthermore, + * successive `groupby::scan` calls may return results in different orders. + * + * @throws cudf::logic_error If `requests[i].values.size() != + * keys.num_rows()`. + * + * Example: + * ``` + * Input: + * keys: {1 2 1 3 1} + * {1 2 1 4 1} + * request: + * values: {3 1 4 9 2} + * aggregations: {{SUM}, {MIN}} + * + * result: + * + * keys: {3 1 1 1 2} + * {4 1 1 1 2} + * values: + * SUM: {9 3 7 9 1} + * MIN: {9 3 3 2 1} + * ``` + * + * @param requests The set of columns to scan and the scans to perform + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * @return Pair containing the table with each group's key and + * a vector of aggregation_results for each request in the same order as + * specified in `requests`. + */ + std::pair, std::vector> scan( + std::vector const& requests, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief The grouped data corresponding to a groupby operation on a set of values. 
* @@ -231,6 +286,11 @@ class groupby { std::vector const& requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + + std::pair, std::vector> sort_scan( + std::vector const& requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); }; /** @} */ } // namespace groupby diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 3e63e8fc770..7cb3db1eb30 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -24,6 +24,8 @@ #include +#include + #include #include #include diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp new file mode 100644 index 00000000000..7244698f8a2 --- /dev/null +++ b/cpp/include/cudf/labeling/label_bins.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace cudf { + +/** + * @addtogroup label_bins + * @{ + * @file + * @brief APIs for labeling values by bin. + */ + +/** + * @brief Enum used to define whether or not bins include their boundary points. + */ +enum class inclusive { YES, NO }; + +/** + * @brief Labels elements based on membership in the specified bins. + * + * A bin `i` is defined by `left_edges[i], right_edges[i]`. Whether the edges are inclusive or + * not is determined by `left_inclusive` and `right_inclusive`, respectively. 
+ * + * A value `input[j]` belongs to bin `i` if `input[j]` is contained in the range `left_edges[i], + * right_edges[i]` (with the specified inclusiveness) and `label[j] == i`. If `input[j]` does not + * belong to any bin, then `label[j]` is NULL. + * + * Notes: + * - If an empty set of edges is provided, all elements in `input` are labeled NULL. + * - NULL elements in `input` belong to no bin and their corresponding label is NULL. + * - NaN elements in `input` belong to no bin and their corresponding label is NULL. + * - Bins must be provided in monotonically increasing order, otherwise behavior is undefined. + * - If two or more bins overlap, behavior is undefined. + * + * @throws cudf::logic_error if `input.type() == left_edges.type() == right_edges.type()` is + * violated. + * @throws cudf::logic_error if `left_edges.size() != right_edges.size()` + * @throws cudf::logic_error if `left_edges.has_nulls()` or `right_edges.has_nulls()` + * + * @param input The input elements to label according to the specified bins. + * @param left_edges Values of the left edge of each bin. + * @param left_inclusive Whether or not the left edge is inclusive. + * @param right_edges Values of the right edge of each bin. + * @param right_inclusive Whether or not the right edge is inclusive. + * @param edge_null_precedence Whether nulls in left and right edges are at the beginning or the + * end. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The integer labels of the elements in `input` according to the specified bins.
+ */ +std::unique_ptr label_bins( + column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace cudf diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp new file mode 100644 index 00000000000..ba3e1d17d7f --- /dev/null +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @copydoc cudf::lists::drop_list_duplicates + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr drop_list_duplicates( + lists_column_view const& lists_column, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 3afafe9d1fa..4f207474526 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -37,12 +37,12 @@ class list_device_view { : lists_column(lists_column), _row_index(row_index) { column_device_view const& offsets = lists_column.offsets(); - release_assert(row_index >= 0 && row_index < lists_column.size() && - row_index < offsets.size() && "row_index out of bounds"); + cudf_assert(row_index >= 0 && row_index < lists_column.size() && row_index < offsets.size() && + "row_index out of bounds"); begin_offset = offsets.element(row_index); - release_assert(begin_offset >= 0 && begin_offset <= lists_column.child().size() && - "begin_offset out of bounds."); + cudf_assert(begin_offset >= 0 && begin_offset <= lists_column.child().size() && + "begin_offset out of bounds."); _size = offsets.element(row_index + 1) - begin_offset; } @@ -71,7 +71,7 @@ class list_device_view { */ CUDA_DEVICE_CALLABLE size_type element_offset(size_type idx) const { - release_assert(idx >= 0 && idx < size() && "idx out of bounds"); + cudf_assert(idx >= 0 && idx < size() && "idx out of bounds"); return begin_offset + idx; } @@ -93,7 +93,7 @@ class list_device_view { */ CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const { - release_assert(idx >= 0 && idx < size() && "Index out of bounds."); + cudf_assert(idx >= 0 && idx < size() && "Index out of bounds."); auto element_offset = begin_offset + idx; return lists_column.child().is_null(element_offset); } @@ -294,7 +294,7 @@ struct list_size_functor { CUDA_HOST_DEVICE_CALLABLE 
list_size_functor(column_device_view const& d_col) : d_column(d_col) { #if defined(__CUDA_ARCH__) - release_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported"); + cudf_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported"); #else CUDF_EXPECTS(d_col.type().id() == type_id::LIST, "Only list type column is supported"); #endif diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 1e2fa80b129..4d29b0a5b6a 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -78,7 +78,10 @@ std::unique_ptr from_integers( * characters are valid for conversion to integers. * * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. + * has all characters in [-+0-9]. The optional sign character must only be in the first + * position. Notice that the integer value is not checked to be within its storage limits. + * For strict integer type check, use the other `is_integer()` API which accepts `data_type` + * argument. * * @code{.pseudo} * Example: @@ -89,12 +92,44 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string.
+ */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has all characters in [-+0-9]. The optional sign character must only be in the first + * position. Also, the integer component must fit within the size limits of the underlying + * storage type, which is provided by the int_type parameter. + * + * @code{.pseudo} + * Example: + * s = ['123456', '-456', '', 'A', '+7'] + * + * output1 = s.is_integer(s, data_type{type_id::INT32}) + * output1 is [true, true, false, false, true] + * + * output2 = s.is_integer(s, data_type{type_id::INT8}) + * output2 is [false, false, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param int_type Integer type used for checking underflow and overflow. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. 
*/ std::unique_ptr is_integer( strings_column_view const& strings, + data_type int_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index d9840e78be2..04d215ff7cb 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -17,8 +17,8 @@ #pragma once #include +#include #include -#include #include #include #include @@ -190,7 +190,7 @@ class element_equality_comparator { std::enable_if_t()>* = nullptr> __device__ bool operator()(size_type lhs_element_index, size_type rhs_element_index) { - release_assert(false && "Attempted to compare elements of uncomparable types."); + cudf_assert(false && "Attempted to compare elements of uncomparable types."); return false; } @@ -291,7 +291,7 @@ class element_relational_comparator { std::enable_if_t()>* = nullptr> __device__ weak_ordering operator()(size_type lhs_element_index, size_type rhs_element_index) { - release_assert(false && "Attempted to compare elements of uncomparable types."); + cudf_assert(false && "Attempted to compare elements of uncomparable types."); return weak_ordering::LESS; } diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 48e5d9543b8..7a3316a0571 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,8 +24,6 @@ #define CUDA_DEVICE_CALLABLE inline #endif -#include // TODO no idea why this is needed ¯\_(ツ)_/¯ - #include #include #include diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 26c51d0435a..bd9ea015a32 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include #include @@ -501,7 +501,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher(cudf::data_ty #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported type_id."); #else - release_assert(false && "Unsupported type_id."); + cudf_assert(false && "Unsupported type_id."); // The following code will never be reached, but the compiler generates a // warning if there isn't a return value. diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 3f3efdb7626..65dd5c73475 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -147,6 +147,7 @@ * @defgroup lists_gather Gathering * @defgroup lists_elements Counting * @defgroup lists_drop_duplicates Filtering + * @defgroup lists_sort Sorting * @} * @defgroup nvtext_apis NVText * @{ @@ -164,4 +165,8 @@ * @defgroup utility_bitmask Bitmask * @defgroup utility_error Exception * @} + * @defgroup labeling_apis Labeling + * @{ + * @defgroup label_bins Bin Labeling + * @} */ diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 04dc8776d20..33c19617308 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -125,11 +125,17 @@ std::unique_ptr make_row_number_aggregation() { return std::make_unique(aggregation::ROW_NUMBER); } -/// Factory to create a COLLECT aggregation -std::unique_ptr make_collect_aggregation(null_policy null_handling) +/// Factory to create a COLLECT_LIST aggregation +std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } +/// Factory to create a COLLECT_SET aggregation +std::unique_ptr make_collect_set_aggregation(null_policy null_handling, + null_equality null_equal) +{ + return std::make_unique(null_handling, null_equal); +} /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) { diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 60167d77507..845a5512c27 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -44,8 +44,6 @@ #include #include -using cudf::device_span; - namespace cudf { size_type state_null_count(mask_state state, size_type size) { diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 487aed4b411..cdd8ceb0a6c 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,24 @@ std::pair, std::vector> groupby::aggr return dispatch_aggregation(requests, 0, mr); } +// Compute scan requests +std::pair, std::vector> groupby::scan( + std::vector const& requests, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS( + std::all_of(requests.begin(), + requests.end(), + [this](auto const& request) { return request.values.size() == _keys.num_rows(); }), + "Size mismatch between request values and groupby keys."); + + verify_valid_requests(requests); + + if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + + return sort_scan(requests, rmm::cuda_stream_default, mr); +} + groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index a491b50478a..24de22705a9 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -20,8 +20,8 @@ #include #include #include +#include #include -#include #include #include @@ -65,7 +65,7 @@ struct var_hash_functor { size_type source_index, size_type target_index) noexcept { - release_assert(false and "Invalid source type for std, var aggregation combination."); + cudf_assert(false and "Invalid source type for std, var aggregation combination."); } template diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/aggregate.cpp similarity index 79% rename from cpp/src/groupby/sort/groupby.cu rename to cpp/src/groupby/sort/aggregate.cpp index 5c54dd3cb4c..b171b19413b 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,20 +15,20 @@ */ #include -#include "group_reductions.hpp" +#include +#include #include #include -#include #include #include #include #include #include -#include #include #include #include +#include #include #include #include @@ -51,71 +51,17 @@ namespace detail { * memoised sorted and/or grouped values and re-using will save on computation * of these values. */ -struct store_result_functor { - store_result_functor(size_type col_idx, - column_view const& values, - sort::sort_groupby_helper& helper, - cudf::detail::result_cache& cache, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr) - { - } - +struct aggregrate_result_functor final : store_result_functor { + using store_result_functor::store_result_functor; template void operator()(aggregation const& agg) { + CUDF_FAIL("Unsupported aggregation."); } - - private: - /** - * @brief Get the grouped values - * - * Computes the grouped values from @p values on first invocation and returns - * the stored result on subsequent invocation - */ - column_view get_grouped_values() - { - // TODO (dm): After implementing single pass multi-agg, explore making a - // cache of all grouped value columns rather than one at a time - if (grouped_values) - return grouped_values->view(); - else if (sorted_values) - // TODO (dm): When we implement scan, it wouldn't be ok to return sorted - // values when asked for grouped values. Change this then. 
- return sorted_values->view(); - else - grouped_values = helper.grouped_values(values); - return grouped_values->view(); - }; - - /** - * @brief Get the grouped and sorted values - * - * Computes the grouped and sorted (within each group) values from @p values - * on first invocation and returns the stored result on subsequent invocation - */ - column_view get_sorted_values() - { - if (not sorted_values) sorted_values = helper.sorted_values(values); - return sorted_values->view(); - }; - - private: - size_type col_idx; ///< Index of column in requests being operated on - sort::sort_groupby_helper& helper; ///< Sort helper - cudf::detail::result_cache& cache; ///< cache of results to store into - column_view const& values; ///< Column of values to group and aggregate - - rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels - rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results - - std::unique_ptr sorted_values; ///< Memoised grouped and sorted values - std::unique_ptr grouped_values; ///< Memoised grouped values }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -129,7 +75,7 @@ void store_result_functor::operator()(aggregation cons } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -138,7 +84,7 @@ void store_result_functor::operator()(aggregation const& } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -149,7 +95,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void 
aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -164,7 +110,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -179,7 +125,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -216,7 +162,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -253,7 +199,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -277,7 +223,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -300,7 +246,7 @@ void store_result_functor::operator()(aggregation const& }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -314,7 +260,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if 
(cache.has_result(col_idx, agg)) return; @@ -335,7 +281,7 @@ void store_result_functor::operator()(aggregation const& }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -355,7 +301,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -372,7 +318,7 @@ void store_result_functor::operator()(aggregation const& a }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -401,12 +347,12 @@ void store_result_functor::operator()(aggregation cons } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { auto null_handling = static_cast(agg)._null_handling; CUDF_EXPECTS(null_handling == null_policy::INCLUDE, - "null exclusion is not supported on groupby COLLECT aggregation."); + "null exclusion is not supported on groupby COLLECT_LIST aggregation."); if (cache.has_result(col_idx, agg)) return; @@ -416,6 +362,25 @@ void store_result_functor::operator()(aggregation const& a cache.add_result(col_idx, agg, std::move(result)); }; +template <> +void aggregrate_result_functor::operator()(aggregation const& agg) +{ + auto const null_handling = + static_cast(agg)._null_handling; + CUDF_EXPECTS(null_handling == null_policy::INCLUDE, + "null exclusion is not supported on groupby COLLECT_SET aggregation."); + + if (cache.has_result(col_idx, agg)) { return; } + + auto const collect_result = detail::group_collect( + get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); + auto const nulls_equal = + 
static_cast(agg)._null_equal; + cache.add_result(col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, stream, mr)); +}; } // namespace detail // Sort-based groupby @@ -431,7 +396,7 @@ std::pair, std::vector> groupby::sort for (size_t i = 0; i < requests.size(); i++) { auto store_functor = - detail::store_result_functor(i, requests[i].values, helper(), cache, stream, mr); + detail::aggregrate_result_functor(i, requests[i].values, helper(), cache, stream, mr); for (size_t j = 0; j < requests[i].aggregations.size(); j++) { // TODO (dm): single pass compute all supported reductions cudf::detail::aggregation_dispatcher( diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp new file mode 100644 index 00000000000..565320fbe80 --- /dev/null +++ b/cpp/src/groupby/sort/functors.hpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Functor to dispatch aggregation with + * + * This functor is to be used with `aggregation_dispatcher` to compute the + * appropriate aggregation. If the values on which to run the aggregation are + * unchanged, then this functor should be re-used. 
This is because it stores + * memoised sorted and/or grouped values and re-using will save on computation + * of these values. + */ +struct store_result_functor { + store_result_functor(size_type col_idx, + column_view const& values, + sort::sort_groupby_helper& helper, + cudf::detail::result_cache& cache, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr) + { + } + + protected: + /** + * @brief Get the grouped values + * + * Computes the grouped values from @p values on first invocation and returns + * the stored result on subsequent invocation + */ + column_view get_grouped_values() + { + // TODO (dm): After implementing single pass multi-agg, explore making a + // cache of all grouped value columns rather than one at a time + if (grouped_values) + return grouped_values->view(); + else if (sorted_values) + // In scan, it wouldn't be ok to return sorted values when asked for grouped values. + // It's overridden in scan implementation. + return sorted_values->view(); + else + return (grouped_values = helper.grouped_values(values))->view(); + }; + + /** + * @brief Get the grouped and sorted values + * + * Computes the grouped and sorted (within each group) values from @p values + * on first invocation and returns the stored result on subsequent invocation + */ + column_view get_sorted_values() + { + return sorted_values ? 
sorted_values->view() + : (sorted_values = helper.sorted_values(values))->view(); + }; + + protected: + size_type col_idx; ///< Index of column in requests being operated on + sort::sort_groupby_helper& helper; ///< Sort helper + cudf::detail::result_cache& cache; ///< cache of results to store into + column_view const& values; ///< Column of values to group and aggregate + + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels + rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results + + std::unique_ptr sorted_values; ///< Memoised grouped and sorted values + std::unique_ptr grouped_values; ///< Memoised grouped values +}; +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu index 60e0ce31db1..121e4bb889d 100644 --- a/cpp/src/groupby/sort/group_count.cu +++ b/cpp/src/groupby/sort/group_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu new file mode 100644 index 00000000000..4ad533aebdc --- /dev/null +++ b/cpp/src/groupby/sort/group_count_scan.cu @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr count_scan(cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::unique_ptr result = make_fixed_width_column( + data_type{type_id::INT32}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + + if (group_labels.empty()) { return result; } + + auto resultview = result->mutable_view(); + // aggregation::COUNT_ALL + thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + thrust::make_constant_iterator(1), + resultview.begin()); + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index bd4e676b83d..3f5592186df 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu new file mode 100644 index 00000000000..303d606be9d --- /dev/null +++ b/cpp/src/groupby/sort/group_max_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr max_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu new file mode 100644 index 00000000000..4a692cdf0bd --- /dev/null +++ b/cpp/src/groupby/sort/group_min_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr min_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp new file mode 100644 index 00000000000..efb39068d2e --- /dev/null +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Internal API to calculate groupwise cumulative sum + * + * @param values Grouped values to get sum of + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr sum_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise cumulative minimum value + * + * @param values Grouped values to get minimum from + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr min_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise cumulative maximum value + * + * @param values Grouped values to get maximum from + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr max_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate cumulative number of values in each group + * + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return Column of type INT32 of count values + */ +std::unique_ptr count_scan(cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh new file mode 100644 index 00000000000..9f8614a61b4 --- /dev/null +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +template +struct scan_functor { + template + static constexpr bool is_supported() + { + if (K == aggregation::SUM) + return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); + else if (K == aggregation::MIN or K == aggregation::MAX) + return cudf::is_fixed_width() and is_relationally_comparable(); + else + return false; + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using DeviceType = device_storage_type_t; + using OpType = cudf::detail::corresponding_operator_t; + using ResultType = cudf::detail::target_type_t; + using ResultDeviceType = device_storage_type_t; + + auto result_type = is_fixed_point() + ? 
data_type{type_to_id(), values.type().scale()} + : data_type{type_to_id()}; + + std::unique_ptr result = + make_fixed_width_column(result_type, values.size(), mask_state::UNALLOCATED, stream, mr); + + if (values.is_empty()) { return result; } + + auto result_table = mutable_table_view({*result}); + cudf::detail::initialize_with_identity(result_table, {K}, stream); + + auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto values_view = column_device_view::create(values, stream); + + if (values.has_nulls()) { + auto input = thrust::make_transform_iterator( + make_null_replacement_iterator(*values_view, OpType::template identity()), + thrust::identity{}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + input, + result_view->begin(), + thrust::equal_to{}, + OpType{}); + result->set_null_mask(cudf::detail::copy_bitmask(values, stream)); + } else { + auto input = thrust::make_transform_iterator(values_view->begin(), + thrust::identity{}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + input, + result_view->begin(), + thrust::equal_to{}, + OpType{}); + } + return result; + } + + template + std::enable_if_t(), std::unique_ptr> operator()(Args&&... args) + { + CUDF_FAIL("Unsupported groupby scan type-agg combination"); + } +}; + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu new file mode 100644 index 00000000000..ae9b1c321d4 --- /dev/null +++ b/cpp/src/groupby/sort/group_sum_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr sum_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp new file mode 100644 index 00000000000..63de4ea8684 --- /dev/null +++ b/cpp/src/groupby/sort/scan.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Functor to dispatch aggregation with + * + * This functor is to be used with `aggregation_dispatcher` to compute the + * appropriate aggregation. If the values on which to run the aggregation are + * unchanged, then this functor should be re-used. This is because it stores + * memoised sorted and/or grouped values and re-using will save on computation + * of these values. + */ +struct scan_result_functor final : store_result_functor { + using store_result_functor::store_result_functor; + template + void operator()(aggregation const& agg) + { + CUDF_FAIL("Unsupported groupby scan aggregation"); + } + + private: + column_view get_grouped_values() + { + // TODO (dm): After implementing single pass multi-agg, explore making a + // cache of all grouped value columns rather than one at a time + if (grouped_values) + return grouped_values->view(); + else + return (grouped_values = helper.grouped_values(values))->view(); + }; +}; + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::sum_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::min_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::max_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void 
scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(), stream, mr)); +} +} // namespace detail + +// Sort-based groupby +std::pair, std::vector> groupby::sort_scan( + std::vector const& requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // We're going to start by creating a cache of results so that aggs that + // depend on other aggs will not have to be recalculated. e.g. mean depends on + // sum and count. std depends on mean and count + cudf::detail::result_cache cache(requests.size()); + + for (size_t i = 0; i < requests.size(); i++) { + auto store_functor = + detail::scan_result_functor(i, requests[i].values, helper(), cache, stream, mr); + for (auto const& aggregation : requests[i].aggregations) { + // TODO (dm): single pass compute all supported reductions + cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation); + } + } + + auto results = detail::extract_results(requests, cache); + + return std::make_pair(helper().sorted_keys(stream, mr), std::move(results)); +} +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 6e8937607b9..538e238b5ea 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -68,7 +68,7 @@ struct page_state_s { // (leaf) value decoding int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) int32_t dict_pos; // write position of dictionary indices - int32_t out_pos; // read position of final output + int32_t src_pos; // input read position of final output value int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions 
uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values @@ -963,6 +963,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, if (d + t < s->page.num_nesting_levels) { s->page.nesting[d + t].valid_count = 0; s->page.nesting[d + t].value_count = 0; + s->page.nesting[d + t].null_count = 0; } d += blockDim.x; } @@ -1029,13 +1030,13 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, s->dtype_len = 8; // Convert to 64-bit timestamp } - // first row within the page to start reading + // first row within the page to output if (page_start_row >= min_row) { s->first_row = 0; } else { s->first_row = (int32_t)min(min_row - page_start_row, (size_t)s->page.num_rows); } - // # of rows within the page to read + // # of rows within the page to output s->num_rows = s->page.num_rows; if ((page_start_row + s->first_row) + s->num_rows > min_row + num_rows) { s->num_rows = @@ -1127,43 +1128,54 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, s->nz_count = 0; s->num_input_values = s->page.num_input_values; s->dict_pos = 0; - s->out_pos = 0; - - // handle row bounds (skip_rows, min_rows) - s->input_row_count = s->first_row; + s->src_pos = 0; + + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // preprocess of the definition levels (since nulls will have no actual decodable value, there + // is no direct correlation between # of rows and # of decodable values). so we will start + // processing at the beginning of the value stream and disregard any indices that start + // before the first row. + if (s->col.max_level[level_type::REPETITION] == 0) { + s->page.skipped_values = 0; + s->page.skipped_leaf_values = 0; + s->input_value_count = 0; + s->input_row_count = 0; - // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. 
That is to say, - // we can encounter the first value for row X on page M, but the last value for page M - // might not be the last value for row X. page M+1 (or further) may contain the last value. - // - // This means that the first values we encounter for a given page (M+1) may not belong to the - // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that - // previous row is within the overall row bounds, include the values by allowing relative row - // index -1 - int max_row = (min_row + num_rows) - 1; - if (min_row < page_start_row && max_row >= page_start_row - 1) { s->row_index_lower_bound = -1; - } else { - s->row_index_lower_bound = s->first_row; } - - // if we're in the decoding step, jump directly to the first - // value we care about - if (s->col.column_data_base != nullptr) { - // for flat hierarchies, we haven't computed skipped_values yet, but we can do so trivially - // now - if (s->col.max_level[level_type::REPETITION] == 0) { - s->page.skipped_values = s->first_row; - s->page.skipped_leaf_values = s->first_row; + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // we need to start decoding at + else { + // input_row_count translates to "how many rows we have processed so far", so since we are + // skipping directly to where we want to start decoding, set it to first_row + s->input_row_count = s->first_row; + + // return the lower bound to compare (page-relative) thread row index against. Explanation: + // In the case of nested schemas, rows can span page boundaries. That is to say, + // we can encounter the first value for row X on page M, but the last value for page M + // might not be the last value for row X. page M+1 (or further) may contain the last value. + // + // This means that the first values we encounter for a given page (M+1) may not belong to the + // row indicated by chunk_row, but to the row before it that spanned page boundaries. 
If that + // previous row is within the overall row bounds, include the values by allowing relative row + // index -1 + int const max_row = (min_row + num_rows) - 1; + if (min_row < page_start_row && max_row >= page_start_row - 1) { + s->row_index_lower_bound = -1; + } else { + s->row_index_lower_bound = s->first_row; } - s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; - } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = -1; - s->page.skipped_leaf_values = -1; + // if we're in the decoding step, jump directly to the first + // value we care about + if (s->col.column_data_base != nullptr) { + s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; + } else { + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = -1; + s->page.skipped_leaf_values = -1; + } } __threadfence_block(); @@ -1279,7 +1291,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu int t) { // max nesting depth of the column - int max_depth = s->col.max_nesting_depth; + int const max_depth = s->col.max_nesting_depth; // how many (input) values we've processed in the page so far int input_value_count = s->input_value_count; // how many rows we've processed in the page so far @@ -1304,19 +1316,19 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. - int is_new_row = start_depth == 0 ? 1 : 0; - uint32_t warp_row_count_mask = ballot(is_new_row); - int32_t thread_row_index = + int const is_new_row = start_depth == 0 ? 
1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int32_t const thread_row_index = input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); input_row_count += __popc(warp_row_count_mask); - // is this thread within row bounds? - int in_row_bounds = thread_row_index >= s->row_index_lower_bound && - thread_row_index < (s->first_row + s->num_rows) - ? 1 - : 0; + // is this thread within read row bounds? + int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && + thread_row_index < (s->first_row + s->num_rows) + ? 1 + : 0; // compute warp and thread value counts - uint32_t warp_count_mask = + uint32_t const warp_count_mask = ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); warp_value_count = __popc(warp_count_mask); @@ -1329,36 +1341,35 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu PageNestingInfo *pni = &s->page.nesting[s_idx]; // if we are within the range of nesting levels we should be adding value indices for - int in_nesting_bounds = + int const in_nesting_bounds = ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; // everything up to the max_def_level is a non-null value - uint32_t is_valid = 0; - if (d >= pni->max_def_level && in_nesting_bounds) { is_valid = 1; } + uint32_t const is_valid = d >= pni->max_def_level && in_nesting_bounds ? 1 : 0; // compute warp and thread valid counts - uint32_t warp_valid_mask; - // for flat schemas, a simple ballot_sync gives us the correct count and bit positions because - // every value in the input matches to a value in the output - if (max_depth == 0) { - warp_valid_mask = ballot(is_valid); - } - // for nested schemas, it's more complicated. This warp will visit 32 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so the - // validity bit for thread t might actually represent output value t-6. 
the correct position - // for thread t's bit is cur_value_count. for cuda 11 we could use __reduce_or_sync(), but - // until then we have to do a warp reduce. - else { - warp_valid_mask = WarpReduceOr32(is_valid << thread_value_count); - } + uint32_t const warp_valid_mask = + // for flat schemas, a simple ballot_sync gives us the correct count and bit positions + // because every value in the input matches to a value in the output + max_depth == 1 + ? ballot(is_valid) + : + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is cur_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. + WarpReduceOr32(is_valid << thread_value_count); + thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); warp_valid_count = __popc(warp_valid_mask); // if this is the value column emit an index for value decoding if (is_valid && s_idx == max_depth - 1) { - int idx = pni->valid_count + thread_valid_count; - int ofs = pni->value_count + thread_value_count; - s->nz_idx[rolling_index(idx)] = ofs; + int const src_pos = pni->valid_count + thread_valid_count; + int const dst_pos = pni->value_count + thread_value_count; + // nz_idx is a mapping of src buffer indices to destination buffer indices + s->nz_idx[rolling_index(src_pos)] = dst_pos; } // compute warp and thread value counts for the -next- nesting level. we need to @@ -1366,7 +1377,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // level. 
more concretely : the offset for the current nesting level == current length of the // next nesting level if (s_idx < max_depth - 1) { - uint32_t next_warp_count_mask = + uint32_t const next_warp_count_mask = ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); next_warp_value_count = __popc(next_warp_count_mask); next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); @@ -1375,17 +1386,36 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. if (in_nesting_bounds && pni->data_out != nullptr) { - int idx = pni->value_count + thread_value_count; - cudf::size_type ofs = s->page.nesting[s_idx + 1].value_count + next_thread_value_count + - s->page.nesting[s_idx + 1].page_start_value; + int const idx = pni->value_count + thread_value_count; + cudf::size_type const ofs = s->page.nesting[s_idx + 1].value_count + + next_thread_value_count + + s->page.nesting[s_idx + 1].page_start_value; (reinterpret_cast(pni->data_out))[idx] = ofs; } } - // increment count of valid values, count of total values, and validity mask + // nested schemas always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the + // first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector here + // we need to adjust our computed mask to take into account the write row bounds. + int const in_write_row_bounds = + max_depth == 1 + ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) + : in_row_bounds; + int const first_thread_in_write_range = + max_depth == 1 ? 
__ffs(ballot(in_write_row_bounds)) - 1 : 0; + // # of bits of the validity mask to write out + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + + // increment count of valid values, count of total values, and update validity mask if (!t) { - if (pni->valid_map != nullptr && in_row_bounds) { - store_validity(pni, warp_valid_mask, warp_value_count); + if (pni->valid_map != nullptr && warp_valid_mask_bit_count > 0) { + uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; + store_validity(pni, warp_output_valid_mask, warp_valid_mask_bit_count); + + pni->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } pni->valid_count += warp_valid_count; pni->value_count += warp_value_count; @@ -1669,16 +1699,17 @@ extern "C" __global__ void __launch_bounds__(block_size) ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; } + // skipped_leaf_values will always be 0 for flat hierarchies.
uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->out_pos < s->nz_count)) { + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { int target_pos; - int out_pos = s->out_pos; + int src_pos = s->src_pos; if (t < out_thread0) { target_pos = - min(out_pos + 2 * (block_size - out_thread0), s->nz_count + (block_size - out_thread0)); + min(src_pos + 2 * (block_size - out_thread0), s->nz_count + (block_size - out_thread0)); } else { - target_pos = min(s->nz_count, out_pos + block_size - out_thread0); + target_pos = min(s->nz_count, src_pos + block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } __syncthreads(); @@ -1689,6 +1720,7 @@ extern "C" __global__ void __launch_bounds__(block_size) // - produces non-NULL value indices in s->nz_idx for subsequent decoding gpuDecodeLevels(s, target_pos, t); } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t src_target_pos = target_pos + skipped_leaf_values; // WARP1: Decode dictionary indices, booleans or string positions @@ -1703,49 +1735,72 @@ extern "C" __global__ void __launch_bounds__(block_size) } else { // WARP1..WARP3: Decode values int dtype = s->col.data_type & 7; - out_pos += t - out_thread0; - uint32_t src_pos = out_pos + skipped_leaf_values; - - int output_value_idx = s->nz_idx[rolling_index(out_pos)]; + src_pos += t - out_thread0; + + // the position in the output column/buffer + int dst_pos = s->nz_idx[rolling_index(src_pos)]; + + // for the flat hierarchy case we will be reading from the beginning of the value stream, + // regardless of the value of first_row. so adjust our destination offset accordingly. + // example: + // - user has passed skip_rows = 2, so our first_row to output is 2 + // - the row values we get from nz_idx will be + // 0, 1, 2, 3, 4 .... 
+ // - by shifting these values by first_row, the sequence becomes + // -2, -1, 0, 1, 2 ... + // - so we will end up ignoring the first two input rows, and input rows 2..n will + // get written to the output starting at position 0. + // + if (s->col.max_nesting_depth == 1) { dst_pos -= s->first_row; } + + // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values + // before first_row) in the flat hierarchy case. + if (src_pos < target_pos && dst_pos >= 0) { + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies, there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + uint32_t val_src_pos = src_pos + skipped_leaf_values; - if (out_pos < target_pos && output_value_idx >= 0 && output_value_idx < s->num_input_values) { // nesting level that is storing actual leaf values int leaf_level_index = s->col.max_nesting_depth - 1; uint32_t dtype_len = s->dtype_len; - void *dst = s->page.nesting[leaf_level_index].data_out + - static_cast(output_value_idx) * dtype_len; - if (dtype == BYTE_ARRAY) - gpuOutputString(s, src_pos, dst); - else if (dtype == BOOLEAN) - gpuOutputBoolean(s, src_pos, static_cast(dst)); - else if (s->col.converted_type == DECIMAL) { + void *dst = + s->page.nesting[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + if (dtype == BYTE_ARRAY) { + gpuOutputString(s, val_src_pos, dst); + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(s, val_src_pos, static_cast(dst)); + } else if (s->col.converted_type == DECIMAL) { switch (dtype) { - case INT32: gpuOutputFast(s, src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, src_pos, static_cast(dst)); break; + case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s,
val_src_pos, static_cast(dst)); break; default: // we currently do not support reading byte arrays larger than DECIMAL64 if (s->dtype_len_in <= 8) { - gpuOutputFixedLenByteArrayAsInt64(s, src_pos, static_cast(dst)); + gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast(dst)); } else { - gpuOutputDecimalAsFloat(s, src_pos, static_cast(dst), dtype); + gpuOutputDecimalAsFloat(s, val_src_pos, static_cast(dst), dtype); } break; } - } else if (dtype == INT96) - gpuOutputInt96Timestamp(s, src_pos, static_cast(dst)); - else if (dtype_len == 8) { - if (s->ts_scale) - gpuOutputInt64Timestamp(s, src_pos, static_cast(dst)); - else - gpuOutputFast(s, src_pos, static_cast(dst)); - } else if (dtype_len == 4) - gpuOutputFast(s, src_pos, static_cast(dst)); - else - gpuOutputGeneric(s, src_pos, static_cast(dst), dtype_len); + } else if (dtype == INT96) { + gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); + } else if (dtype_len == 8) { + if (s->ts_scale) { + gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); + } else { + gpuOutputFast(s, val_src_pos, static_cast(dst)); + } + } else if (dtype_len == 4) { + gpuOutputFast(s, val_src_pos, static_cast(dst)); + } else { + gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); + } } - if (t == out_thread0) { *(volatile int32_t *)&s->out_pos = target_pos; } + if (t == out_thread0) { *(volatile int32_t *)&s->src_pos = target_pos; } } __syncthreads(); } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ad3c214069f..555259c443d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -98,6 +98,7 @@ struct PageNestingInfo { // set during data decoding int32_t valid_count; // # of valid values decoded in this page/nesting-level int32_t value_count; // total # of values decoded in this page/nesting-level + int32_t null_count; // null count int32_t valid_map_offset; // current offset in bits relative to valid_map uint8_t *data_out; // 
pointer into output buffer uint32_t *valid_map; // pointer into output validity buffer @@ -128,7 +129,17 @@ struct PageInfo { Encoding definition_level_encoding; // Encoding used for definition levels (data page) Encoding repetition_level_encoding; // Encoding used for repetition levels (data page) + // for nested types, we run a preprocess step in order to determine output + // column sizes. Because of this, we can jump directly to the position in the + // input data to start decoding instead of reading all of the data and discarding + // rows we don't care about. + // + // NOTE: for flat hierarchies we do not do the preprocess step, so skipped_values and + // skipped_leaf_values will always be 0. + // + // # of values skipped in the repetition/definition level stream int skipped_values; + // # of values skipped in the actual data stream. int skipped_leaf_values; // nesting information (input/output) for each page diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 16cf0877c23..698eb1569cb 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1361,7 +1361,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { continue; } - out_buf.null_count() += pni[l_idx].value_count - pni[l_idx].valid_count; + out_buf.null_count() += pni[l_idx].null_count; } } diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu new file mode 100644 index 00000000000..70a6826d9eb --- /dev/null +++ b/cpp/src/labeling/label_bins.cu @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { +namespace { + +// Sentinel used to indicate that an input value should be placed in the null +// bin. +// NOTE: In theory if a user decided to specify 2^31 bins this would fail. We +// could make this an error in Python, but that is such a crazy edge case... +constexpr size_type NULL_VALUE{std::numeric_limits::max()}; + +/* + * Functor for finding bins using thrust::transform. + * + * This functor is stateful, in the sense that it stores (for read-only use) + * pointers to the edge ranges on construction to enable natural use with + * thrust::transform semantics. Inputs are (value, is_valid) pairs: null + * inputs, and values that do not fall inside any bin, are mapped to the + * NULL_VALUE sentinel so that they can be identified and masked out of the + * output afterwards. + */ +template +struct bin_finder { + bin_finder(RandomAccessIterator left_begin, + RandomAccessIterator left_end, + RandomAccessIterator right_begin) + : m_left_begin(left_begin), m_left_end(left_end), m_right_begin(right_begin) + { + } + + __device__ size_type operator()(thrust::pair input_value) const + { + // Immediately return sentinel for null inputs.
+ if (!input_value.second) return NULL_VALUE; + + T value = input_value.first; + auto bound = thrust::lower_bound(thrust::seq, m_left_begin, m_left_end, value, m_left_comp); + + // Exit early and return sentinel for values that lie below the interval. + if (bound == m_left_begin) { return NULL_VALUE; } + + auto index = thrust::distance(m_left_begin, thrust::prev(bound)); + return (m_right_comp(value, m_right_begin[index])) ? index : NULL_VALUE; + } + + const RandomAccessIterator + m_left_begin{}; // The beginning of the range containing the left bin edges. + const RandomAccessIterator m_left_end{}; // The end of the range containing the left bin edges. + const RandomAccessIterator + m_right_begin{}; // The beginning of the range containing the right bin edges. + const LeftComparator m_left_comp{}; // Comparator used for left edges. + const RightComparator m_right_comp{}; // Comparator used for right edges. +}; + +// Functor to identify rows that should be filtered out based on the sentinel set by +// bin_finder::operator(). +struct filter_null_sentinel { + __device__ bool operator()(size_type i) { return i != NULL_VALUE; } +}; + +// Bin the input by the edges in left_edges and right_edges. +template +std::unique_ptr label_bins(column_view const& input, + column_view const& left_edges, + column_view const& right_edges, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto output = make_numeric_column( + data_type(type_to_id()), input.size(), mask_state::UNALLOCATED, stream, mr); + auto output_mutable_view = output->mutable_view(); + auto output_begin = output_mutable_view.begin(); + auto output_end = output_mutable_view.end(); + + // These device column views are necessary for creating iterators that work + // for columns of compound types. The column_view iterators fail for compound + // types because they return raw pointers to the start of the data. The output + // does not require these iterators because it's always a primitive type. 
+ auto input_device_view = column_device_view::create(input, stream); + auto left_edges_device_view = column_device_view::create(left_edges, stream); + auto right_edges_device_view = column_device_view::create(right_edges, stream); + + auto left_begin = left_edges_device_view->begin(); + auto left_end = left_edges_device_view->end(); + auto right_begin = right_edges_device_view->begin(); + + using RandomAccessIterator = decltype(left_edges_device_view->begin()); + + if (input.has_nulls()) { + thrust::transform(rmm::exec_policy(stream), + input_device_view->pair_begin(), + input_device_view->pair_end(), + output_begin, + bin_finder( + left_begin, left_end, right_begin)); + } else { + thrust::transform(rmm::exec_policy(stream), + input_device_view->pair_begin(), + input_device_view->pair_end(), + output_begin, + bin_finder( + left_begin, left_end, right_begin)); + } + + const auto mask_and_count = valid_if(output_begin, output_end, filter_null_sentinel()); + + output->set_null_mask(mask_and_count.first, mask_and_count.second); + return output; +} + +template +constexpr auto is_supported_bin_type() +{ + return cudf::is_relationally_comparable() && cudf::is_equality_comparable(); +} + +struct bin_type_dispatcher { + template + std::enable_if_t(), std::unique_ptr> operator()( + Args&&... 
args) + { + CUDF_FAIL("Type not support for cudf::bin"); + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if ((left_inclusive == inclusive::YES) && (right_inclusive == inclusive::YES)) + return label_bins, thrust::less_equal>( + input, left_edges, right_edges, stream, mr); + if ((left_inclusive == inclusive::YES) && (right_inclusive == inclusive::NO)) + return label_bins, thrust::less>( + input, left_edges, right_edges, stream, mr); + if ((left_inclusive == inclusive::NO) && (right_inclusive == inclusive::YES)) + return label_bins, thrust::less_equal>( + input, left_edges, right_edges, stream, mr); + if ((left_inclusive == inclusive::NO) && (right_inclusive == inclusive::NO)) + return label_bins, thrust::less>( + input, left_edges, right_edges, stream, mr); + + CUDF_FAIL("Undefined inclusive setting."); + } +}; + +} // anonymous namespace + +/// Bin the input by the edges in left_edges and right_edges. +std::unique_ptr label_bins(column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE() + CUDF_EXPECTS((input.type() == left_edges.type()) && (input.type() == right_edges.type()), + "The input and edge columns must have the same types."); + CUDF_EXPECTS(left_edges.size() == right_edges.size(), + "The left and right edge columns must be of the same length."); + CUDF_EXPECTS(!left_edges.has_nulls() && !right_edges.has_nulls(), + "The left and right edge columns cannot contain nulls."); + + // Handle empty inputs. 
+ if (input.is_empty()) { return make_empty_column(data_type(type_to_id())); } + + return type_dispatcher(input.type(), + detail::bin_type_dispatcher{}, + input, + left_edges, + left_inclusive, + right_edges, + right_inclusive, + stream, + mr); +} + +} // namespace detail + +/// Bin the input by the edges in left_edges and right_edges. +std::unique_ptr label_bins(column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::mr::device_memory_resource* mr) +{ + return detail::label_bins( + input, left_edges, left_inclusive, right_edges, right_inclusive, rmm::cuda_stream_default, mr); +} +} // namespace cudf diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 1eb105d296d..529b7489c35 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -225,6 +225,8 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } +} // anonymous namespace + /** * @copydoc cudf::lists::drop_list_duplicates * @@ -276,7 +278,6 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); } -} // anonymous namespace } // namespace detail /** diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 336aabde15e..8233635050e 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 67323126751..1df0a4ab41a 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include @@ -144,7 +144,7 @@ select_quantile(ValueAccessor get_value, size_type size, double q, interpolation default: #if defined(__CUDA_ARCH__) - 
release_assert(false && "Invalid interpolation operation for quantiles"); + cudf_assert(false && "Invalid interpolation operation for quantiles"); return Result(); #else CUDF_FAIL("Invalid interpolation operation for quantiles."); @@ -173,7 +173,7 @@ select_quantile_data(Iterator begin, size_type size, double q, interpolation int } #if defined(__CUDA_ARCH__) - release_assert(false && "Invalid interpolation operation for quantiles"); + cudf_assert(false && "Invalid interpolation operation for quantiles"); return Result(); #else CUDF_FAIL("Invalid interpolation operation for quantiles."); @@ -200,7 +200,7 @@ CUDA_HOST_DEVICE_CALLABLE bool select_quantile_validity(Iterator begin, } #if defined(__CUDA_ARCH__) - release_assert(false && "Invalid interpolation operation for quantiles"); + cudf_assert(false && "Invalid interpolation operation for quantiles"); return false; #else CUDF_FAIL("Invalid interpolation operation for quantiles."); diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index dcc48aafb39..42562507fa9 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -315,7 +315,7 @@ template ::value and !(op == aggregation::COUNT_VALID || op == aggregation::COUNT_ALL || op == aggregation::ROW_NUMBER || op == aggregation::LEAD || - op == aggregation::LAG || op == aggregation::COLLECT)>* = nullptr> + op == aggregation::LAG || op == aggregation::COLLECT_LIST)>* = nullptr> bool __device__ process_rolling_window(column_device_view input, column_device_view ignored_default_outputs, mutable_column_device_view output, @@ -814,7 +814,7 @@ struct rolling_window_launcher { typename PrecedingWindowIterator, typename FollowingWindowIterator> std::enable_if_t> operator()(column_view const& input, column_view const& default_outputs, @@ -897,11 +897,11 @@ struct rolling_window_launcher { } /** - * @brief Creates the offsets child of the result of the `COLLECT` window aggregation + * @brief Creates the offsets 
child of the result of the `COLLECT_LIST` window aggregation * * Given the input column, the preceding/following window bounds, and `min_periods`, * the sizes of each list row may be computed. These values can then be used to - * calculate the offsets for the result of `COLLECT`. + * calculate the offsets for the result of `COLLECT_LIST`. * * Note: If `min_periods` exceeds the number of observations for a window, the size * is set to `0` (since the result is `null`). @@ -945,7 +945,7 @@ struct rolling_window_launcher { } /** - * @brief Generate mapping of each row in the COLLECT result's child column + * @brief Generate mapping of each row in the COLLECT_LIST result's child column * to the index of the row it belongs to. * * If @@ -1030,7 +1030,7 @@ struct rolling_window_launcher { /** * @brief Create gather map to generate the child column of the result of - * the `COLLECT` window aggregation. + * the `COLLECT_LIST` window aggregation. */ template std::unique_ptr create_collect_gather_map(column_view const& child_offsets, @@ -1064,7 +1064,7 @@ struct rolling_window_launcher { } /** - * @brief Count null entries in result of COLLECT. + * @brief Count null entries in result of COLLECT_LIST. 
*/ size_type count_child_nulls(column_view const& input, std::unique_ptr const& gather_map, @@ -1139,7 +1139,7 @@ struct rolling_window_launcher { } template - std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> operator()( + std::enable_if_t<(op == aggregation::COLLECT_LIST), std::unique_ptr> operator()( column_view const& input, column_view const& default_outputs, PrecedingIter preceding_begin_raw, @@ -1150,7 +1150,7 @@ struct rolling_window_launcher { rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(default_outputs.is_empty(), - "COLLECT window function does not support default values."); + "COLLECT_LIST window function does not support default values."); if (input.is_empty()) return empty_like(input); @@ -1370,6 +1370,7 @@ std::unique_ptr rolling_window(column_view const& input, auto input_col = cudf::is_dictionary(input.type()) ? dictionary_column_view(input).get_indices_annotated() : input; + auto output = cudf::type_dispatcher(input_col.type(), dispatch_rolling{}, input_col, diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index d7fa92f1978..18bd0ea2217 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -41,7 +41,7 @@ static constexpr bool is_rolling_supported() (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); constexpr bool is_valid_numeric_agg = (cudf::is_numeric() or cudf::is_duration() or @@ -54,23 +54,23 @@ static constexpr bool is_rolling_supported() return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - 
(op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); } else if (cudf::is_fixed_point()) { return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); } else if (std::is_same()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); } else if (std::is_same()) { return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); } else if (std::is_same()) { // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER. 
- return op == aggregation::COLLECT; + return op == aggregation::COLLECT_LIST; } else { return false; } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2dcece0b3be..8b46f66a48f 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5c5032b5c87..7eee2b3cc0e 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,160 @@ namespace cudf { namespace strings { + +namespace detail { +namespace { + +/** + * @brief This only checks if a string is a valid integer within the bounds of its storage type. + */ +template +struct string_to_integer_check_fn { + __device__ bool operator()(thrust::pair const& p) const + { + if (!p.second || p.first.empty()) { return false; } + + auto const d_str = p.first.data(); + if (d_str[0] == '-' && std::is_unsigned::value) { return false; } + + auto iter = d_str + static_cast((d_str[0] == '-' || d_str[0] == '+')); + auto const iter_end = d_str + p.first.size_bytes(); + if (iter == iter_end) { return false; } + + auto const sign = d_str[0] == '-' ? IntegerType{-1} : IntegerType{1}; + auto const bound_val = + sign > 0 ? 
std::numeric_limits::max() : std::numeric_limits::min(); + + IntegerType value = 0; // parse the string to integer and check for overflow along the way + while (iter != iter_end) { // check all bytes for valid characters + auto const chr = *iter++; + // Check for valid character + if (chr < '0' || chr > '9') { return false; } + + // Check for underflow and overflow: + auto const digit = static_cast(chr - '0'); + auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign; + if (value > bound_check) return false; + value = value * IntegerType{10} + digit; + } + + return true; + } +}; + +/** + * @brief The dispatch functions for checking if strings are valid integers. + */ +struct dispatch_is_integer_fn { + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } else { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } + + // Calling mutable_view() on a column invalidates its null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; + } + + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("is_integer is expecting an integer type"); + } +}; + +} // namespace + +std::unique_ptr is_integer( +
strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + } else { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + } + + // Calling mutable_view() on a column invalidates its null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; +} + +std::unique_ptr is_integer( + strings_column_view const& strings, + data_type int_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + if (strings.is_empty()) { return cudf::make_empty_column(data_type{type_id::BOOL8}); } + return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); +} + +} // namespace detail + +// external APIs +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + +std::unique_ptr is_integer(strings_column_view const& strings, + data_type int_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, int_type, rmm::cuda_stream_default, mr); +} + namespace detail { namespace { /** @@ -69,11 +222,10 @@
struct dispatch_to_integers_fn { mutable_column_view& output_column, rmm::cuda_stream_view stream) const { - auto d_results = output_column.data(); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), - d_results, + output_column.data(), string_to_integer_fn{strings_column}); } // non-integral types throw an exception @@ -102,19 +254,22 @@ std::unique_ptr to_integers(strings_column_view const& strings, { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, + + // Create integer output column copying the strings null-mask + auto results = make_numeric_column(output_type, strings_count, cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), stream, mr); - auto results_view = results->mutable_view(); - // fill output column with integers - type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); + // Fill output column with integers + auto const strings_dev_view = column_device_view::create(strings.parent(), stream); + auto results_view = results->mutable_view(); + type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream); + + // Calling mutable_view() on a column invalidates its null count so we need to set it back results->set_null_count(strings.null_count()); + return results; } @@ -253,42 +408,5 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } -namespace detail { -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr =
rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} -} // namespace detail - -// external API -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 95f9ecbe2ef..cac774ef43e 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ namespace { * * The backref numbers are expected to be 1-based. * - * Returns a modified string without back-ref indicators. + * Returns a modified string without back-ref indicators and a vector of backref + * byte position pairs. 
* ``` * Example: * for input string: 'hello \2 and \1' @@ -51,8 +52,9 @@ namespace { * returned string is: 'hello and ' * ``` */ -std::string parse_backrefs(std::string const& repl, std::vector& backrefs) +std::pair> parse_backrefs(std::string const& repl) { + std::vector backrefs; std::string str = repl; // make a modifiable copy std::smatch m; std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1" @@ -60,21 +62,19 @@ std::string parse_backrefs(std::string const& repl, std::vector& b size_type byte_offset = 0; while (std::regex_search(str, m, ex)) { if (m.size() == 0) break; - backref_type item; - std::string bref = m[0]; - size_type position = static_cast(m.position(0)); - size_type length = static_cast(bref.length()); + std::string const backref = m[0]; + size_type const position = static_cast(m.position(0)); + size_type const length = static_cast(backref.length()); byte_offset += position; - item.first = std::atoi(bref.c_str() + 1); // back-ref index number - CUDF_EXPECTS(item.first > 0, "Back-reference numbers must be greater than 0"); - item.second = byte_offset; // position within the string + size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number + CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0"); rtn += str.substr(0, position); str = str.substr(position + length); - backrefs.push_back(item); + backrefs.push_back({index, byte_offset}); } if (!str.empty()) // add the remainder rtn += str; // of the string - return rtn; + return {rtn, backrefs}; } } // namespace @@ -87,54 +87,54 @@ std::unique_ptr replace_with_backrefs( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); 
CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto regex_insts = d_prog.insts_counts(); + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); + auto const regex_insts = d_prog->insts_counts(); // parse the repl string for backref indicators - std::vector h_backrefs; - std::string repl_template = parse_backrefs(repl, h_backrefs); - rmm::device_vector backrefs(h_backrefs); - string_scalar repl_scalar(repl_template); - string_view d_repl_template{repl_scalar.data(), repl_scalar.size()}; - - // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto null_count = strings.null_count(); + auto const parse_result = parse_backrefs(repl); + rmm::device_uvector backrefs(parse_result.second.size(), stream); + CUDA_TRY(cudaMemcpyAsync(backrefs.data(), + parse_result.second.data(), + sizeof(backref_type) * backrefs.size(), + cudaMemcpyHostToDevice, + stream.value())); + string_scalar repl_scalar(parse_result.first, true, stream); + string_view const d_repl_template = repl_scalar.value(); + + using BackRefIterator = decltype(backrefs.begin()); // create child columns - children_pair children(nullptr, nullptr); - // Each invocation is predicated on the stack size - // which is dependent on the number of regex instructions - if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) { - children = make_strings_children( - backrefs_fn{ - d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings_count, - null_count, - stream, - mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) - children = 
replace_with_backrefs_medium( - d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); - else - children = replace_with_backrefs_large( - d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); - - return make_strings_column(strings_count, + children_pair children = [&] { + // Each invocation is predicated on the stack size + // which is dependent on the number of regex instructions + if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) { + return make_strings_children( + backrefs_fn{ + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, + strings.size(), + strings.null_count(), + stream, + mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) + return replace_with_backrefs_medium( + *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + else + return replace_with_backrefs_large( + *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + }(); + + return make_strings_column(strings.size(), std::move(children.first), std::move(children.second), - null_count, - std::move(null_mask), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index f13d84cf9ca..529b91a98e5 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,13 +42,13 @@ using backref_type = thrust::pair; * Small to medium instruction lengths can use the stack effectively though smaller executes faster. * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
*/ -template +template struct backrefs_fn { column_device_view const d_strings; reprog_device prog; string_view const d_repl; // string replacement template - rmm::device_vector::iterator backrefs_begin; - rmm::device_vector::iterator backrefs_end; + Iterator backrefs_begin; + Iterator backrefs_end; int32_t* d_offsets{}; char* d_chars{}; @@ -117,7 +117,7 @@ using children_pair = std::pair, std::unique_ptr children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -125,7 +125,7 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu index 0b078132623..56bd8941b8a 100644 --- a/cpp/src/strings/replace/backref_re_large.cu +++ b/cpp/src/strings/replace/backref_re_large.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,17 +24,17 @@ namespace cudf { namespace strings { namespace detail { -// children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using Iterator = decltype(backrefs.begin()); return make_strings_children( - backrefs_fn{ + backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu index 899e0cb2a3e..8b1dd6c5999 100644 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ b/cpp/src/strings/replace/backref_re_medium.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,17 +24,17 @@ namespace cudf { namespace strings { namespace detail { -// children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using Iterator = decltype(backrefs.begin()); return make_strings_children( - backrefs_fn{ + backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3eb551ead18..39725361741 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -139,15 +139,13 @@ std::unique_ptr replace_re( auto strings_count = strings.size(); if (strings_count == 0) return make_empty_strings_column(stream, mr); if (patterns.empty()) // no patterns; just return a copy - return std::make_unique(strings.parent()); + return std::make_unique(strings.parent(), stream, mr); CUDF_EXPECTS(!repls.has_nulls(), "Parameter repls must not have any nulls"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - auto repls_column = column_device_view::create(repls.parent(), stream); - auto d_repls = *repls_column; - auto d_flags = get_character_flags_table(); + auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_repls = column_device_view::create(repls.parent(), stream); + auto d_flags = get_character_flags_table(); // compile regexes into device objects size_type regex_insts = 0; @@ -170,37 +168,39 @@ std::unique_ptr replace_re( reprog_device* d_progs = reinterpret_cast(progs_buffer.data()); // create working buffer for ranges pairs - rmm::device_vector found_ranges(patterns.size() * strings_count); - auto d_found_ranges = found_ranges.data().get(); + rmm::device_uvector 
found_ranges(patterns.size() * strings_count, stream); + auto d_found_ranges = found_ranges.data(); // create child columns - std::pair, std::unique_ptr> children(nullptr, nullptr); - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); - else - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); + // std::pair, std::unique_ptr> children(nullptr, nullptr); + auto children = [&] { + // Each invocation is predicated on the stack size which is dependent on the number of regex + // instructions + if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + else if (regex_insts <= RX_MEDIUM_INSTS) + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + else + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + }(); return make_strings_column(strings_count, std::move(children.first), diff --git 
a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 08af1d76d22..138fe3fa508 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,8 @@ #include #include -#include +#include +#include #include @@ -46,31 +46,37 @@ namespace { */ struct translate_fn { column_device_view const d_strings; - rmm::device_vector::iterator table_begin; - rmm::device_vector::iterator table_end; - int32_t const* d_offsets{}; + rmm::device_uvector::iterator table_begin; + rmm::device_uvector::iterator table_end; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; - string_view d_str = d_strings.element(idx); - size_type bytes = d_str.size_bytes(); - char* out_ptr = d_offsets ? d_chars + d_offsets[idx] : nullptr; + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + string_view const d_str = d_strings.element(idx); + + size_type bytes = d_str.size_bytes(); + char* out_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; for (auto chr : d_str) { - auto entry = - thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) { - return te.first == chr; - }); - if (entry != table_end) { + auto const entry = + thrust::lower_bound(thrust::seq, + table_begin, + table_end, + translate_table{chr, 0}, + [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; }); + if (entry != table_end && entry->first == chr) { bytes -= bytes_in_char_utf8(chr); - chr = static_cast(*entry).second; + chr = entry->second; if (chr) // if null, skip the character bytes += bytes_in_char_utf8(chr); } if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr); } - return bytes; + if (!d_chars) d_offsets[idx] = bytes; } }; @@ -83,8 +89,7 @@ std::unique_ptr translate( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); size_type table_size = static_cast(chars_table.size()); // convert input table @@ -92,35 +97,32 @@ std::unique_ptr translate( std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); + // The size of this table is usually much less than 100 so it was + // found to be more efficient to sort on the CPU than the GPU.
+ thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) { + return lhs.first < rhs.first; + }); // copy translate table to device memory - rmm::device_vector table(htable); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // create null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // create offsets column - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, translate_fn{d_strings, table.begin(), table.end()}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars}); - - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + rmm::device_uvector table(htable.size(), stream); + CUDA_TRY(cudaMemcpyAsync(table.data(), + htable.data(), + sizeof(translate_table) * htable.size(), + cudaMemcpyHostToDevice, + stream.value())); + + auto d_strings = column_device_view::create(strings.parent(), stream); + + auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()}, + strings.size(), + strings.null_count(), + stream, + mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } diff 
--git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 6ebe529b56e..e3a43ac25c0 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -14,6 +14,12 @@ * limitations under the License. */ +#include +#include +#include + +#include + #include #include #include @@ -22,17 +28,11 @@ #include #include #include +#include #include #include #include -#include - -#include - -#include -#include - #include #include @@ -54,32 +54,39 @@ namespace { */ struct normalize_spaces_fn { cudf::column_device_view const d_strings; // strings to normalize - int32_t const* d_offsets{}; // offsets into d_buffer - char* d_buffer{}; // output buffer for characters + int32_t* d_offsets{}; // offsets into d_buffer + char* d_chars{}; // output buffer for characters - __device__ int32_t operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { - if (d_strings.is_null(idx)) return 0; - cudf::string_view single_space(" ", 1); + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + cudf::string_view const single_space(" ", 1); auto const d_str = d_strings.element(idx); - char* buffer = d_offsets ? d_buffer + d_offsets[idx] : nullptr; + char* buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; char* optr = buffer; // running output pointer int32_t nbytes = 0; // holds the number of bytes per output string - // create tokenizer for this string with whitespace delimiter (default) + + // create a tokenizer for this string with whitespace delimiter (default) characters_tokenizer tokenizer(d_str); + // this will retrieve tokens automatically skipping runs of whitespace while (tokenizer.next_token()) { - auto token_pos = tokenizer.token_byte_positions(); + auto const token_pos = tokenizer.token_byte_positions(); nbytes += token_pos.second - token_pos.first + 1; // token size plus a single space if (optr) { - cudf::string_view token(d_str.data() + token_pos.first, token_pos.second - token_pos.first); + cudf::string_view const token(d_str.data() + token_pos.first, + token_pos.second - token_pos.first); if (optr != buffer) // prepend space unless we are at the beginning optr = cudf::strings::detail::copy_string(optr, single_space); // write token to output buffer - optr = cudf::strings::detail::copy_string(optr, token); // copy token to output + optr = cudf::strings::detail::copy_string(optr, token); } } - return (nbytes > 0) ? nbytes - 1 : 0; // remove trailing space + // remove trailing space + if (!d_chars) d_offsets[idx] = (nbytes > 0) ? 
nbytes - 1 : 0; } }; @@ -95,7 +102,7 @@ struct codepoint_to_utf8_fn { cudf::column_device_view const d_strings; // input strings uint32_t const* cp_data; // full code-point array int32_t const* d_cp_offsets{}; // offsets to each string's code-point array - int32_t const* d_offsets{}; // offsets for the output strings + int32_t* d_offsets{}; // offsets for the output strings char* d_chars{}; // buffer for the output strings column /** @@ -105,7 +112,7 @@ struct codepoint_to_utf8_fn { * @param count number of code-points in `str_cps` * @return Number of bytes required for the output */ - __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) + __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count) { return thrust::transform_reduce( thrust::seq, @@ -113,17 +120,23 @@ struct codepoint_to_utf8_fn { str_cps + count, [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, 0, - thrust::plus()); + thrust::plus()); } - __device__ cudf::size_type operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } auto const d_str = d_strings.element(idx); auto const offset = d_cp_offsets[idx]; auto const count = d_cp_offsets[idx + 1] - offset; // number of code-points auto str_cps = cp_data + offset; // code-points for this string - if (!d_chars) return compute_output_size(str_cps, count); + if (!d_chars) { + d_offsets[idx] = compute_output_size(str_cps, count); + return; + } // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { @@ -149,7 +162,6 @@ struct codepoint_to_utf8_fn { *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); } } - return 0; } }; @@ -161,40 +173,20 @@ std::unique_ptr normalize_spaces( rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - cudf::size_type strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create device column - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // copy bitmask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // create offsets by calculating size of each string for output - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, normalize_spaces_fn{d_strings}); // this does size-only calc - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build the chars column - cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto d_strings = cudf::column_device_view::create(strings.parent(), stream); - // copy tokens to the chars buffer - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - normalize_spaces_fn{d_strings, d_offsets, d_chars}); - chars_column->set_null_count(0); // reset null count for child column + // build offsets and children using the normalize_spaces_fn + auto children = cudf::strings::detail::make_strings_children( + normalize_spaces_fn{*d_strings}, strings.size(), strings.null_count(), stream, mr); - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return
cudf::make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -207,8 +199,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it data_normalizer normalizer(stream, do_lower_case); @@ -229,33 +220,20 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // - the cp_offsets identify which code-points go with which strings uint32_t const* cp_chars = result.first->data(); int32_t const* cp_offsets = reinterpret_cast(result.second->data()); - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - - // build the output offsets column: compute the output size of each string - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - // create the output chars column - cudf::size_type output_bytes = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), output_bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto d_strings = cudf::column_device_view::create(strings.parent(), stream); - // build the chars output data: convert the 4-byte code-point values into UTF-8 
chars - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); - chars_column->set_null_count(0); // reset null count for child column + // build offsets and children using the codepoint_to_utf8_fn + auto children = cudf::strings::detail::make_strings_children( + codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, + strings.size(), + strings.null_count(), + stream, + mr); - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return cudf::make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e95aab16098..ab14c2577bb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -54,6 +54,7 @@ ConfigureTest(ERROR_TEST error/error_handling_test.cu) ################################################################################################### # - groupby tests --------------------------------------------------------------------------------- ConfigureTest(GROUPBY_TEST + groupby/collect_set_test.cpp groupby/groupby_groups_test.cpp groupby/group_argmin_test.cpp groupby/group_argmax_test.cpp @@ -70,7 +71,11 @@ ConfigureTest(GROUPBY_TEST groupby/group_quantile_test.cpp groupby/group_nunique_test.cpp groupby/group_nth_element_test.cpp - groupby/group_collect_test.cpp) + groupby/group_collect_test.cpp + groupby/group_sum_scan_test.cpp + groupby/group_min_scan_test.cpp + groupby/group_max_scan_test.cpp + groupby/group_count_scan_test.cpp) ################################################################################################### # - join tests ------------------------------------------------------------------------------------ @@ -394,6 +399,11 @@ ConfigureTest(LISTS_TEST 
lists/extract_tests.cpp lists/sort_lists_tests.cpp) +################################################################################################### +# - bin tests ---------------------------------------------------------------------------------- +ConfigureTest(LABEL_BINS_TEST + labeling/label_bins_tests.cpp) + ################################################################################################### ### enable testing ################################################################################ ################################################################################################### diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index debf540ea8e..da9509e94a6 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -83,11 +83,13 @@ TEST(StreamCheck, CatchFailedKernel) "invalid configuration argument"); } -__global__ void assert_false_kernel() { release_assert(false && "this kernel should die"); } +#ifndef NDEBUG + +__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } -__global__ void assert_true_kernel() { release_assert(true && "this kernel should live"); } +__global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } -TEST(ReleaseAssertDeathTest, release_assert_false) +TEST(DebugAssertDeathTest, cudf_assert_false) { testing::FLAGS_gtest_death_test_style = "threadsafe"; @@ -100,19 +102,21 @@ TEST(ReleaseAssertDeathTest, release_assert_false) // each attempted kernel launch if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); } - // If we reach this point, the release_assert didn't work so we exit normally, which will cause + // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause // EXPECT_DEATH to fail. 
}; EXPECT_DEATH(call_kernel(), "this kernel should die"); } -TEST(ReleaseAssert, release_assert_true) +TEST(DebugAssert, cudf_assert_true) { assert_true_kernel<<<1, 1>>>(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); } +#endif + // These tests don't use CUDF_TEST_PROGRAM_MAIN because : // 1.) They don't need the RMM Pool // 2.) The RMM Pool interferes with the death test diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp new file mode 100644 index 00000000000..5303b8f4f61 --- /dev/null +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { + +#define COL_K cudf::test::fixed_width_column_wrapper +#define COL_V cudf::test::fixed_width_column_wrapper +#define COL_S cudf::test::strings_column_wrapper +#define LCL_V cudf::test::lists_column_wrapper +#define LCL_S cudf::test::lists_column_wrapper +#define VALIDITY std::initializer_list +#define COLLECT_SET cudf::make_collect_set_aggregation() +#define COLLECT_SET_NULL_UNEQUAL \ + cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL) + +struct CollectSetTest : public cudf::test::BaseFixture { +}; + +template +struct CollectSetTypedTest : public cudf::test::BaseFixture { +}; + +using FixedWidthTypesNotBool = cudf::test::Concat; +TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); + +TYPED_TEST(CollectSetTypedTest, ExceptionTests) +{ + std::vector agg_requests(1); + agg_requests[0].values = COL_V{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; + agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE)); + + // groupby cannot exclude nulls + groupby::groupby gby{table_view{{COL_K{1, 1, 2, 2, 3, 3}}}}; + EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error); +} + +TYPED_TEST(CollectSetTypedTest, TrivialInput) +{ + // Empty input + // TODO: Enable this test after issue#7611 has been fixed + // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + + // Single key input + { + COL_K keys{1}; + COL_V vals{10}; + COL_K keys_expected{1}; + LCL_V vals_expected{LCL_V{10}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } + + // Non-repeated keys + { + COL_K keys{2, 1}; + COL_V vals{20, 10}; + COL_K keys_expected{1, 2}; + LCL_V vals_expected{LCL_V{10}, LCL_V{20}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +TYPED_TEST(CollectSetTypedTest, TypicalInput) +{ + // Pre-sorted keys + { + COL_K keys{1, 
1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31}; + COL_K keys_expected{1, 2, 3}; + LCL_V vals_expected{{10, 11}, {20, 21}, {30, 31, 32, 33}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } + + // Expect the result keys to be sorted by sort-based groupby + { + COL_K keys{4, 1, 2, 4, 3, 3, 2, 1}; + COL_V vals{40, 10, 20, 40, 30, 30, 20, 11}; + COL_K keys_expected{1, 2, 3, 4}; + LCL_V vals_expected{{10, 11}, {20}, {30}, {40}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +// Keys and values columns are sliced columns +TYPED_TEST(CollectSetTypedTest, SlicedColumnsInput) +{ + COL_K keys_original{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals_original{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31}; + { + auto const keys = cudf::slice(keys_original, {0, 4})[0]; // { 1, 1, 1, 1 } + auto const vals = cudf::slice(vals_original, {0, 4})[0]; // { 10, 11, 10, 10 } + auto const keys_expected = COL_K{1}; + auto const vals_expected = LCL_V{{10, 11}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } + { + auto const keys = cudf::slice(keys_original, {2, 10})[0]; // { 1, 1, 2, 2, 2, 2, 3, 3 } + auto const vals = cudf::slice(vals_original, {2, 10})[0]; // { 10, 10, 20, 21, 21, 20, 30, 33 } + auto const keys_expected = COL_K{1, 2, 3}; + auto const vals_expected = LCL_V{{10}, {20, 21}, {30, 33}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +TEST_F(CollectSetTest, StringInput) +{ + COL_K keys{1, 2, 3, 3, 2, 1, 2, 1, 2, 1, 1, 1, 1}; + COL_S vals{ + "String 1, first", + "String 2, first", + "String 3, first", + "String 3, second", + "String 2, second", + "String 1, second", + "String 2, second", // repeated + "String 1, second", // repeated + "String 2, second", // repeated + "String 1, second", // repeated + "String 1, second", // repeated + "String 1, second", // repeated + "String 1, second" // 
repeated + }; + COL_K keys_expected{1, 2, 3}; + LCL_S vals_expected{{"String 1, first", "String 1, second"}, + {"String 2, first", "String 2, second"}, + {"String 3, first", "String 3, second"}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); +} + +TYPED_TEST(CollectSetTypedTest, CollectWithNulls) +{ + // Just use an arbitrary value to store null entries + // Using this alias variable will make the code look cleaner + constexpr int32_t null = 0; + + // Pre-sorted keys + { + COL_K keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals{{10, 10, null, null, 20, null, null, null, 30, 31, 30, 31}, + {true, true, false, false, true, false, false, false, true, true, true, true}}; + COL_K keys_expected{1, 2, 3}; + + // By default, nulls are considered equal, thus only one null is kept per key + LCL_V vals_expected{{{10, null}, VALIDITY{true, false}}, + {{20, null}, VALIDITY{true, false}}, + {{30, 31}, VALIDITY{true, true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + + // All nulls per key are kept (nulls are put at the end of each list) + vals_expected = LCL_V{{{10, null, null}, VALIDITY{true, false, false}}, + {{20, null, null, null}, VALIDITY{true, false, false, false}}, + {{30, 31}, VALIDITY{true, true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET_NULL_UNEQUAL); + } + + // Expect the result keys to be sorted by sort-based groupby + { + COL_K keys{4, 1, 2, 4, 3, 3, 3, 3, 2, 1}; + COL_V vals{{40, 10, 20, 40, null, null, null, null, 21, null}, + {true, true, true, true, false, false, false, false, true, false}}; + COL_K keys_expected{1, 2, 3, 4}; + + // By default, nulls are considered equal, thus only one null is kept per key + LCL_V vals_expected{{{10, null}, VALIDITY{true, false}}, + {{20, 21}, VALIDITY{true, true}}, + {{null}, VALIDITY{false}}, + {{40}, VALIDITY{true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + + // All nulls per key are kept
(nulls are put at the end of each list) + vals_expected = LCL_V{{{10, null}, VALIDITY{true, false}}, + {{20, 21}, VALIDITY{true, true}}, + {{null, null, null, null}, VALIDITY{false, false, false, false}}, + {{40}, VALIDITY{true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET_NULL_UNEQUAL); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_collect_test.cpp b/cpp/tests/groupby/group_collect_test.cpp index 9edd0a6932a..8a578ea0c0f 100644 --- a/cpp/tests/groupby/group_collect_test.cpp +++ b/cpp/tests/groupby/group_collect_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,15 +26,15 @@ namespace cudf { namespace test { template -struct groupby_collect_test : public cudf::test::BaseFixture { +struct groupby_collect_list_test : public cudf::test::BaseFixture { }; using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(groupby_collect_test, FixedWidthTypesNotBool); +TYPED_TEST_CASE(groupby_collect_list_test, FixedWidthTypesNotBool); -TYPED_TEST(groupby_collect_test, CollectWithoutNulls) +TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls) { using K = int32_t; using V = TypeParam; @@ -45,11 +45,11 @@ TYPED_TEST(groupby_collect_test, CollectWithoutNulls) fixed_width_column_wrapper expect_keys{1, 2}; lists_column_wrapper expect_vals{{1, 2, 3}, {4, 5, 6}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, CollectWithNulls) +TYPED_TEST(groupby_collect_list_test, CollectWithNulls) { using K = int32_t; using V = TypeParam; @@ -64,11 +64,11 @@ TYPED_TEST(groupby_collect_test, CollectWithNulls) lists_column_wrapper expect_vals{ {{1, 2}, 
validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, CollectLists) +TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; using V = TypeParam; @@ -83,11 +83,11 @@ TYPED_TEST(groupby_collect_test, CollectLists) lists_column_wrapper expect_vals{ {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, dictionary) +TYPED_TEST(groupby_collect_list_test, dictionary) { using K = int32_t; using V = TypeParam; @@ -105,10 +105,11 @@ TYPED_TEST(groupby_collect_test, dictionary) 0, rmm::device_buffer{0}); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation()); } -TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion) +TYPED_TEST(groupby_collect_list_test, CollectFailsWithNullExclusion) { using K = int32_t; using V = TypeParam; @@ -121,10 +122,10 @@ TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion) std::vector agg_requests(1); agg_requests[0].values = values; - agg_requests[0].aggregations.push_back(cudf::make_collect_aggregation(null_policy::EXCLUDE)); + agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_EXPECT_THROW_MESSAGE(gby.aggregate(agg_requests), - "null exclusion is not supported on groupby COLLECT aggregation."); + "null exclusion is not supported on groupby COLLECT_LIST aggregation."); } } // namespace test diff --git a/cpp/tests/groupby/group_count_scan_test.cpp 
b/cpp/tests/groupby/group_count_scan_test.cpp new file mode 100644 index 00000000000..b7b18982f51 --- /dev/null +++ b/cpp/tests/groupby/group_count_scan_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_count_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_count_scan_test, cudf::test::AllTypes); + +TYPED_TEST(groupby_count_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + "Unsupported groupby scan aggregation"); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, 
vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys; + value_wrapper vals; + + key_wrapper expect_keys; + result_wrapper expect_vals; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1))); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3}, all_null()); + value_wrapper vals{3, 4, 5}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys{1, 1, 1}; + result_wrapper expect_vals{0, 1, 2}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 
1, 1}); + value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( {1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // {0, 3, 6, 1, 4, _, 9, 2, 7, 8, -} + result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 0}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +struct groupby_count_scan_string_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_count_scan_string_test, basic) +{ + using V = cudf::string_view; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + // clang-format off + key_wrapper keys { 1, 3, 3, 5, 5, 0}; + strings_column_wrapper vals{"1", "1", "1", "1", "1", "1"}; + + key_wrapper expect_keys {0, 1, 3, 3, 5, 5}; + result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupByCountScan) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + using V = decimalXX; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + auto const scale = scale_type{-1}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = result_wrapper{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + // clang-format on + + 
CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()), + "Unsupported groupby scan aggregation"); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +struct groupby_dictionary_count_scan_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_dictionary_count_scan_test, basic) +{ + using V = std::string; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + // clang-format off + strings_column_wrapper keys{"1", "3", "3", "5", "5", "0"}; + dictionary_column_wrapper vals{1, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_keys{"0", "1", "3", "3", "5", "5"}; + result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + "Unsupported groupby scan aggregation"); + test_single_scan( + keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE)); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_max_scan_test.cpp b/cpp/tests/groupby/group_max_scan_test.cpp new file mode 100644 index 00000000000..c1fc48ca698 --- /dev/null +++ b/cpp/tests/groupby/group_max_scan_test.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_max_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_max_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4}); + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7}); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3}, all_null()); + value_wrapper vals({3, 4, 5}); + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + 
auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({-1, -1, -1}, all_null()); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( {1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { -, 3, 6, 1, 4, -, 9, 2, _, 8, -} + result_wrapper expect_vals({-1, 8, 8, 6, 9, -1, 9, 7, 7, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = 
fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale}; + + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale}; + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_min_scan_test.cpp b/cpp/tests/groupby/group_min_scan_test.cpp new file mode 100644 index 00000000000..d3186d880cc --- /dev/null +++ b/cpp/tests/groupby/group_min_scan_test.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_min_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_min_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4}); + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2}); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys({1, 2, 3}, all_null()); + value_wrapper vals({3, 4, 5}); + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, 
zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({-1, -1, -1}, all_null()); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // { 1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { _, 8, 1, 6, 9, _, 4, 7, 2, 3, _} + result_wrapper expect_vals({-1, 8, 1, 6, 6, -1, 4, 7, 3, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +struct groupby_min_scan_string_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_min_scan_string_test, basic) +{ + key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + strings_column_wrapper vals{"año", "bit", "₹1", "aaa", "zit", "bat", "aaa", "$1", "₹1", "wut"}; + + key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + strings_column_wrapper expect_vals; + + auto agg = cudf::make_min_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)), + "Unsupported groupby scan type-agg combination"); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, 
cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale}; + + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale}; + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_sum_scan_test.cpp b/cpp/tests/groupby/group_sum_scan_test.cpp new file mode 100644 index 00000000000..9f6c21462b3 --- /dev/null +++ b/cpp/tests/groupby/group_sum_scan_test.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_sum_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +using supported_types = + cudf::test::Concat, + cudf::test::DurationTypes>; + +TYPED_TEST_CASE(groupby_sum_scan_test, supported_types); + +TYPED_TEST(groupby_sum_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + // {0, 3, 6, 1, 4, 5, 9, 2, 7, 8} + result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}; + // clang-format on + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys({1, 2, 3}, all_null()); + value_wrapper vals{3, 4, 5}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = 
cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({3, 4, 5}, all_null()); + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // { 1, 1, 1, 2, 2, 2, 2, 3, *, 3, 4}; + key_wrapper expect_keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { -, 3, 6, 1, 4, -, 9, 2, _, 8, -} + result_wrapper expect_vals({-1, 3, 9, 1, 5, -1, 14, 2, 10, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + using out_fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 
1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + + auto const expect_keys = key_wrapper {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; + // clang-format on + + auto agg2 = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/groupby_keys_test.cpp b/cpp/tests/groupby/groupby_keys_test.cpp index 06ec9eb8968..78299e1a18c 100644 --- a/cpp/tests/groupby/groupby_keys_test.cpp +++ b/cpp/tests/groupby/groupby_keys_test.cpp @@ -33,166 +33,229 @@ using supported_types = cudf::test:: TYPED_TEST_CASE(groupby_keys_test, supported_types); -// clang-format off TYPED_TEST(groupby_keys_test, basic) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + // clang-format off + fixed_width_column_wrapper keys { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - fixed_width_column_wrapper expect_keys { 1, 2, 3 }; - fixed_width_column_wrapper expect_vals { 3, 4, 3 }; + fixed_width_column_wrapper expect_keys { 1, 2, 3 }; + fixed_width_column_wrapper expect_vals { 3, 4, 3 }; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } TYPED_TEST(groupby_keys_test, zero_valid_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = 
cudf::detail::target_type_t; - fixed_width_column_wrapper keys ( { 1, 2, 3}, all_null() ); - fixed_width_column_wrapper vals { 3, 4, 5}; + // clang-format off + fixed_width_column_wrapper keys ( { 1, 2, 3}, all_null() ); + fixed_width_column_wrapper vals { 3, 4, 5}; - fixed_width_column_wrapper expect_keys { }; - fixed_width_column_wrapper expect_vals { }; + fixed_width_column_wrapper expect_keys { }; + fixed_width_column_wrapper expect_vals { }; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } TYPED_TEST(groupby_keys_test, some_null_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, - { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4} - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); - // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -} - fixed_width_column_wrapper expect_vals { 3, 4, 2, 1}; + // clang-format off + fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4} + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); + // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -} + fixed_width_column_wrapper expect_vals { 3, 4, 2, 1}; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } 
TYPED_TEST(groupby_keys_test, include_null_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, - { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - - // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, -} - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4, 3}, - { 1, 1, 1, 1, 0}); - // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -, -} - fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7}; - - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::NO, null_policy::INCLUDE); + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, -} + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4, 3}, + { 1, 1, 1, 1, 0}); + // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -, -} + fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::NO, + null_policy::INCLUDE); } TYPED_TEST(groupby_keys_test, pre_sorted_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys { 1, 2, 3, 4}; - 
fixed_width_column_wrapper expect_vals { 3, 18, 24, 4}; + fixed_width_column_wrapper expect_keys { 1, 2, 3, 4}; + fixed_width_column_wrapper expect_vals { 3, 18, 24, 4}; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys { 4, 3, 2, 1 }; - fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 }; + fixed_width_column_wrapper expect_keys { 4, 3, 2, 1 }; + fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 }; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES, {order::DESCENDING}); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES, + {order::DESCENDING}); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - 
fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, - { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, + { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); - fixed_width_column_wrapper expect_vals { 3, 15, 17, 4}; + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); + fixed_width_column_wrapper expect_vals { 3, 15, 17, 4}; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, - { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - - // { 1, 1, 1, -, -, 2, 2, -, 3, 3, 4} - fixed_width_column_wrapper expect_keys({ 1, 2, 2, 3, 3, 4}, - { 1, 0, 1, 0, 1, 1}); - fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4}; - - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::INCLUDE, sorted::YES); + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, + { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, -, -, 2, 2, -, 3, 3, 4} + fixed_width_column_wrapper expect_keys({ 1, 2, 2, 3, 3, 4}, + { 1, 0, 1, 0, 1, 1}); + fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::INCLUDE, + sorted::YES); +} + +TYPED_TEST(groupby_keys_test, mismatch_num_rows) +{ + using K = TypeParam; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4}; + + auto agg = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)), + "Size mismatch between request values and groupby keys."); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg)), + "Size mismatch between request values and groupby keys."); } -struct groupby_string_keys_test : public cudf::test::BaseFixture {}; +struct groupby_string_keys_test : public cudf::test::BaseFixture { +}; TEST_F(groupby_string_keys_test, basic) { - using V = int32_t; - using R = cudf::detail::target_type_t; + using V = int32_t; + using R = cudf::detail::target_type_t; - strings_column_wrapper keys { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + // clang-format off + strings_column_wrapper keys { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - strings_column_wrapper expect_keys({ "aaa", "año", "₹1" }); - fixed_width_column_wrapper expect_vals { 9, 19, 17 }; + strings_column_wrapper expect_keys({ "aaa", "año", "₹1" }); + fixed_width_column_wrapper expect_vals { 9, 19, 17 }; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, 
std::move(agg)); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } // clang-format on diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 0b68b7bbfb4..c7e27cd6367 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -99,6 +99,32 @@ inline void test_single_agg(column_view const& keys, } } +inline void test_single_scan(column_view const& keys, + column_view const& values, + column_view const& expect_keys, + column_view const& expect_vals, + std::unique_ptr&& agg, + null_policy include_null_keys = null_policy::EXCLUDE, + sorted keys_are_sorted = sorted::NO, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}) +{ + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = values; + + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj( + table_view({keys}), include_null_keys, keys_are_sorted, column_order, null_precedence); + + // groupby scan uses sort implementation + auto result = gb_obj.scan(requests); + + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({expect_keys}), result.first->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, *result.second[0].results[0], true); +} + inline auto all_valid() { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 995ee94472f..013457d8ed6 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -100,6 +100,89 @@ std::unique_ptr create_compressible_fixed_table(cudf::size_type num return create_fixed_table(num_columns, num_rows, include_validity, compressible_elements); } +// this function replicates the "list_gen" function in +// python/cudf/cudf/tests/test_parquet.py +template +std::unique_ptr make_parquet_list_col( 
+ int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0 ? 1 : 0; }); + + // root list + std::vector row_offsets(num_rows + 1); + int row_offset_count = 0; + { + int offset = 0; + for (int idx = 0; idx < (num_rows) + 1; idx++) { + row_offsets[row_offset_count] = offset; + if (!include_validity || valids[idx]) { offset += lists_per_row; } + row_offset_count++; + } + } + cudf::test::fixed_width_column_wrapper offsets(row_offsets.begin(), + row_offsets.begin() + row_offset_count); + + // child list + std::vector child_row_offsets((num_rows * lists_per_row) + 1); + int child_row_offset_count = 0; + { + int offset = 0; + for (int idx = 0; idx < (num_rows * lists_per_row); idx++) { + int row_index = idx / lists_per_row; + if (include_validity && !valids[row_index]) { continue; } + + child_row_offsets[child_row_offset_count] = offset; + offset += list_size; + child_row_offset_count++; + } + child_row_offsets[child_row_offset_count++] = offset; + } + cudf::test::fixed_width_column_wrapper child_offsets( + child_row_offsets.begin(), child_row_offsets.begin() + child_row_offset_count); + + // child values + std::vector child_values(num_rows * lists_per_row * list_size); + T first_child_value_index = skip_rows * lists_per_row * list_size; + int child_value_count = 0; + { + for (int idx = 0; idx < (num_rows * lists_per_row * list_size); idx++) { + int row_index = idx / (lists_per_row * list_size); + + int val = first_child_value_index; + first_child_value_index++; + + if (include_validity && !valids[row_index]) { continue; } + + child_values[child_value_count] = val; + child_value_count++; + } + } + // validity by value instead of index + auto valids2 = cudf::detail::make_counting_transform_iterator( + 0, [list_size](auto i) { return (i % list_size) % 2 == 0 ? 1 : 0; }); + auto child_data = include_validity + ? 
cudf::test::fixed_width_column_wrapper( + child_values.begin(), child_values.begin() + child_value_count, valids2) + : cudf::test::fixed_width_column_wrapper( + child_values.begin(), child_values.begin() + child_value_count); + + int child_offsets_size = static_cast(child_offsets).size() - 1; + auto child = cudf::make_lists_column( + child_offsets_size, child_offsets.release(), child_data.release(), 0, rmm::device_buffer{}); + + int offsets_size = static_cast(offsets).size() - 1; + return include_validity + ? cudf::make_lists_column( + offsets_size, + offsets.release(), + std::move(child), + cudf::UNKNOWN_NULL_COUNT, + cudf::test::detail::make_null_mask(valids, valids + offsets_size)) + : cudf::make_lists_column( + offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); +} + void compare_metadata_equality(cudf::io::table_input_metadata in_meta, cudf::io::table_metadata out_meta) { @@ -2188,6 +2271,125 @@ TEST_F(ParquetReaderTest, UserBounds) } } +TEST_F(ParquetReaderTest, UserBoundsWithNulls) +{ + // clang-format off + cudf::test::fixed_width_column_wrapper col{{1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7, 8,8,8,8,8,8,8,8} + ,{1,1,1,0,0,0,1,1, 1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,0,0, 1,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0}}; + // clang-format on + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("UserBoundsWithNulls.parquet"); + cudf_io::parquet_writer_options out_args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl); + cudf_io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {1, 3}, {3, -1}, + {31, -1}, {32, -1}, {33, -1}, + {31, 5}, {32, 5}, {33, 5}, + {-1, 7}, {-1, 31}, {-1, 32}, {-1, 33}, + {62, -1}, {63, -1}, + {62, 2}, {63, 1}}; + // clang-format on + for (auto p : params) { + cudf_io::parquet_reader_options 
read_args = + cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = cudf_io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + +TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) +{ + constexpr int num_rows = 30 * 1000000; + + std::mt19937 gen(6747); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); + + // this file will have row groups of 1,000,000 each + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet"); + cudf_io::parquet_writer_options out_args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl); + cudf_io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1}, + {31, 1}, {32, 1}, {33, 1}, + // deliberately span some row group boundaries + {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1}, + {16785678, 3}, {22996176, 31}, + {24001231, 17}, {29000001, 989999}, {29999999, 1} }; + // clang-format on + for (auto p : params) { + cudf_io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = 
cudf_io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + +TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge) +{ + constexpr int num_rows = 5 * 1000000; + auto colp = make_parquet_list_col(0, num_rows, 5, 8, true); + cudf::column_view col = *colp; + + // this file will have row groups of 1,000,000 each + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet"); + cudf_io::parquet_writer_options out_args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl); + cudf_io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1}, + {31, 1}, {32, 1}, {33, 1}, + // deliberately span some row group boundaries + {999000, 1001}, {999000, 2000}, {2999999, 2}, + {1678567, 3}, {4299676, 31}, + {4001231, 17}, {1900000, 989999}, {4999999, 1} }; + // clang-format on + for (auto p : params) { + cudf_io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = cudf_io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? 
static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + TEST_F(ParquetReaderTest, ReorderedColumns) { { diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp new file mode 100644 index 00000000000..34c8ff7251f --- /dev/null +++ b/cpp/tests/labeling/label_bins_tests.cpp @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace { + +template +using fwc_wrapper = cudf::test::fixed_width_column_wrapper; + +template +using fpc_wrapper = cudf::test::fixed_point_column_wrapper; + +// TODO: Should we move these into type_lists? They seem generally useful. +using cudf::test::FixedPointTypes; +using cudf::test::FloatingPointTypes; +using NumericTypesNotBool = + cudf::test::Concat; +using SignedNumericTypesNotBool = + cudf::test::Types; + +struct BinTestFixture : public cudf::test::BaseFixture { +}; + +/* + * Test error cases. + * + * Most of these are not parameterized by type to avoid unnecessary test overhead. + */ + +// Left edges type check. 
+TEST(BinColumnErrorTests, TestInvalidLeft) +{ + fwc_wrapper left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +// Right edges type check. +TEST(BinColumnErrorTests, TestInvalidRight) +{ + fwc_wrapper left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +// Input type check. +TEST(BinColumnErrorTests, TestInvalidInput) +{ + fwc_wrapper left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +// Number of left and right edges must match. +TEST(BinColumnErrorTests, TestMismatchedEdges) +{ + fwc_wrapper left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +// Left edges with nulls. 
+TEST(BinColumnErrorTests, TestLeftEdgesWithNullsBefore) +{ + fwc_wrapper left_edges{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + fwc_wrapper right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; // same size as left_edges + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +// Right edges with nulls (edge counts match, so only the null edge is invalid). +TEST(BinColumnErrorTests, TestRightEdgesWithNullsBefore) +{ + fwc_wrapper left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + fwc_wrapper right_edges{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + fwc_wrapper input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; + + EXPECT_THROW( + cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO), + cudf::logic_error); +}; + +/* + * Valid exceptional cases: inputs that must not throw but produce empty or null labels. + */ + +template +struct GenericExceptionCasesBinTestFixture : public BinTestFixture { + void test(fwc_wrapper input, + fwc_wrapper expected, + fwc_wrapper left_edges, + fwc_wrapper right_edges) + { + auto result = + cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +}; + +template +struct ExceptionCasesBinTestFixture : public GenericExceptionCasesBinTestFixture { +}; + +TYPED_TEST_CASE(ExceptionCasesBinTestFixture, NumericTypesNotBool); + +// Empty input must return an empty output. +TYPED_TEST(ExceptionCasesBinTestFixture, TestEmptyInput) +{ + this->test({}, {}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10}); +}; + +// If no edges are provided, the bin for all inputs is null. +TYPED_TEST(ExceptionCasesBinTestFixture, TestEmptyEdges) +{ + this->test({1, 1}, {{0, 0}, {0, 0}}, {}, {}); +}; + +// Values outside the bounds should be labeled NULL. 
+TYPED_TEST(ExceptionCasesBinTestFixture, TestOutOfBoundsInput) +{ + this->test({7, 9, 11, 13}, {{3, 4, 0, 0}, {1, 1, 0, 0}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10}); +}; + +// Null inputs must map to nulls. +TYPED_TEST(ExceptionCasesBinTestFixture, TestInputWithNulls) +{ + this->test( + {{1, 3, 5, 7}, {0, 1, 0, 1}}, {{0, 1, 0, 3}, {0, 1, 0, 1}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10}); +}; + +// Test that nan values are assigned the NULL label. +template +struct NaNBinTestFixture : public GenericExceptionCasesBinTestFixture { +}; + +TYPED_TEST_CASE(NaNBinTestFixture, FloatingPointTypes); + +TYPED_TEST(NaNBinTestFixture, TestNaN) +{ + if (std::numeric_limits::has_quiet_NaN) { + this->test( + {std::numeric_limits::quiet_NaN()}, {{0}, {0}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10}); + } +} + +/* + * Test inclusion options using contiguous bins with edges 0, 2, ..., 10 and inputs 0 through 10. + */ + +template +struct BoundaryExclusionBinTestFixture : public BinTestFixture { + void test(cudf::inclusive left_inc, + cudf::inclusive right_inc, + fwc_wrapper expected) + { + fwc_wrapper left_edges{0, 2, 4, 6, 8}; + fwc_wrapper right_edges{2, 4, 6, 8, 10}; + fwc_wrapper input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + auto result = cudf::label_bins(input, left_edges, left_inc, right_edges, right_inc); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +}; + +TYPED_TEST_CASE(BoundaryExclusionBinTestFixture, NumericTypesNotBool); + +// Boundary points (the even inputs) should be labeled null when both bounds are excluded. +TYPED_TEST(BoundaryExclusionBinTestFixture, TestNoIncludes) +{ + this->test(cudf::inclusive::NO, + cudf::inclusive::NO, + {{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5}, {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}}); +}; + +// Boundary point 2 should be in bin 1 [2, 4); the last right edge, 10, falls in no bin. +TYPED_TEST(BoundaryExclusionBinTestFixture, TestIncludeLeft) +{ + this->test(cudf::inclusive::YES, + cudf::inclusive::NO, + {{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 0}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}}); +}; + +// Boundary point 2 should be in bin 0 (0, 2]; the first left edge, 0, falls in no bin. 
+TYPED_TEST(BoundaryExclusionBinTestFixture, TestIncludeRight) +{ + this->test(cudf::inclusive::NO, + cudf::inclusive::YES, + {{0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +}; + +/* + * Test real data. + */ + +// Test numeric data of reasonable size with noncontiguous bins. +template +struct RealDataBinTestFixture : public BinTestFixture { + void test(unsigned int num_elements = 512, + unsigned int inputs_per_bin = 4, + T left_edge_start_val = 0) + { + // Avoid testing numbers that are larger than the current type supports. + const T largest_value = (num_elements / inputs_per_bin) * 4; + num_elements = std::min(std::numeric_limits::max(), largest_value); + + unsigned int num_edges = num_elements / inputs_per_bin; + + std::vector left_edge_vector(num_edges); + std::vector right_edge_vector(num_edges); + std::vector partial_input_vector(num_edges); + std::vector input_vector; + std::vector partial_expected_vector(num_edges); + std::vector expected_vector; + std::vector expected_validity(num_elements, 1); + + std::iota(left_edge_vector.begin(), left_edge_vector.end(), left_edge_start_val); + + // Create noncontiguous bins of width 2 separate by 2, and place inputs in the middle of each + // bin. + std::transform( + left_edge_vector.begin(), left_edge_vector.end(), left_edge_vector.begin(), [](T val) { + return val * 4; + }); + std::transform( + left_edge_vector.begin(), left_edge_vector.end(), right_edge_vector.begin(), [](T val) { + return val + 2; + }); + std::transform( + left_edge_vector.begin(), left_edge_vector.end(), partial_input_vector.begin(), [](T val) { + return val + 1; + }); + std::iota(partial_expected_vector.begin(), partial_expected_vector.end(), 0); + + // Create vector containing duplicates of all the inputs. 
+ input_vector.reserve(num_elements); + expected_vector.reserve(num_elements); + for (unsigned int i = 0; i < inputs_per_bin; ++i) { + input_vector.insert( + input_vector.end(), partial_input_vector.begin(), partial_input_vector.end()); + expected_vector.insert( + expected_vector.end(), partial_expected_vector.begin(), partial_expected_vector.end()); + } + + // Column wrappers are necessary inputs for the function. + fwc_wrapper left_edges(left_edge_vector.begin(), left_edge_vector.end()); + fwc_wrapper right_edges(right_edge_vector.begin(), right_edge_vector.end()); + fwc_wrapper input(input_vector.begin(), input_vector.end()); + fwc_wrapper expected( + expected_vector.begin(), expected_vector.end(), expected_validity.begin()); + + auto result = + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +}; + +TYPED_TEST_CASE(RealDataBinTestFixture, NumericTypesNotBool); + +TYPED_TEST(RealDataBinTestFixture, TestRealData256) { this->test(256); }; +TYPED_TEST(RealDataBinTestFixture, TestRealData512) { this->test(512); }; +TYPED_TEST(RealDataBinTestFixture, TestRealData1024) { this->test(1024); }; + +// Test negative numbers for signed types. +template +struct NegativeNumbersBinTestFixture : public RealDataBinTestFixture { + void test(unsigned int num_elements = 512, unsigned int inputs_per_bin = 4) + { + RealDataBinTestFixture::test( + num_elements, inputs_per_bin, -static_cast(num_elements / 2)); + } +}; + +TYPED_TEST_CASE(NegativeNumbersBinTestFixture, SignedNumericTypesNotBool); + +TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers256) { this->test(256); }; +TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers512) { this->test(512); }; +TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers1024) { this->test(1024); }; + +/* + * Test fixed point types. 
+ */ + +template +struct FixedPointBinTestFixture : public BinTestFixture { +}; + +TYPED_TEST_CASE(FixedPointBinTestFixture, FixedPointTypes); + +TYPED_TEST(FixedPointBinTestFixture, TestFixedPointData) +{ + using fpc_type_wrapper = fpc_wrapper>; + + fpc_type_wrapper left_edges{{0, 10, 20, 30, 40, 50, 60, 70, 80, 90}, numeric::scale_type{0}}; + fpc_type_wrapper right_edges{{10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, numeric::scale_type{0}}; + fpc_type_wrapper input{{25, 25, 25, 25, 25, 25, 25, 25, 25, 25}, numeric::scale_type{0}}; + + auto result = + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO); + + // Check that every element is placed in bin 2. + fwc_wrapper expected{{2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +}; + +/* + * Test strings. + */ + +// Basic test of strings of lowercase alphanumerics. +TEST(TestStringData, SimpleStringTest) +{ + cudf::test::strings_column_wrapper left_edges{"a", "b", "c", "d", "e"}; + cudf::test::strings_column_wrapper right_edges{"b", "c", "d", "e", "f"}; + cudf::test::strings_column_wrapper input{"abc", "bcd", "cde", "def", "efg"}; + + auto result = + cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO); + + fwc_wrapper expected{{0, 1, 2, 3, 4}, {1, 1, 1, 1, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +}; + +// Test non-ASCII characters. 
+TEST(TestStringData, NonAsciiStringTest) +{ + cudf::test::strings_column_wrapper left_edges{"A"}; + cudf::test::strings_column_wrapper right_edges{"z"}; + cudf::test::strings_column_wrapper input{"Héllo", + "thesé", + "HERE", + "tést strings", + "", + "1.75", + "-34", + "+9.8", + "17¼", + "x³", + "2³", + " 12⅝", + "1234567890", + "de", + "\t\r\n\f "}; + + auto result = + cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO); + + fwc_wrapper expected{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +// Test sliced non-ASCII characters. +TEST(TestStringData, SlicedNonAsciiStringTest) +{ + cudf::test::strings_column_wrapper left_edges{"A"}; + cudf::test::strings_column_wrapper right_edges{"z"}; + cudf::test::strings_column_wrapper input{"Héllo", + "thesé", + "HERE", + "tést strings", + "", + "1.75", + "-34", + "+9.8", + "17¼", + "x³", + "2³", + " 12⅝", + "1234567890", + "de", + "\t\r\n\f "}; + + auto sliced_inputs = cudf::slice(input, {1, 5, 5, 11}); + + { + auto result = cudf::label_bins( + sliced_inputs[0], left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO); + fwc_wrapper expected{{0, 0, 0, 0}, {1, 1, 1, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto result = cudf::label_bins( + sliced_inputs[1], left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO); + fwc_wrapper expected{{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 1, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +} // anonymous namespace + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/rolling/collect_list_test.cpp b/cpp/tests/rolling/collect_list_test.cpp index 6a3a80601d0..de179223d68 100644 --- a/cpp/tests/rolling/collect_list_test.cpp +++ b/cpp/tests/rolling/collect_list_test.cpp @@ -64,7 +64,7 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) 
static_cast(foll_column).size()); auto const result_column_based_window = - rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 1, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ @@ -79,11 +79,11 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); auto const result_fixed_window = - rolling_window(input_column, 2, 1, 1, make_collect_aggregation()); + rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); auto const result_with_nulls_excluded = - rolling_window(input_column, 2, 1, 1, make_collect_aggregation(null_policy::EXCLUDE)); + rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -104,7 +104,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists) static_cast(foll_column).size()); auto const result_column_based_window = - rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ @@ -120,7 +120,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); auto const result_with_nulls_excluded = rolling_window( - input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE)); + input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } 
@@ -138,7 +138,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds) auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; auto const result = - rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); @@ -146,7 +146,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); auto const result_with_nulls_excluded = rolling_window( - input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE)); + input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -164,11 +164,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) auto const input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; auto const num_elements = static_cast(input_column).size(); - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}, @@ -183,7 +183,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), 
result_with_nulls_excluded->view()); @@ -191,8 +191,8 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) following = 2; min_periods = 4; - auto result_2 = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result_2 = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_2 = lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { @@ -206,7 +206,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2_with_nulls_excluded->view()); @@ -228,11 +228,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) { // One result row at each end should be null. 
- auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; auto expected_result_child_validity = std::vector{1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1}; @@ -265,7 +265,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_result_child_values = std::vector{0, 2, 2, 3, 2, 3, 3, 5}; auto expected_result_child = fixed_width_column_wrapper( @@ -287,11 +287,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) { // First result row, and the last two result rows should be null. 
- auto preceding = 2; - auto following = 2; - auto min_periods = 4; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; auto expected_result_child_validity = std::vector{1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1}; @@ -325,7 +325,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_result_child_values = std::vector{0, 2, 3, 2, 3, 2, 3, 5}; auto expected_result_child = fixed_width_column_wrapper( @@ -358,11 +358,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) auto const input_column = strings_column_wrapper{"0", "1", "2", "3", "4", "5"}; auto const num_elements = static_cast(input_column).size(); - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}}, @@ -377,7 +377,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -385,8 +385,8 @@ 
TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) following = 2; min_periods = 4; - auto result_2 = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result_2 = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_2 = lists_column_wrapper{ {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}}, cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { @@ -400,7 +400,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2_with_nulls_excluded->view()); @@ -421,11 +421,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) { // One result row at each end should be null. - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; auto expected_result_child = @@ -451,7 +451,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -459,11 +459,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) { // First result row, and the last two result rows should be null. 
- auto preceding = 2; - auto following = 2; - auto min_periods = 4; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; auto expected_result_child = @@ -489,7 +489,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -515,7 +515,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11}, @@ -536,7 +536,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -563,7 +563,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_child = fixed_width_column_wrapper{ {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23}, @@ -587,7 +587,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_child = fixed_width_column_wrapper{ 
10, 10, 12, 12, 13, 12, 13, 14, 13, 14, 20, 20, 22, 22, 23, 22, 23}; @@ -627,7 +627,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11, 12, 13}, @@ -650,7 +650,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -678,7 +678,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_0 = iterator_with_null_at(0); auto null_at_1 = iterator_with_null_at(1); @@ -705,7 +705,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -744,7 +744,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {"10", "11", "12", "13"}, @@ -767,7 +767,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -793,7 +793,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_0 = iterator_with_null_at(0); auto null_at_1 = iterator_with_null_at(1); @@ -821,7 +821,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -868,7 +868,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, @@ -898,7 +898,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -928,7 +928,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{10, 11, 12, 13}, @@ -954,7 +954,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -984,7 +984,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_1 = iterator_with_null_at(1); @@ -1013,7 +1013,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -1056,7 +1056,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{"10", "11", "12", "13"}, @@ -1082,7 +1082,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -1110,7 +1110,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_1 = iterator_with_null_at(1); @@ -1139,7 +1139,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -1190,7 +1190,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14}; @@ -1226,7 +1226,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index d6bf03b3f76..f15116ae4c2 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -26,20 +26,18 @@ #include #include +// Using an alias variable for the null elements +// This will make the code look cleaner +constexpr auto NULL_VAL = 0; + struct StringsConvertTest : public cudf::test::BaseFixture { }; -TEST_F(StringsConvertTest, IsInteger) +TEST_F(StringsConvertTest, IsIntegerBasicCheck) { - cudf::test::strings_column_wrapper strings; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - cudf::test::strings_column_wrapper strings1( {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); 
@@ -50,24 +48,187 @@ TEST_F(StringsConvertTest, IsInteger) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBasicCheck) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckNoNull) +{ + auto strings = cudf::test::strings_column_wrapper( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = + cudf::test::fixed_width_column_wrapper({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckWithNulls) +{ + std::vector const h_strings{ + "eee", "1234", nullptr, "", "-9832", "93.24", "765é", nullptr}; + auto const strings = cudf::test::strings_column_wrapper( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + // Input has null elements then the output should have the same null mask + auto const expected = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1, NULL_VAL, 0, 1, 0, 0, NULL_VAL}, + 
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBoundCheck) +{ + // Empty input + auto strings = cudf::test::strings_column_wrapper{}; + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckSmallNumbers) +{ + auto strings = cudf::test::strings_column_wrapper( + {"-200", "-129", "-128", "-120", "0", "120", "127", "130", "150", "255", "300", "500"}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT8}); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT8}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"-40000", "-32769", "-32768", "-32767", "-32766", "32765", "32766", "32767", "32768"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + 
cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckLargeNumbers) +{ + auto strings = + cudf::test::strings_column_wrapper({"-2147483649", // std::numeric_limits::min() - 1 + "-2147483648", // std::numeric_limits::min() + "-2147483647", // std::numeric_limits::min() + 1 + "2147483646", // std::numeric_limits::max() - 1 + "2147483647", // std::numeric_limits::max() + "2147483648", // std::numeric_limits::max() + 1 + "4294967294", // std::numeric_limits::max() - 1 + "4294967295", // std::numeric_limits::max() + "4294967296"}); // std::numeric_limits::max() + 1 + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"-9223372036854775809", // std::numeric_limits::min() - 1 + "-9223372036854775808", // std::numeric_limits::min() + "-9223372036854775807", // std::numeric_limits::min() + 1 + "9223372036854775806", // std::numeric_limits::max() - 1 + "9223372036854775807", // std::numeric_limits::max() + "9223372036854775808", // std::numeric_limits::max() + 1 + "18446744073709551614", // std::numeric_limits::max() - 1 + "18446744073709551615", // std::numeric_limits::max() + "18446744073709551616"}); // std::numeric_limits::max() + 1 + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT64}); + expected = 
cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT64}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsConvertTest, ToInteger) { - std::vector h_strings{ - "eee", "1234", nullptr, "", "-9832", "93.24", "765é", "-1.78e+5", "2147483647", "-2147483648"}; + std::vector h_strings{"eee", + "1234", + nullptr, + "", + "-9832", + "93.24", + "765é", + nullptr, + "-1.78e+5", + "2147483647", + "-2147483648", + "2147483648"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - std::vector h_expected{0, 1234, 0, 0, -9832, 93, 765, -1, 2147483647, -2147483648}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_integers(strings_view, cudf::data_type{cudf::type_id::INT32}); + auto results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + auto const expected_i16 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, -1, 0, 0}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i16); - cudf::test::fixed_width_column_wrapper expected( - h_expected.begin(), - h_expected.end(), + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto const expected_i32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{ + 0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, 2147483647, -2147483648, -2147483648}, 
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i32); + + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + auto const expected_u32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, + 1234, + NULL_VAL, + 0, + 4294957464, + 93, + 765, + NULL_VAL, + 4294967295, + 2147483647, + 2147483648, + 2147483648}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_u32); } TEST_F(StringsConvertTest, FromInteger) @@ -114,7 +275,7 @@ TEST_F(StringsConvertTest, EmptyStringsColumn) cudf::test::strings_column_wrapper strings({"", "", ""}); auto results = cudf::strings::to_integers(cudf::strings_column_view(strings), cudf::data_type{cudf::type_id::INT64}); - cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); + cudf::test::fixed_width_column_wrapper expected{0, 0, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst index df7e92c125d..b4ca0321073 100644 --- a/docs/cudf/source/api.rst +++ b/docs/cudf/source/api.rst @@ -20,6 +20,13 @@ Series :inherited-members: :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list +Lists +----- +.. currentmodule:: cudf.core.column.lists + +.. autoclass:: ListMethods + :members: + Strings ------- .. currentmodule:: cudf.core.column.string @@ -253,4 +260,4 @@ GpuArrowReader .. currentmodule:: cudf.comm.gpuarrow .. 
autoclass:: GpuArrowReader :members: - :exclude-members: count, index \ No newline at end of file + :exclude-members: count, index diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 3bc9adb5f49..e6675591164 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1163,6 +1163,17 @@ public static ColumnVector decimalFromInts(int scale, int... values) { } } + /** + * Create a new decimal vector from boxed unscaled values (Integer array) and scale. + * The created vector is of type DType.DECIMAL32, whose max precision is 9. + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static ColumnVector decimalFromBoxedInts(int scale, Integer... values) { + try (HostColumnVector host = HostColumnVector.decimalFromBoxedInts(scale, values)) { + return host.copyToDevice(); + } + } + /** * Create a new decimal vector from unscaled values (long array) and scale. * The created vector is of type DType.DECIMAL64, whose max precision is 18. @@ -1174,6 +1185,17 @@ public static ColumnVector decimalFromLongs(int scale, long... values) { } } + /** + * Create a new decimal vector from boxed unscaled values (Long array) and scale. + * The created vector is of type DType.DECIMAL64, whose max precision is 18. + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static ColumnVector decimalFromBoxedLongs(int scale, Long... values) { + try (HostColumnVector host = HostColumnVector.decimalFromBoxedLongs(scale, values)) { + return host.copyToDevice(); + } + } + /** * Create a new decimal vector from double floats with specific DecimalType and RoundingMode. * All doubles will be rescaled if necessary, according to scale of input DecimalType and RoundingMode. 
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 559256aa7bf..846bcb3b635 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -481,6 +481,23 @@ public static HostColumnVector decimalFromInts(int scale, int... values) { return build(DType.create(DType.DTypeEnum.DECIMAL32, scale), values.length, (b) -> b.appendUnscaledDecimalArray(values)); } + /** + * Create a new decimal vector from boxed unscaled values (Integer array) and scale. + * The created vector is of type DType.DECIMAL32, whose max precision is 9. + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static HostColumnVector decimalFromBoxedInts(int scale, Integer... values) { + return build(DType.create(DType.DTypeEnum.DECIMAL32, scale), values.length, (b) -> { + for (Integer v : values) { + if (v == null) { + b.appendNull(); + } else { + b.appendUnscaledDecimal(v); + } + } + }); + } + /** * Create a new decimal vector from unscaled values (long array) and scale. * The created vector is of type DType.DECIMAL64, whose max precision is 18. @@ -490,6 +507,23 @@ public static HostColumnVector decimalFromLongs(int scale, long... values) { return build(DType.create(DType.DTypeEnum.DECIMAL64, scale), values.length, (b) -> b.appendUnscaledDecimalArray(values)); } + /** + * Create a new decimal vector from boxed unscaled values (Long array) and scale. + * The created vector is of type DType.DECIMAL64, whose max precision is 18. + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static HostColumnVector decimalFromBoxedLongs(int scale, Long... 
values) { + return build(DType.create(DType.DTypeEnum.DECIMAL64, scale), values.length, (b) -> { + for (Long v : values) { + if (v == null) { + b.appendNull(); + } else { + b.appendUnscaledDecimal(v); + } + } + }); + } + /** * Create a new decimal vector from double floats with specific DecimalType and RoundingMode. * All doubles will be rescaled if necessary, according to scale of input DecimalType and RoundingMode. diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index aae7cb493a8..c5184111edf 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -206,7 +206,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectAgg(JNIEnv cudf::jni::auto_set_device(env); cudf::null_policy policy = include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE; - std::unique_ptr ret = cudf::make_collect_aggregation(policy); + std::unique_ptr ret = cudf::make_collect_list_aggregation(policy); return reinterpret_cast(ret.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 73db5ee4df3..4132016d85c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -712,6 +713,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; + case cudf::type_id::DECIMAL32: + case cudf::type_id::DECIMAL64: + result = cudf::strings::from_fixed_point(*column); + break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { @@ -733,6 +738,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case 
cudf::type_id::UINT64: result = cudf::strings::to_integers(*column, n_data_type); break; + case cudf::type_id::DECIMAL32: + case cudf::type_id::DECIMAL64: + result = cudf::strings::to_fixed_point(*column, n_data_type); + break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (cudf::is_timestamp(n_data_type) && cudf::is_numeric(column->type())) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 8b40f6e93d4..02fbe56431b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2266,6 +2266,73 @@ void testCastBoolToString() { testCastFixedWidthToStringsAndBack(DType.BOOL8, () -> ColumnVector.fromBoxedBooleans(booleans), () -> ColumnVector.fromStrings(stringBools)); } + @Test + void testCastDecimal32ToString() { + + Integer[] unScaledValues = {0, null, 3, 2, -43, null, 5234, -73451, 348093, -234810}; + String[] strDecimalValues = new String[unScaledValues.length]; + for (int scale : new int[]{-2, -1, 0, 1, 2}) { + for (int i = 0; i < strDecimalValues.length; i++) { + Long value = unScaledValues[i] == null ? 
null : Long.valueOf(unScaledValues[i]); + strDecimalValues[i] = dumpDecimal(value, scale); + } + + testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL32, scale), + () -> ColumnVector.decimalFromBoxedInts(scale, unScaledValues), + () -> ColumnVector.fromStrings(strDecimalValues)); + } + } + + @Test + void testCastDecimal64ToString() { + + Long[] unScaledValues = {0l, null, 3l, 2l, -43l, null, 234802l, -94582l, 1234208124l, -2342348023812l}; + String[] strDecimalValues = new String[unScaledValues.length]; + for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { + for (int i = 0; i < strDecimalValues.length; i++) { + strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); + System.out.println(strDecimalValues[i]); + } + + testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), + () -> ColumnVector.decimalFromBoxedLongs(scale, unScaledValues), + () -> ColumnVector.fromStrings(strDecimalValues)); + } + } + + /** + * Helper function to create decimal strings which can be processed by castStringToDecimal functor. + * We can not simply create decimal string via `String.valueOf`, because castStringToDecimal doesn't + * support scientific notations so far. 
+ * + * issue for scientific notation: https://github.com/rapidsai/cudf/issues/7665 + */ + private static String dumpDecimal(Long unscaledValue, int scale) { + if (unscaledValue == null) return null; + + StringBuilder builder = new StringBuilder(); + if (unscaledValue < 0) builder.append('-'); + String absValue = String.valueOf(Math.abs(unscaledValue)); + + if (scale >= 0) { + builder.append(absValue); + for (int i = 0; i < scale; i++) builder.append('0'); + return builder.toString(); + } + + if (absValue.length() <= -scale) { + builder.append('0').append('.'); + for (int i = 0; i < -scale - absValue.length(); i++) builder.append('0'); + builder.append(absValue); + } else { + int split = absValue.length() + scale; + builder.append(absValue.substring(0, split)) + .append('.') + .append(absValue.substring(split)); + } + return builder.toString(); + } + private static String[] getStringArray(T[] input) { String[] result = new String[input.length]; for (int i = 0 ; i < input.length ; i++) { diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 5c6801137ae..840f0c98987 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -50,6 +50,7 @@ class AggregationKind(Enum): NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT + COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA @@ -241,7 +242,13 @@ cdef class _AggregationFactory: @classmethod def collect(cls): cdef Aggregation agg = Aggregation.__new__(Aggregation) - agg.c_obj = move(libcudf_aggregation.make_collect_aggregation()) + agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation()) + return agg + + @classmethod + def collect_set(cls): + cdef Aggregation agg = Aggregation.__new__(Aggregation) + agg.c_obj = 
move(libcudf_aggregation.make_collect_set_aggregation()) return agg @classmethod diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 660db29f7a9..e9836c11361 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -34,7 +34,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' - COLLECT 'cudf::aggregation::COLLECT' + COLLECT 'cudf::aggregation::COLLECT_LIST' + COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' Kind kind @@ -83,7 +84,9 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: size_type n ) except + - cdef unique_ptr[aggregation] make_collect_aggregation() except + + cdef unique_ptr[aggregation] make_collect_list_aggregation() except + + + cdef unique_ptr[aggregation] make_collect_set_aggregation() except + cdef unique_ptr[aggregation] make_udf_aggregation( udf_type type, diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 519565fa48c..39da6b26502 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -70,6 +70,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: column_in_metadata& set_nullability(bool nullable) column_in_metadata& set_list_column_as_map() column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_decimal_precision(uint8_t precision) column_in_metadata& child(size_type i) cdef cppclass table_input_metadata: diff --git a/python/cudf/cudf/_lib/cpp/lists/extract.pxd b/python/cudf/cudf/_lib/cpp/lists/extract.pxd new file mode 100644 index 00000000000..89fa893c17d --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/extract.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + +from cudf._lib.cpp.types cimport size_type + +cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] extract_list_element( + const lists_column_view, + size_type + ) except + diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd new file mode 100644 index 00000000000..55e8e09427c --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.types cimport order, null_order +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0f0ee35556a..2971aad8313 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,23 +10,29 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.sorting cimport ( + sort_lists as cpp_sort_lists +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport size_type, order, null_order from cudf._lib.column cimport Column from cudf._lib.table cimport Table +from cudf._lib.types 
cimport ( + underlying_type_t_null_order, underlying_type_t_order +) from cudf.core.dtypes import ListDtype +from cudf._lib.cpp.lists.extract cimport extract_list_element + def count_elements(Column col): - if not isinstance(col.dtype, ListDtype): - raise TypeError("col is not a list column.") # shared_ptr required because lists_column_view has no default # ctor @@ -58,3 +64,39 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): column_names=tbl._column_names, index_names=None if ignore_index else tbl._index_names ) + + +def sort_lists(Column col, bool ascending, str na_position): + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) + cdef null_order c_null_prec = ( + null_order.BEFORE if na_position == "first" else null_order.AFTER + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec) + ) + + return Column.from_unique_ptr(move(c_result)) + + +def extract_element(Column col, size_type index): + # shared_ptr required because lists_column_view has no default + # ctor + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(extract_list_element(list_view.get()[0], index)) + + result = Column.from_unique_ptr(move(c_result)) + return result diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 87179c02fe2..0158df46cc4 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,7 +20,8 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, is_categorical_dtype, is_list_dtype, - is_struct_dtype + is_struct_dtype, + is_decimal_dtype, ) from cudf._lib.utils cimport get_column_names @@ -310,7 +311,7 @@ cpdef write_parquet( for i, name in 
enumerate(table._column_names, num_index_cols_meta): tbl_meta.get().column_metadata[i].set_name(name.encode()) - _set_col_children_names( + _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] ) @@ -448,7 +449,7 @@ cdef class ParquetWriter: for i, name in enumerate(table._column_names, num_index_cols_meta): self.tbl_meta.get().column_metadata[i].set_name(name.encode()) - _set_col_children_names( + _set_col_metadata( table[name]._column, self.tbl_meta.get().column_metadata[i] ) @@ -546,14 +547,16 @@ cdef Column _update_column_struct_field_names( col.set_base_children(tuple(children)) return col -cdef _set_col_children_names(Column col, column_in_metadata& col_meta): +cdef _set_col_metadata(Column col, column_in_metadata& col_meta): if is_struct_dtype(col): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): col_meta.child(i).set_name(name.encode()) - _set_col_children_names(child_col, col_meta.child(i)) + _set_col_metadata(child_col, col_meta.child(i)) elif is_list_dtype(col): - _set_col_children_names(col.children[1], col_meta.child(1)) + _set_col_metadata(col.children[1], col_meta.child(1)) else: + if is_decimal_dtype(col): + col_meta.set_decimal_precision(col.dtype.precision) return diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index dba0abb9cf0..f97b45d8abf 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -34,8 +34,8 @@ cdef class Table: Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + A dict mapping column names to Columns index : Table A Table representing the (optional) index columns. 
""" @@ -109,7 +109,7 @@ cdef class Table: it += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict data_columns = [] for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) @@ -154,7 +154,7 @@ cdef class Table: column_idx += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict cdef size_type source_column_idx = 0 data_columns = [] for _ in column_names: diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 6698a47b416..4fe795e57a9 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -23,6 +23,7 @@ from cudf.utils.dtypes import ( is_categorical_dtype, is_list_dtype, is_struct_dtype, + is_decimal_dtype, ) @@ -80,7 +81,11 @@ cpdef generate_pandas_metadata(Table table, index): "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" ) - elif is_list_dtype(col) or is_struct_dtype(col): + elif ( + is_list_dtype(col) + or is_struct_dtype(col) + or is_decimal_dtype(col) + ): types.append(col.dtype.to_arrow()) else: types.append(np_to_pa_dtype(col.dtype)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1d3f73822a9..2204fbdea1f 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -7,7 +7,7 @@ import cudf from cudf._lib.copying import segmented_gather -from cudf._lib.lists import count_elements +from cudf._lib.lists import count_elements, extract_element, sort_lists from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -178,6 +178,38 @@ def __init__(self, column, parent=None): ) super().__init__(column=column, parent=parent) + def get(self, index): + """ + Extract element at the given index from each 
component + + Extract element from lists, tuples, or strings in + each element in the Series/Index. + + Parameters + ---------- + index : int + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]]) + >>> s.list.get(-1) + 0 3 + 1 5 + 2 6 + dtype: int64 + """ + min_col_list_len = self.len().min() + if -min_col_list_len <= index < min_col_list_len: + return self._return_or_inplace( + extract_element(self._column, index) + ) + else: + raise IndexError("list index out of range") + @property def leaves(self): """ @@ -285,3 +317,57 @@ def take(self, lists_indices): raise else: return res + + def sort_values( + self, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """ + Sort each list by the values. + + Sort the lists in ascending or descending order by some criterion. + + Parameters + ---------- + ascending : bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {'first', 'last'}, default 'last' + 'first' puts nulls at the beginning, 'last' puts nulls at the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, ..., n - 1. 
+ + Returns + ------- + ListColumn with each list sorted + + Notes + ----- + Difference from pandas: + * Not supporting: `inplace`, `kind` + + Examples + -------- + >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) + >>> s.list.sort_values(ascending=True, na_position="last") + 0 [2.0, 4.0, 9.0, nan] + 1 [2.0, 8.0, 8.0] + 2 [1.0, 2.0] + dtype: list + """ + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + raise NotImplementedError("`kind` not currently implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"Unknown `na_position` value {na_position}") + if is_list_dtype(self._column.children[1].dtype): + raise NotImplementedError("Nested lists sort is not supported.") + + return self._return_or_inplace( + sort_lists(self._column, ascending, na_position), + retain_index=not ignore_index, + ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 03743e4464b..0c580132290 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,12 +3,12 @@ from __future__ import annotations import itertools -from collections import OrderedDict from collections.abc import MutableMapping from typing import ( TYPE_CHECKING, Any, Callable, + Dict, Mapping, Optional, Tuple, @@ -18,8 +18,8 @@ import pandas as pd import cudf +from cudf.core import column from cudf.utils.utils import ( - OrderedColumnDict, cached_property, to_flat_dict, to_nested_dict, @@ -31,7 +31,7 @@ class ColumnAccessor(MutableMapping): - _data: "OrderedDict[Any, ColumnBase]" + _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] @@ -63,10 +63,26 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names + else: + # This code path is performance-critical for copies and should be + # modified with care. 
+ self._data = {} + if data: + data = dict(data) + # Faster than next(iter(data.values())) + column_length = len(data[next(iter(data))]) + for k, v in data.items(): + # Much faster to avoid the function call if possible; the + # extra isinstance is negligible if we do have to make a + # column from something else. + if not isinstance(v, column.ColumnBase): + v = column.as_column(v) + if len(v) != column_length: + raise ValueError("All columns must be of equal length") + self._data[k] = v - self._data = OrderedColumnDict(data) - self.multiindex = multiindex - self._level_names = level_names + self.multiindex = multiindex + self._level_names = level_names def __iter__(self): return self._data.__iter__() @@ -76,7 +92,6 @@ def __getitem__(self, key: Any) -> ColumnBase: def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) - self._clear_cache() def __delitem__(self, key: Any): self._data.__delitem__(key) @@ -144,6 +159,13 @@ def _grouped_data(self) -> MutableMapping: else: return self._data + @cached_property + def _column_length(self): + try: + return len(self._data[next(iter(self._data))]) + except StopIteration: + return 0 + def _clear_cache(self): cached_properties = "columns", "names", "_grouped_data" for attr in cached_properties: @@ -152,6 +174,10 @@ def _clear_cache(self): except AttributeError: pass + # Column length should only be cleared if no data is present. + if len(self._data) == 0 and hasattr(self, "_column_length"): + del self._column_length + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. @@ -169,7 +195,9 @@ def to_pandas_index(self) -> pd.Index: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name: Any, value: Any, loc: int = -1): + def insert( + self, name: Any, value: Any, loc: int = -1, validate: bool = True + ): """ Insert column into the ColumnAccessor at the specified location. 
@@ -199,6 +227,13 @@ def insert(self, name: Any, value: Any, loc: int = -1): if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") if loc == len(self._data): + if validate: + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) self._data[name] = value else: new_keys = self.names[:loc] + (name,) + self.names[loc:] @@ -270,16 +305,29 @@ def select_by_index(self, index: Any) -> ColumnAccessor: data, multiindex=self.multiindex, level_names=self.level_names, ) - def set_by_label(self, key: Any, value: Any): + def set_by_label(self, key: Any, value: Any, validate: bool = True): """ Add (or modify) column by name. Parameters ---------- - key : name of the column + key + name of the column value : column-like + The value to insert into the column. + validate : bool + If True, the provided value will be coerced to a column and + validated before setting (Default value = True). 
""" key = self._pad_key(key) + if validate: + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) + self._data[key] = value self._clear_cache() diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 812a20cba45..bd009a9ad84 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -52,7 +52,6 @@ is_struct_dtype, numeric_normalize_types, ) -from cudf.utils.utils import OrderedColumnDict T = TypeVar("T", bound="DataFrame") @@ -4854,7 +4853,7 @@ def hash_columns(self, columns=None): table_to_hash = self else: cols = [self[k]._column for k in columns] - table_to_hash = Frame(data=OrderedColumnDict(zip(columns, cols))) + table_to_hash = Frame(data=dict(zip(columns, cols))) return Series(table_to_hash._hash()).values diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bfcc2d125db..e6898b8c606 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -40,8 +40,8 @@ class Frame(libcudf.table.Table): Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + An dict mapping column names to Columns index : Table A Frame representing the (optional) index columns. 
""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7ed2157277c..9d4643da637 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3571,6 +3571,7 @@ def sort_values( 4 3 3 4 1 5 + dtype: int64 """ if inplace: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b3ba439cb15..76a02d5e74a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5222,7 +5222,7 @@ def test_memory_usage_multi(): def test_setitem_diff_size_list(list_input, key): gdf = cudf.datasets.randomdata(5) with pytest.raises( - ValueError, match=("All values must be of equal length") + ValueError, match=("All columns must be of equal length") ): gdf[key] = list_input diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 33812cfa7a7..2ab1382b34e 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+import functools import pandas as pd import pyarrow as pa @@ -159,3 +160,89 @@ def test_take_invalid(invalid, exception): gs = cudf.Series([[0, 1], [2, 3]]) with exception: gs.list.take(invalid) + + +def key_func_builder(x, na_position): + if x is None: + if na_position == "first": + return -1e8 + else: + return 1e8 + else: + return x + + +@pytest.mark.parametrize( + "data", + [ + [[4, 2, None, 9], [8, 8, 2], [2, 1]], + [[4, 2, None, 9], [8, 8, 2], None], + [[4, 2, None, 9], [], None], + ], +) +@pytest.mark.parametrize( + "index", + [ + None, + pd.Index(["a", "b", "c"]), + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + ], +) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_sort_values(data, index, ascending, na_position, ignore_index): + key_func = functools.partial(key_func_builder, na_position=na_position) + + ps = pd.Series(data, index=index) + gs = cudf.from_pandas(ps) + + expected = ps.apply( + lambda x: sorted(x, key=key_func, reverse=not ascending) + if x is not None + else None + ) + if ignore_index: + expected.reset_index(drop=True, inplace=True) + got = gs.list.sort_values( + ascending=ascending, na_position=na_position, ignore_index=ignore_index + ) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data, index, expect", + [ + ([[None, None], [None, None]], 0, [None, None]), + ([[1, 2], [3, 4]], 0, [1, 3]), + ([["a", "b"], ["c", "d"]], 1, ["b", "d"]), + ([[1, None], [None, 2]], 1, [None, 2]), + ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 1, [[3, 4], [7, 8]]), + ], +) +def test_get(data, index, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect) + got = sr.list.get(index) + assert_eq(expect, got) + + +def test_get_nested_lists(): + sr = cudf.Series( + [ + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [], [[3, 4], [7, 8]]], + [[], [[9, 10]], [[11, 12], [13, 14]]], + ] + ) + 
expect = cudf.Series([[[1, 2], [3, 4]], []]) + got = sr.list.get(0) + assert_eq(expect, got) + + +def test_get_nulls(): + with pytest.raises(IndexError, match="list index out of range"): + sr = cudf.Series([[], [], []]) + sr.list.get(100) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6d50e4b6fee..a7a11c95e30 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1026,7 +1026,7 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", range(0, 120)) def test_parquet_reader_list_num_rows(skip, tmpdir): num_rows = 128 src = pd.DataFrame( @@ -1043,7 +1043,8 @@ def test_parquet_reader_list_num_rows(skip, tmpdir): src.to_parquet(fname) assert os.path.exists(fname) - rows_to_read = min(3, num_rows - skip) + # make sure to leave a few rows at the end that we don't read + rows_to_read = min(3, (num_rows - skip) - 5) expect = src.iloc[skip:].head(rows_to_read) got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read) assert_eq(expect, got, check_dtype=False) @@ -1920,3 +1921,18 @@ def test_parquet_writer_nested(tmpdir, data): got = pd.read_parquet(fname) assert_eq(expect, got) + + +def test_parquet_writer_decimal(tmpdir): + from cudf.core.dtypes import Decimal64Dtype + + gdf = cudf.DataFrame({"val": [0.00, 0.01, 0.02]}) + + gdf["dec_val"] = gdf["val"].astype(Decimal64Dtype(7, 2)) + + fname = tmpdir.join("test_parquet_writer_decimal.parquet") + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert_eq(gdf, got) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 03a39f6fb4b..ba9fa734248 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -280,36 +280,6 @@ def __get__(self, instance, cls): return value -class ColumnValuesMappingMixin: - """ - 
Coerce provided values for the mapping to Columns. - """ - - def __setitem__(self, key, value): - - value = column.as_column(value) - super().__setitem__(key, value) - - -class EqualLengthValuesMappingMixin: - """ - Require all values in the mapping to have the same length. - """ - - def __setitem__(self, key, value): - if len(self) > 0: - first = next(iter(self.values())) - if len(value) != len(first): - raise ValueError("All values must be of equal length") - super().__setitem__(key, value) - - -class OrderedColumnDict( - ColumnValuesMappingMixin, EqualLengthValuesMappingMixin, OrderedDict -): - pass - - class NestedMappingMixin: """ Make missing values of a mapping empty instances