diff --git a/build.sh b/build.sh index 45074a6645f..c2eba134c35 100755 --- a/build.sh +++ b/build.sh @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then fi echo "$MSG" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 13f7f0e6267..9fb86b0b3c5 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . /opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f23296038f2..6f19f174da0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Copying build metrics results" mkdir -p "$WORKSPACE/build-metrics" cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log" fi gpuci_logger "Build conda pkg for libcudf_kafka" diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 8a2c9d9be7c..1699fc16a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,32 +3,11 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e -DEFAULT_CUDA_VER="11.5" -DEFAULT_PYTHON_VER="3.8" - -#Always upload cudf Python package +#Always upload cudf packages export UPLOAD_CUDF=1 - -#Upload libcudf once per CUDA -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]]; then - export UPLOAD_LIBCUDF=1 -else - export UPLOAD_LIBCUDF=0 -fi - -# upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_CUDF_KAFKA=1 -else - export UPLOAD_CUDF_KAFKA=0 -fi - -#We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_LIBCUDF_KAFKA=1 -else - export UPLOAD_LIBCUDF_KAFKA=0 -fi +export UPLOAD_LIBCUDF=1 +export UPLOAD_CUDF_KAFKA=1 +export UPLOAD_LIBCUDF_KAFKA=1 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 1105b9c194d..5575b69c226 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -42,10 +42,13 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake +# cmake-format rapids-cmake definitions +sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHORT_TAG}\/cmake-format-rapids-cmake.json"'/g' ci/checks/style.sh + # doxyfile update sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile -# RTD update +# sphinx docs update sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' 
docs/cudf/source/conf.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4637408110..90e94ffcc7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -425,13 +425,11 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu + src/strings/count_matches.cu src/strings/extract/extract.cu src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu - src/strings/findall.cu - src/strings/find.cu - src/strings/find_multiple.cu src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp @@ -441,6 +439,10 @@ add_library( src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/replace/replace_re.cu + src/strings/search/findall.cu + src/strings/search/findall_record.cu + src/strings/search/find.cu + src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu src/strings/split/split_record.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 57592de59af..13ef02efc99 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ca3a748ccad..5f23cbbafdd 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -32,8 +32,9 @@ inline auto make_cuda() { return std::make_shared inline auto make_pool_instance() { static rmm::mr::cuda_memory_resource cuda_mr; - static rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; - return std::shared_ptr(&pool_mr); + static auto pool_mr = + std::make_shared>(&cuda_mr); + return pool_mr; } } // namespace diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index 7cbdb8261b8..241ba4d5954 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -43,9 +43,7 @@ void BM_csv_read_varying_input(benchmark::State& state) cuio_source_sink_pair source_sink(source_type); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); cudf_io::csv_reader_options const read_options = @@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state) 
state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_read_varying_options(benchmark::State& state) @@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) - .line_terminator("\r\n") - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + .line_terminator("\r\n"); cudf_io::write_csv(options); cudf_io::csv_reader_options read_options = - cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()}) + cudf_io::csv_reader_options::builder(source_sink.make_source_info()) .use_cols_indexes(cols_to_read) .thousands('\'') .windowslinetermination(true) .comment('#') .prefix("BM_"); - size_t const chunk_size = csv_data.size() / num_chunks; + size_t const chunk_size = source_sink.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { @@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index ad1fafb7f0d..413a269bcb2 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_write_varying_options(benchmark::State& state) @@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const view = tbl->view(); std::string const na_per(na_per_len, '#'); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) .na_rep(na_per) .rows_per_chunk(rows_per_chunk); @@ -85,6 +84,7 @@ void 
BM_csv_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index e035cc10dc1..3743be8bd5a 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -16,6 +16,7 @@ #include +#include #include #include @@ -53,13 +54,25 @@ cudf_io::source_info cuio_source_sink_pair::make_source_info() cudf_io::sink_info cuio_source_sink_pair::make_sink_info() { switch (type) { - case io_type::VOID: return cudf_io::sink_info(); + case io_type::VOID: return cudf_io::sink_info(&void_sink); case io_type::FILEPATH: return cudf_io::sink_info(file_name); case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer); default: CUDF_FAIL("invalid output type"); } } +size_t cuio_source_sink_pair::size() +{ + switch (type) { + case io_type::VOID: return void_sink.bytes_written(); + case io_type::FILEPATH: + return static_cast( + std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg()); + case io_type::HOST_BUFFER: return buffer.size(); + default: CUDF_FAIL("invalid output type"); + } +} + std::vector dtypes_for_column_selection(std::vector const& data_types, column_selection col_sel) { diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 7107585dbcc..c74ee191d4e 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -39,6 +39,15 @@ std::string random_file_in_dir(std::string const& dir_path); * @brief Class to create a coupled `source_info` and `sink_info` of given type. 
*/ class cuio_source_sink_pair { + class bytes_written_only_sink : public cudf::io::data_sink { + size_t _bytes_written = 0; + + public: + void host_write(void const* data, size_t size) override { _bytes_written += size; } + void flush() override {} + size_t bytes_written() override { return _bytes_written; } + }; + public: cuio_source_sink_pair(io_type type); ~cuio_source_sink_pair() @@ -66,12 +75,15 @@ class cuio_source_sink_pair { */ cudf::io::sink_info make_sink_info(); + [[nodiscard]] size_t size(); + private: static temp_directory const tmpdir; io_type const type; std::vector buffer; std::string const file_name; + bytes_written_only_sink void_sink; }; /** diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index 56cca8d80a4..e15513275ee 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -66,13 +66,13 @@ void BM_orc_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& orc_data) +std::vector get_col_names(cudf_io::source_info const& source) { cudf_io::orc_reader_options const read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) - .num_rows(1); + cudf_io::orc_reader_options::builder(source).num_rows(1); return cudf_io::read_orc(read_options).metadata.column_names; } @@ -88,25 +88,26 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const use_np_dtypes = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; + // skip_rows is not supported on nested types auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), + int32_t(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector orc_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::orc_writer_options options = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{&orc_data}, view); + cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_orc(options); - auto const cols_to_read = select_column_names(get_col_names(orc_data), col_sel); + auto const cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::orc_reader_options read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) + cudf_io::orc_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .use_index(use_index) .use_np_dtypes(use_np_dtypes) @@ -148,6 +149,7 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ @@ -179,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection) 
->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Need an API to get the number of stripes to enable row_selection::STRIPES here BENCHMARK_DEFINE_F(OrcRead, row_selection) (::benchmark::State& state) { BM_orc_read_varying_options(state); } BENCHMARK_REGISTER_F(OrcRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 8}, {0b11}, // defaults {int32_t(cudf::type_id::EMPTY)}}) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 594feac41b1..50ae76e867c 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -62,6 +62,7 @@ void BM_orc_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_orc_write_varying_options(benchmark::State& state) @@ -98,6 +99,7 @@ void BM_orc_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index b4f8dc8c450..09194931498 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -66,14 +66,13 @@ void BM_parq_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& parquet_data) +std::vector get_col_names(cudf::io::source_info const& source) { cudf_io::parquet_reader_options const read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) - .num_rows(1); + cudf_io::parquet_reader_options::builder(source).num_rows(1); return cudf_io::read_parquet(read_options).metadata.column_names; } @@ -89,26 +88,26 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; + // No nested types here, because of https://github.com/rapidsai/cudf/issues/9970 auto const data_types = dtypes_for_column_selection( get_type_or_group({static_cast(type_group_id::INTEGRAL), static_cast(type_group_id::FLOATING_POINT), static_cast(type_group_id::FIXED_POINT), static_cast(type_group_id::TIMESTAMP), - static_cast(cudf::type_id::STRING), - static_cast(cudf::type_id::LIST)}), + static_cast(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector parquet_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::parquet_writer_options options = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&parquet_data}, view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(options); - auto const cols_to_read = select_column_names(get_col_names(parquet_data), col_sel); + auto const 
cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::parquet_reader_options read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) + cudf_io::parquet_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .convert_strings_to_categories(str_to_categories) .use_pandas_metadata(use_pandas_metadata) @@ -150,6 +149,7 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ @@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -// Disabled until we add an API to read metadata from a parquet file and determine num row groups. -// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 -/* +// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and +// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 4}, {0b01}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 937198d9048..8287c27f804 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -61,6 +61,7 @@ void BM_parq_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_parq_write_varying_options(benchmark::State& state) @@ -93,6 +94,7 @@ void BM_parq_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index cc41c0237c2..98eaba213e5 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -48,15 +49,17 @@ void PQ_write(benchmark::State& state) cudf::table_view view = tbl->view(); auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::parquet_writer_options opts = - 
cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(opts); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void PQ_write_chunked(benchmark::State& state) @@ -71,10 +74,11 @@ void PQ_write_chunked(benchmark::State& state) } auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::chunked_parquet_writer_options opts = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info()); + cudf_io::chunked_parquet_writer_options::builder(source_sink.make_sink_info()); cudf_io::parquet_chunked_writer writer(opts); std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr const& tbl) { writer.write(*tbl); @@ -84,6 +88,7 @@ void PQ_write_chunked(benchmark::State& state) state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \ diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 8039d7d065f..317db92ae8b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,64 +15,102 @@ */ #include -#include +#include #include #include #include -#include -#include + +#include + +#include #include #include -class Compaction : public cudf::benchmark { -}; +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list>) +{ + if constexpr (not std::is_same_v and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +} template -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); cudf::test::UniformRandomGenerator rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 100 == 0 ? false : true; }); - cudf::test::fixed_width_column_wrapper values(elements, elements + n_rows, valids); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ - -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) - -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +using data_type = nvbench::type_list; +using keep_option = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 980b353406a..fbcfabb4532 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -46,7 +46,7 @@ static void 
BM_contains(benchmark::State& state, contains_type ct) cudf::strings::count_re(input, "\\d+"); break; case contains_type::findall: // returns occurrences of matches - cudf::strings::findall_re(input, "\\d+"); + cudf::strings::findall(input, "\\d+"); break; } } diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index c964c85156c..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 5e465ed6991..eeebe38d873 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -347,7 +347,9 @@ implemented using asynchronous APIs on the default stream (e.g., stream 0). The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to +public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the +detail API also accepts a memory resource parameter, the stream parameter should be ideally placed +just *before* the memory resource. The public API will call the detail API and provide `rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. @@ -362,14 +364,14 @@ void external_function(...); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default) +void external_function(..., rmm::cuda_stream_view stream) } // namespace detail // cudf/src/implementation.cpp namespace detail{ - // defaulted stream parameter + // Use the stream parameter in the detail implementation. void external_function(..., rmm::cuda_stream_view stream){ - // implementation uses stream w/ async APIs + // Implementation uses the stream with async APIs. rmm::device_buffer buff(...,stream); CUDA_TRY(cudaMemcpyAsync(...,stream.value())); kernel<<<..., stream>>>(...); @@ -378,8 +380,8 @@ namespace detail{ } // namespace detail void external_function(...){ - CUDF_FUNC_RANGE(); // Auto generates NVTX range for lifetime of this function - detail::external_function(...); + CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. + detail::external_function(..., rmm::cuda_stream_default); } ``` diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -75,6 +75,18 @@ std::unique_ptr<table> drop_duplicates(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::unordered_drop_duplicates
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table>
unordered_drop_duplicates( + table_view const& input, + std::vector const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * @@ -94,5 +106,24 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6a85428d8f0..a7112ae415d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -387,9 +387,9 @@ class fixed_point { /** * @brief operator + (for adding two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are added
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are added + * If `_scale`s are equal, `_value`s are added. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are added. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -402,9 +402,9 @@ class fixed_point { /** * @brief operator - (for subtracting two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are subtracted
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are subtracted + * If `_scale`s are equal, `_value`s are subtracted. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are subtracted. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -417,7 +417,7 @@ class fixed_point { /** * @brief operator * (for multiplying two `fixed_point` numbers) * - * `_scale`s are added and `_value`s are multiplied + * `_scale`s are added and `_value`s are multiplied. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -430,7 +430,7 @@ class fixed_point { /** * @brief operator / (for dividing two `fixed_point` numbers) * - * `_scale`s are subtracted and `_value`s are divided + * `_scale`s are subtracted and `_value`s are divided. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -443,9 +443,9 @@ class fixed_point { /** * @brief operator == (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -458,9 +458,9 @@ class fixed_point { /** * @brief operator != (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -473,9 +473,9 @@ class fixed_point { /** * @brief operator <= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -488,9 +488,9 @@ class fixed_point { /** * @brief operator >= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -503,9 +503,9 @@ class fixed_point { /** * @brief operator < (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -518,9 +518,9 @@ class fixed_point { /** * @brief operator > (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
- * greater `_scale`, and then `_value`s are compared
+ * If `_scale`s are equal, `_value`s are compared.
+ * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+ * smaller `_scale`, and then the `_value`s are compared.
  *
  * @tparam Rep1 Representation type of number being added to `this`
  * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -534,7 +534,7 @@ class fixed_point {
   * @brief Method for creating a `fixed_point` number with a new `scale`
   *
   * The `fixed_point` number returned will have the same value, underlying representation and
-  * radix as `this`, the only thing changed is the scale
+  * radix as `this`, the only thing changed is the scale.
   *
   * @param scale The `scale` of the returned `fixed_point` number
   * @return `fixed_point` number with a new `scale`
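The scale-alignment rule above is easy to misread, so here is a small self-contained sketch of what the corrected wording implies. This is illustrative only and not part of the change set; it assumes the public `numeric::fixed_point` API from this header, including the `value()` accessor.

```cpp
#include <cudf/fixed_point/fixed_point.hpp>

#include <cassert>

int main()
{
  using namespace numeric;
  // 1.23 stored as 123 with scale -2; 4.5 stored as 45 with scale -1.
  decimal32 const a{scaled_integer<int32_t>{123, scale_type{-2}}};
  decimal32 const b{scaled_integer<int32_t>{45, scale_type{-1}}};

  // The operand with the larger `_scale` (-1) is shifted to the smaller
  // `_scale` (-2): 45 becomes 450. The `_value`s are then added:
  // 123 + 450 = 573, which represents 5.73 at scale -2.
  auto const sum = a + b;
  assert(sum.value() == 573);

  // Comparison operators align scales the same way before comparing values.
  assert(sum > b);
  return 0;
}
```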
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 108251dd646..c2187f056cf 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -72,7 +72,6 @@ class orc_reader_options {
   // Columns that should be read as Decimal128
   std::vector<std::string> _decimal128_columns;
-  bool _enable_decimal128 = true;
 
   friend orc_reader_options_builder;
 
@@ -152,11 +151,6 @@ class orc_reader_options {
    */
   std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
 
-  /**
-   * @brief Whether to use row index to speed-up reading.
-   */
-  bool is_enabled_decimal128() const { return _enable_decimal128; }
-
   // Setters
 
   /**
@@ -226,18 +220,13 @@ class orc_reader_options {
    *
    * @param val Vector of fully qualified column names.
    */
-  void set_decimal_cols_as_float(std::vector<std::string> val)
+  [[deprecated(
+    "Decimal to float conversion is deprecated and will be removed in a future release")]] void
+  set_decimal_cols_as_float(std::vector<std::string> val)
   {
     _decimal_cols_as_float = std::move(val);
   }
 
-  /**
-   * @brief Enable/Disable the use of decimal128 type
-   *
-   * @param use Boolean value to enable/disable.
-   */
-  void enable_decimal128(bool use) { _enable_decimal128 = use; }
-
   /**
    * @brief Set columns that should be read as 128-bit Decimal
    *
@@ -357,7 +346,10 @@ class orc_reader_options_builder {
    * @param val Vector of column names.
    * @return this for chaining.
    */
-  orc_reader_options_builder& decimal_cols_as_float(std::vector<std::string> val)
+  [[deprecated(
+    "Decimal to float conversion is deprecated and will be removed in a future "
+    "release")]] orc_reader_options_builder&
+  decimal_cols_as_float(std::vector<std::string> val)
   {
     options._decimal_cols_as_float = std::move(val);
     return *this;
@@ -375,17 +367,6 @@ class orc_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Enable/Disable use of decimal128 type
-   *
-   * @param use Boolean value to enable/disable.
-   */
-  orc_reader_options_builder& decimal128(bool use)
-  {
-    options.enable_decimal128(use);
-    return *this;
-  }
-
   /**
    * @brief move orc_reader_options member once it's built.
    */
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index 7551511d281..94039d81f31 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -189,7 +189,7 @@ std::unique_ptr<table> drop_nans(
  * @note if @p input.num_rows() is zero, there is no error, and an empty table
  * is returned.
  *
- * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches.
+ * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`.
  * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type.
  *
  * @param[in] input The input table_view to filter
@@ -214,7 +214,10 @@ enum class duplicate_keep_option {
 };
 
 /**
- * @brief Create a new table without duplicate rows
+ * @brief Create a new table without duplicate rows.
+ *
+ * The output table is sorted according to the lexicographic ordering of the data in the columns
+ * indexed by `keys`.
  *
  * Given an `input` table_view, each row is copied to output table if the corresponding
  * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep:
  * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
  * - KEEP_LAST: only the last of a sequence of duplicate rows is copied
  * - KEEP_NONE: no duplicate rows are copied
  *
- * @throws cudf::logic_error if The `input` row size mismatches with `keys`.
+ * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table.
  *
  * @param[in] input input table_view to copy only unique rows
  * @param[in] keys vector of indices representing key columns from `input`
- * @param[in] keep keep first entry, last entry, or no entries if duplicates found
+ * @param[in] keep keep first row, last row, or no rows of the found duplicates
  * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
  *            equal if null_equality::UNEQUAL
  * @param[in] null_precedence flag to denote nulls should appear before or after non-null items
  * @param[in] mr Device memory resource used to allocate the returned table's device
- *           memory
+ *            memory
  *
- * @return Table with unique rows as per specified `keep`.
+ * @return Table with sorted unique rows as specified by `keep`.
  */
 std::unique_ptr<table> drop_duplicates(
   table_view const& input,
@@ -244,37 +247,95 @@ std::unique_ptr<table> drop_duplicates(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Count the unique elements in the column_view
+ * @brief Create a new table without duplicate rows with hash-based algorithms.
+ *
+ * Given an `input` table_view, each row is copied to output table if the corresponding
+ * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which
+ * row is copied.
  *
- * Given an input column_view, number of unique elements in this column_view is returned
+ * The order of elements in the output table is not specified.
+ *
+ * @param[in] input input table_view to copy only unique rows
+ * @param[in] keys vector of indices representing key columns from `input`
+ * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
+ *            equal if null_equality::UNEQUAL
+ * @param[in] mr Device memory resource used to allocate the returned table's device
+ *            memory
+ *
+ * @return Table with unique rows in an unspecified order.
+ */
+std::unique_ptr<table> unordered_drop_duplicates(
+  table_view const& input,
+  std::vector<size_type> const& keys,
+  null_equality nulls_equal           = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
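For reviewers, a minimal call-site sketch of the hash-based API declared above; the signature and defaults come from this header, while the wrapper function itself is hypothetical.

```cpp
#include <cudf/stream_compaction.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

// Drop duplicate rows keyed on column 0. Unlike cudf::drop_duplicates, the
// result is not sorted and it is unspecified which row of each duplicate
// group survives; that relaxation is what lets the hash-based path skip
// the sort entirely.
std::unique_ptr<cudf::table> dedup_on_first_column(cudf::table_view const& input)
{
  return cudf::unordered_drop_duplicates(input, std::vector<cudf::size_type>{0});
}
```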
+
 /**
- * @brief Count the unique elements in the column_view
+ * @brief Count the number of consecutive groups of equivalent elements in a column.
  *
  * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
  * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
- * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
- * count.
+ * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count.
+ *
+ * `null`s are handled as equal.
  *
- * @param[in] input The column_view whose unique elements will be counted.
+ * @param[in] input The column_view whose number of distinct consecutive groups will be counted
  * @param[in] null_handling flag to include or ignore `null` while counting
- * @param[in] nan_handling flag to consider `NaN==null` or not.
+ * @param[in] nan_handling flag to consider `NaN==null` or not
  *
- * @return number of unique elements
+ * @return number of distinct consecutive groups in the column
  */
 cudf::size_type distinct_count(column_view const& input,
                                null_policy null_handling,
                                nan_policy nan_handling);
 
 /**
- * @brief Count the unique rows in a table.
- *
+ * @brief Count the number of consecutive groups of equivalent elements in a table.
  *
- * @param[in] input Table whose unique rows will be counted.
- * @param[in] nulls_equal flag to denote if null elements should be considered equal
- *            nulls are not equal if null_equality::UNEQUAL
+ * @param[in] input Table whose number of distinct consecutive groups will be counted
+ * @param[in] nulls_equal flag to denote if null elements should be considered equal.
+ *            nulls are not equal if null_equality::UNEQUAL.
  *
- * @return number of unique rows in the table
+ * @return number of distinct consecutive groups in the table
  */
 cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal = null_equality::EQUAL);
 
+/**
+ * @brief Count the unique elements in the column_view.
+ *
+ * If `nulls_equal == nulls_equal::UNEQUAL`, all `null`s are unique.
+ *
+ * Given an input column_view, number of unique elements in this column_view is returned.
+ *
+ * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
+ * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
+ * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
+ * count.
+ *
+ * `null`s are handled as equal.
+ *
+ * @param[in] input The column_view whose unique elements will be counted
+ * @param[in] null_handling flag to include or ignore `null` while counting
+ * @param[in] nan_handling flag to consider `NaN==null` or not
+ *
+ * @return number of unique elements
+ */
+cudf::size_type unordered_distinct_count(column_view const& input,
+                                         null_policy null_handling,
+                                         nan_policy nan_handling);
+
+/**
+ * @brief Count the unique rows in a table.
+ *
+ * @param[in] input Table whose unique rows will be counted
+ * @param[in] nulls_equal flag to denote if null elements should be considered equal.
+ *            nulls are not equal if null_equality::UNEQUAL.
+ *
+ * @return number of unique rows in the table
+ */
+cudf::size_type unordered_distinct_count(table_view const& input,
+                                         null_equality nulls_equal = null_equality::EQUAL);
+
 /** @} */
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 4cfd0b75cd4..0964e713592 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,30 +27,32 @@ namespace strings {
  */
 
 /**
- * @brief Returns a column with character position values where each
+ * @brief Returns a lists column with character position values where each
  * of the target strings are found in each string.
  *
- * The size of the output column is targets.size() * strings.size().
- * output[i] contains the position of target[i % targets.size()] in string[i/targets.size()]
+ * The size of the output column is `input.size()`.
+ * Each row of the output column is of size `targets.size()`.
+ *
+ * `output[i,j]` contains the position of `targets[j]` in `input[i]`
  *
  * @code{.pseudo}
  * Example:
- * s = ["abc","def"]
- * t = ["a","c","e"]
- * r = find_multiple(s,t)
- * r is now [ 0, 2,-1,   // for "abc": "a" at pos 0, "c" at pos 2, "e" not found
- *           -1,-1, 1 ]  // for "def": "a" and "b" not found, "e" at pos 1
+ * s = ["abc", "def"]
+ * t = ["a", "c", "e"]
+ * r = find_multiple(s, t)
+ * r is now {[ 0, 2,-1],  // for "abc": "a" at pos 0, "c" at pos 2, "e" not found
+ *           [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1
  * @endcode
  *
- * @throw cudf::logic_error targets is empty or contains nulls
+ * @throw cudf::logic_error if `targets` is empty or contains nulls
  *
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation.
  * @param targets Strings to search for in each string.
  * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New integer column with character position values.
+ * @return Lists column with character position values.
  */
 std::unique_ptr<column> find_multiple(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   strings_column_view const& targets,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index 6c3139747af..4207cddbafb 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,7 +52,37 @@ namespace strings {
  * @param mr Device memory resource used to allocate the returned table's device memory.
  * @return New table of strings columns.
  */
-std::unique_ptr<table> findall_re(
+std::unique_ptr<table> findall(
   strings_column_view const& strings,
   std::string const& pattern,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a lists column of strings for each matching occurrence of the
+ * regex pattern within each string.
+ *
+ * @code{.pseudo}
+ * Example:
+ *   s = ["bunny", "rabbit", "hare", "dog"]
+ *   r = findall_record(s, "[ab]")
+ *   r is now a lists column like:
+ *    [ ["b"]
+ *      ["a","b","b"]
+ *      ["a"]
+ *      null ]
+ * @endcode
+ *
+ * A null output row results if the pattern is not found in the corresponding row
+ * input string.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern Regex pattern to match within each string.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New lists column of strings.
+ */
+std::unique_ptr<column> findall_record(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
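A short sketch contrasting the renamed `findall` with the new `findall_record`. Illustrative only; it leans on the cudf test column wrappers for input construction, which is an assumption of convenience rather than part of this change.

```cpp
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void findall_demo()
{
  // Same data as the doc example: ["bunny", "rabbit", "hare", "dog"]
  cudf::test::strings_column_wrapper input({"bunny", "rabbit", "hare", "dog"});
  auto const view = cudf::strings_column_view(input);

  // findall (formerly findall_re) returns a table of strings columns,
  // one column per match position found.
  auto const table_result = cudf::strings::findall(view, "[ab]");

  // findall_record returns a lists column with one row per input row:
  // [["b"], ["a","b","b"], ["a"], null]
  auto const lists_result = cudf::strings::findall_record(view, "[ab]");
}
```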
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 43cc059eddd..9d75295cd39 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,9 +19,6 @@
 #include
 #include
 
-#include
-#include
-
 namespace nvtext {
 
 /**
@@ -43,6 +40,8 @@ struct hashed_vocabulary {
   std::unique_ptr<cudf::column> table;             // uint64
   std::unique_ptr<cudf::column> bin_coefficients;  // uint64
   std::unique_ptr<cudf::column> bin_offsets;       // uint16
+  std::unique_ptr<cudf::column> cp_metadata;       // uint32
+  std::unique_ptr<cudf::column> aux_cp_table;      // uint64
 };
 
 /**
diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index bac6697da82..33c369b254f 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 import argparse
 import os
@@ -34,49 +34,63 @@
 # build a map of the log entries
 entries = {}
 with open(log_file, "r") as log:
+    last = 0
+    files = {}
     for line in log:
         entry = line.split()
         if len(entry) > 4:
-            elapsed = int(entry[1]) - int(entry[0])
             obj_file = entry[3]
             file_size = (
                 os.path.getsize(os.path.join(log_path, obj_file))
                 if os.path.exists(obj_file)
                 else 0
             )
-            entries[entry[3]] = (elapsed, file_size)
+            start = int(entry[0])
+            end = int(entry[1])
+            # logic based on ninjatracing
+            if end < last:
+                files = {}
+            last = end
+            files.setdefault(entry[4], (entry[3], start, end, file_size))
 
-# check file could be loaded
+    # build entries from files dict
+    for entry in files.values():
+        entries[entry[0]] = (entry[1], entry[2], entry[3])
+
+# check file could be loaded and we have entries to report
 if len(entries) == 0:
     print("Could not parse", log_file)
     exit()
 
-# sort the keys by build time (descending order)
-keys = list(entries.keys())
-sl = sorted(keys, key=lambda k: entries[k][0], reverse=True)
+# sort the entries by build-time (descending order)
+sorted_list = sorted(
+    list(entries.keys()),
+    key=lambda k: entries[k][1] - entries[k][0],
+    reverse=True,
+)
 
-if output_fmt == "xml":
-    # output results in XML format
+
+# output results in XML format
+def output_xml(entries, sorted_list, args):
     root = ET.Element("testsuites")
     testsuite = ET.Element(
         "testsuite",
         attrib={
             "name": "build-time",
-            "tests": str(len(keys)),
+            "tests": str(len(sorted_list)),
             "failures": str(0),
             "errors": str(0),
         },
     )
     root.append(testsuite)
-    for key in sl:
-        entry = entries[key]
-        elapsed = float(entry[0]) / 1000
+    for name in sorted_list:
+        entry = entries[name]
+        build_time = float(entry[1] - entry[0]) / 1000
         item = ET.Element(
             "testcase",
             attrib={
                 "classname": "BuildTime",
-                "name": key,
-                "time": str(elapsed),
+                "name": name,
+                "time": str(build_time),
             },
         )
         testsuite.append(item)
@@ -85,62 +99,219 @@
     xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="   ")
     print(xmlstr)
 
-elif output_fmt == "html":
-    # output results in HTML format
-    print("<html><head><title>Sorted Ninja Build Times</title>")
-    # Note: Jenkins does not support style defined in the html
+
+# utility converts a millisecond value to a column width in pixels
+def time_to_width(value, end):
+    # map a value from (0,end) to (0,1000)
+    r = (float(value) / float(end)) * 1000.0
+    return int(r)
+
+
+# assign each entry to a thread by analyzing the start/end times and
+# slotting them into thread buckets where they fit
+def assign_entries_to_threads(entries):
+    # first sort the entries' keys by end timestamp
+    sorted_keys = sorted(
+        list(entries.keys()), key=lambda k: entries[k][1], reverse=True
+    )
+
+    # build the chart data by assigning entries to threads
+    results = {}
+    threads = []
+    for name in sorted_keys:
+        entry = entries[name]
+
+        # assign this entry by finding the first available thread identified
+        # by the thread's current start time greater than the entry's end time
+        tid = -1
+        for t in range(len(threads)):
+            if threads[t] >= entry[1]:
+                threads[t] = entry[0]
+                tid = t
+                break
+
+        # if no current thread found, create a new one with this entry
+        if tid < 0:
+            threads.append(entry[0])
+            tid = len(threads) - 1
+
+        # add entry name to the array associated with this tid
+        if tid not in results.keys():
+            results[tid] = []
+        results[tid].append(name)
+
+    # first entry has the last end time
+    end_time = entries[sorted_keys[0]][1]
+
+    # return the threaded entries and the last end time
+    return (results, end_time)
+
+
+# output chart results in HTML format
+def output_html(entries, sorted_list, args): + print("Build Metrics Report") + # Note: Jenkins does not support javascript nor style defined in the html # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: print("

", args.msg, "

") - print("
") - print( - "", - "", - "", - sep="", - ) - summary = {"red": 0, "yellow": 0, "green": 0} + + # map entries to threads + # the end_time is used to scale all the entries to a fixed output width + threads, end_time = assign_entries_to_threads(entries) + + # color ranges for build times + summary = {"red": 0, "yellow": 0, "green": 0, "white": 0} red = "bgcolor='#FFBBD0'" yellow = "bgcolor='#FFFF80'" green = "bgcolor='#AAFFBD'" - for key in sl: - result = entries[key] - elapsed = result[0] - color = green - if elapsed > 300000: # 5 minutes - color = red - summary["red"] += 1 - elif elapsed > 120000: # 2 minutes - color = yellow - summary["yellow"] += 1 - else: - summary["green"] += 1 + white = "bgcolor='#FFFFFF'" + + # create the build-time chart + print("
FileCompile time
(ms)
Size
(bytes)
") + for tid in range(len(threads)): + names = threads[tid] + # sort the names for this thread by start time + names = sorted(names, key=lambda k: entries[k][0]) + + # use the last entry's end time as the total row size + # (this is an estimate and does not have to be exact) + last_entry = entries[names[len(names) - 1]] + last_time = time_to_width(last_entry[1], end_time) print( - "", + "") + + # done with the chart + print("
", - key, - "", - result[0], - "", - result[1], - "
", sep="", ) - print("

") + + prev_end = 0 # used for spacing between entries + + # write out each entry for this thread as a column for a single row + for name in names: + entry = entries[name] + start = entry[0] + end = entry[1] + + # this handles minor gaps between end of the + # previous entry and the start of the next + if prev_end > 0 and start > prev_end: + size = time_to_width(start - prev_end, end_time) + print("") + # adjust for the cellspacing + prev_end = end + int(end_time / 500) + + # format the build-time + build_time = end - start + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + + # assign color and accumulate legend values + color = white + if build_time > 300000: # 5 minutes + color = red + summary["red"] += 1 + elif build_time > 120000: # 2 minutes + color = yellow + summary["yellow"] += 1 + elif build_time > 1000: # 1 second + color = green + summary["green"] += 1 + else: + summary["white"] += 1 + + # compute the pixel width based on build-time + size = max(time_to_width(build_time, end_time), 2) + # output the column for this entry + print("") + # update the entry with just the computed output info + entries[name] = (build_time_str, color, entry[2]) + + # add a filler column at the end of each row + print("
", end="") + # use a slightly smaller, fixed-width font + print("", end="") + + # add the file-name if it fits, otherwise, truncate the name + file_name = os.path.basename(name) + if len(file_name) + 3 > size / 7: + abbr_size = int(size / 7) - 3 + if abbr_size > 1: + print(file_name[:abbr_size], "...", sep="", end="") + else: + print(file_name, end="") + # done with this entry + print("

") + + # output detail table in build-time descending order + print("") + print( + "", + "", + "", + sep="", + ) + for name in sorted_list: + entry = entries[name] + build_time_str = entry[0] + color = entry[1] + file_size = entry[2] + + # format file size + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + + # output entry row + print("", sep="", end="") + print("", sep="", end="") + print("", sep="") + + print("
FileCompile timeSize
", name, "", build_time_str, "", file_size_str, "

") + # include summary table with color legend + print("") print("time > 5 minutes") print("") print("2 minutes < time < 5 minutes") print("") - print("time < 2 minutes") + print("1 second < time < 2 minutes") print("") + print("time < 1 second") + print("") print("
", summary["red"], "
", summary["yellow"], "
", summary["green"], "
", summary["white"], "
") -else: - # output results in CSV format + +# output results in CSV format +def output_csv(entries, sorted_list, args): print("time,size,file") - for key in sl: - result = entries[key] - print(result[0], result[1], key, sep=",") + for name in sorted_list: + entry = entries[name] + build_time = entry[1] - entry[0] + file_size = entry[2] + print(build_time, file_size, name, sep=",") + + +if output_fmt == "xml": + output_xml(entries, sorted_list, args) +elif output_fmt == "html": + output_html(entries, sorted_list, args) +else: + output_csv(entries, sorted_list, args) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -57,26 +58,29 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + + // Drop duplicates from the combined keys, then sort the result. + // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fd86d8ec7d4..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,15 +217,15 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -120,16 +121,17 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
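set_keys gets the same treatment just above: de-duplicate via the hash-based path, then sort. Its observable behavior is unchanged; a usage sketch, assuming the public entry point declared in cudf/dictionary/update_keys.hpp:

#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/update_keys.hpp>

std::unique_ptr<cudf::column> with_new_keys(cudf::column_view const& dict_col,
                                            cudf::column_view const& new_keys)
{
  // Rows whose current key is absent from new_keys become null
  // (computed via the detail::contains call shown above).
  return cudf::dictionary::set_keys(cudf::dictionary_column_view(dict_col), new_keys);
}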
@@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include -#include #include #include @@ -219,20 +219,18 @@ std::pair, std::vector> groupby::scan groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); + auto const stream = rmm::cuda_stream_default; + auto grouped_keys = helper().sorted_keys(stream, mr); - auto const& group_offsets = helper().group_offsets(rmm::cuda_stream_default); - std::vector group_offsets_vector(group_offsets.size()); - thrust::copy(thrust::device_pointer_cast(group_offsets.begin()), - thrust::device_pointer_cast(group_offsets.end()), - group_offsets_vector.begin()); + auto const& group_offsets = helper().group_offsets(stream); + auto const group_offsets_vector = cudf::detail::make_std_vector_sync(group_offsets, stream); - if (values.num_columns()) { + if (not values.is_empty()) { auto grouped_values = cudf::detail::gather(values, - helper().key_sort_order(rmm::cuda_stream_default), + helper().key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - rmm::cuda_stream_default, + stream, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 817b9fd7b01..f133b79a27e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -232,7 +232,6 @@ size_t gather_stream_info(const size_t stripe_index, */ auto decimal_column_type(std::vector const& float64_columns, std::vector const& decimal128_columns, - bool is_decimal128_enabled, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { @@ -244,7 +243,7 @@ auto decimal_column_type(std::vector const& float64_columns, }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); @@ -255,9 +254,6 @@ auto decimal_column_type(std::vector const& float64_columns, .precision.value_or(cuda::std::numeric_limits::digits10); if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; - CUDF_EXPECTS(is_decimal128_enabled, - "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " - "decimal128 use"); return type_id::DECIMAL128; } @@ -754,8 +750,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -900,7 +895,6 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); - is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table 
reader::impl::compute_timezone_table( @@ -964,8 +958,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index e8aa298012b..1e586bcde00 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -223,7 +223,6 @@ class reader::impl { bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; - bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; }; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index c2115c3caa4..21bfd8120f7 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -68,7 +68,7 @@ class make_pair_function { { // Compute the hash value of row `i` auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(std::move(row_hash_value), std::move(i)); + return cuco::make_pair(row_hash_value, i); } private: diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index f38e653c4a6..e492968b8a6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,7 +45,7 @@ struct make_pair_function_semi { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast(i), 0); } }; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 8563a2a3bd3..39fe0b60c8c 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -49,7 +49,7 @@ struct make_pair_function { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast(i), 0); } }; diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
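With the NUNIQUE reduction above now delegating to unordered_distinct_count, the hash-based count is also reachable directly. A sketch against the public overload added later in this diff (defaults assumed):

#include <cudf/column/column_view.hpp>
#include <cudf/stream_compaction.hpp>

cudf::size_type nunique(cudf::column_view const& col)
{
  // Hash-based distinct count: no sort pass, nulls excluded, NaN kept as a value.
  return cudf::unordered_distinct_count(
    col, cudf::null_policy::EXCLUDE, cudf::nan_policy::NAN_IS_VALID);
}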
@@ -14,14 +14,18 @@ * limitations under the License. */ +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -30,39 +34,19 @@ #include #include +#include + +#include +#include +#include +#include #include namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector{}, - std::vector{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], sorted_row_index[i - 1])); - }); -} - +namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. * * @tparam T The type of `column_device_view` */ @@ -76,97 +60,199 @@ struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
* - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ - template ::value>* = nullptr> + template >* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + check_for_nan(device_view)); } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ - template ::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) + { + return false; + } +}; + +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. + */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template >* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. 
+ template >* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } }; +} // namespace + +cudf::size_type distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +} + +cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); +} cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() || input.null_count() == input.size()) { return 0; } - - cudf::size_type nrows = input.size(); - - bool has_nan = false; - // Check for Nans - // Checking for nulls in input and flag nan_handling, as the count will - // only get affected if these two conditions are true. NAN will only be - // be an extra if nan_handling was NAN_IS_NULL and input also had null, which - // will increase the count by 1. 
- if (input.has_nulls() and nan_handling == nan_policy::NAN_IS_NULL) { - has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); - } + auto const num_rows = input.size(); - auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + if (num_rows == 0 or num_rows == input.null_count()) { return 0; } - // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const should_check_nan = cudf::is_floating_point(input.type()); + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, + *table_ptr, + *table_ptr, + null_equality::EQUAL); - if (null_handling == null_policy::EXCLUDE and input.has_nulls()) - return --count; - else - return count; + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { + auto const is_null = device_view.is_null(i); + auto const is_nan = nan_is_null and should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } + if (i == 0) { return true; } + if (count_nulls and nan_is_null and (is_nan or is_null)) { + auto const prev_is_nan = + should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return not(prev_is_nan or device_view.is_null(i - 1)); + } + return not comp(i, i - 1); + }); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + if (0 == input.size() or input.null_count() == input.size()) { return 0; } + + auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + + // Check for nulls. If the null policy is EXCLUDE and null values were found, + // we decrement the count. + auto const has_null = input.has_nulls(); + if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + + // Check for NaNs. There are two cases that can lead to decrementing the + // count. The first case is when the input has no nulls, but has NaN values + // handled as a null via NAN_IS_NULL and has a policy to EXCLUDE null values + // from the count. The second case is when the input has null values and NaN + // values handled as nulls via NAN_IS_NULL. Regardless of whether the null + // policy is set to EXCLUDE, we decrement the count to avoid double-counting + // null and NaN as distinct entities. 
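// A concrete walk-through of the decrement rules above (values hypothetical):
//   input = [1.0, NaN, null]  ->  table-level unordered_distinct_count = 3
//   NAN_IS_NULL  + INCLUDE: NaN and null merge into one entity  -> 3 - 1 = 2
//   NAN_IS_NULL  + EXCLUDE: the merged null entity is dropped   -> 3 - 1 - 1 = 1
//   NAN_IS_VALID + EXCLUDE: only the null row is dropped        -> 3 - 1 = 2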
+ auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + return count; +} } // namespace detail cudf::size_type distinct_count(column_view const& input, @@ -183,4 +269,18 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa return detail::distinct_count(input, nulls_equal); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, nulls_equal); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. */ -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include #include #include #include +#include #include #include #include @@ -37,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -106,7 +110,7 @@ std::unique_ptr drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -130,6 +134,62 @@ std::unique_ptr
drop_duplicates(table_view const& input, mr); } +std::unique_ptr<table>
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { + return empty_like(input); + } + + auto keys_view = input.select(keys); + auto table_ptr = cudf::table_device_view::create(keys_view, stream); + auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)}; + auto const num_rows{table_ptr->num_rows()}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + // insert unique indices into the map. + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + + auto counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector index_exists_in_map(num_rows, stream, mr); + // enumerate all indices to check if they are present in the map. + key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key); + + auto const output_size{key_map.get_size()}; + + // write unique indices to a numeric column + auto unique_indices = cudf::make_numeric_column( + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + num_rows, + index_exists_in_map.begin(), + mutable_view->begin(), + thrust::identity{}); + + // run gather operation to establish new order + return detail::gather(input, + unique_indices->view(), + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + } // namespace detail std::unique_ptr
drop_duplicates(table_view const& input, @@ -144,4 +204,13 @@ std::unique_ptr<table>
drop_duplicates(table_view const& input, input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr<table>
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh new file mode 100644 index 00000000000..8ba9223a1bc --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "stream_compaction_common.hpp" + +namespace cudf { +namespace detail { + +/** + * @brief Device callable to hash a given row. + */ +template +class compaction_hash { + public: + compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} + + __device__ inline auto operator()(size_type i) const noexcept + { + auto hash = _hash(i); + return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; + } + + private: + row_hash _hash; +}; + +/** + * @brief Device functor to determine if a row is valid. + */ +class row_validity { + public: + row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ inline bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..1d743eccdbe --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
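The hash remap in compaction_hash above exists because cuco::static_map reserves the empty-key sentinel, so any row that happens to hash to that value must be nudged off it. A scalar sketch of the idea (sentinel width assumed):

#include <cstdint>
#include <limits>

constexpr auto EMPTY_KEY_SENTINEL = std::numeric_limits<std::uint32_t>::max();

// A computed hash equal to the reserved sentinel is shifted by one so it can
// never collide with the map's "empty slot" marker.
std::uint32_t remap_hash(std::uint32_t h)
{
  return (h == EMPTY_KEY_SENTINEL) ? (h - 1) : h;
}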
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); +constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); + +using hash_type = cuco::detail::MurmurHash3_32; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +using hash_map_type = + cuco::static_map; + +using row_hash = cudf::row_hasher; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cu similarity index 87% rename from cpp/src/strings/count_matches.cuh rename to cpp/src/strings/count_matches.cu index c14142f4779..d0a6825666b 100644 --- a/cpp/src/strings/count_matches.cuh +++ b/cpp/src/strings/count_matches.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,13 @@ * limitations under the License. */ -#pragma once - +#include #include -#include #include #include #include -#include #include #include @@ -32,6 +29,7 @@ namespace cudf { namespace strings { namespace detail { +namespace { /** * @brief Functor counts the total matches to the given regex in each string. */ @@ -50,12 +48,13 @@ struct count_matches_fn { int32_t end = d_str.length(); while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { ++count; - begin = end; + begin = end + (begin == end); end = d_str.length(); } return count; } }; +} // namespace /** * @brief Returns a column of regex match counts for each string in the given column. @@ -67,11 +66,10 @@ struct count_matches_fn { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device const& d_prog, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Create output column auto counts = make_numeric_column( diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp new file mode 100644 index 00000000000..1339f2b1ebd --- /dev/null +++ b/cpp/src/strings/count_matches.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +class column_device_view; + +namespace strings { +namespace detail { + +class reprog_device; + +/** + * @brief Returns a column of regex match counts for each string in the given column. 
+ * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 584741298c2..c4749eae003 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7be88d01387..244cec1d780 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -830,6 +830,7 @@ class regex_compiler { m_prog.set_start_inst(andstack[andstack.size() - 1].id_first); m_prog.optimize1(); m_prog.optimize2(); + m_prog.check_for_errors(); m_prog.set_groups_count(cursubid); } }; @@ -926,6 +927,68 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } +/** + * @brief Check a specific instruction for errors. + * + * Currently this is checking for an infinite-loop condition as documented in this issue: + * https://github.com/rapidsai/cudf/issues/10006 + * + * Example instructions list created from pattern `(A?)+` + * ``` + * 0: CHAR c='A', next=2 + * 1: OR right=0, left=2, next=2 + * 2: RBRA id=1, next=4 + * 3: LBRA id=1, next=1 + * 4: OR right=3, left=5, next=5 + * 5: END + * ``` + * + * Following the example above, the instruction at `id==1` (OR) + * is being checked. If the instruction path returns to `id==1` + * without including the `0==CHAR` or `5==END` as in this example, + * then this would cause the runtime to go into an infinite-loop. + * + * It appears this example pattern is not valid. But Python interprets + * its behavior similarly to pattern `(A*)`. Handling this in the same + * way does not look feasible with the current implementation. + * + * @throw cudf::logic_error if instruction logic error is found + * + * @param id Instruction to check if repeated. + * @param next_id Next instruction to process. + */ +void reprog::check_for_errors(int32_t id, int32_t next_id) +{ + auto inst = inst_at(next_id); + while (inst.type == LBRA || inst.type == RBRA) { + next_id = inst.u2.next_id; + inst = inst_at(next_id); + } + if (inst.type == OR) { + CUDF_EXPECTS(next_id != id, "Unsupported regex pattern"); + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } +} + +/** + * @brief Check regex instruction set for any errors. + * + * Currently, this checks for OR instructions that eventually point back to themselves with only + * intervening capture group instructions between causing an infinite-loop during runtime + * evaluation. 
+ */ +void reprog::check_for_errors() +{ + for (auto id = 0; id < insts_count(); ++id) { + auto const inst = inst_at(id); + if (inst.type == OR) { + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } + } +} + #ifndef NDEBUG void reprog::print(regex_flags const flags) { @@ -933,83 +996,81 @@ void reprog::print(regex_flags const flags) printf("Instructions:\n"); for (std::size_t i = 0; i < _insts.size(); i++) { const reinst& inst = _insts[i]; - printf("%zu :", i); + printf("%3zu: ", i); switch (inst.type) { - default: printf("Unknown instruction: %d, nextid= %d", inst.type, inst.u2.next_id); break; + default: printf("Unknown instruction: %d, next=%d", inst.type, inst.u2.next_id); break; case CHAR: - if (inst.u1.c <= 32 || inst.u1.c >= 127) - printf( - "CHAR, c = '0x%02x', nextid= %d", static_cast(inst.u1.c), inst.u2.next_id); - else - printf("CHAR, c = '%c', nextid= %d", inst.u1.c, inst.u2.next_id); + if (inst.u1.c <= 32 || inst.u1.c >= 127) { + printf(" CHAR c='0x%02x', next=%d", static_cast(inst.u1.c), inst.u2.next_id); + } else { + printf(" CHAR c='%c', next=%d", inst.u1.c, inst.u2.next_id); + } break; - case RBRA: printf("RBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; - case LBRA: printf("LBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; + case RBRA: printf(" RBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; + case LBRA: printf(" LBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; case OR: - printf("OR, rightid=%d, leftid=%d, nextid=%d", - inst.u1.right_id, - inst.u2.left_id, - inst.u2.next_id); + printf( + " OR right=%d, left=%d, next=%d", inst.u1.right_id, inst.u2.left_id, inst.u2.next_id); break; - case STAR: printf("STAR, nextid= %d", inst.u2.next_id); break; - case PLUS: printf("PLUS, nextid= %d", inst.u2.next_id); break; - case QUEST: printf("QUEST, nextid= %d", inst.u2.next_id); break; - case ANY: printf("ANY, nextid= %d", inst.u2.next_id); break; - case ANYNL: printf("ANYNL, nextid= %d", inst.u2.next_id); break; - case NOP: printf("NOP, nextid= %d", inst.u2.next_id); break; + case STAR: printf(" STAR next=%d", inst.u2.next_id); break; + case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; + case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; + case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { - printf("BOL, c = "); + printf(" BOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } case EOL: { - printf("EOL, c = "); + printf(" EOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } - case CCLASS: printf("CCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); break; - case NCCLASS: - printf("NCCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); - break; - case BOW: printf("BOW, nextid= %d", inst.u2.next_id); break; - case NBOW: printf("NBOW, nextid= %d", inst.u2.next_id); break; - case END: printf("END"); break; + case CCLASS: printf(" CCLASS cls=%d , next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case NCCLASS: printf("NCCLASS cls=%d, next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case BOW: printf(" BOW next=%d", inst.u2.next_id); break; + 
case NBOW: printf(" NBOW next=%d", inst.u2.next_id); break; + case END: printf(" END"); break; } printf("\n"); } printf("startinst_id=%d\n", _startinst_id); if (_startinst_ids.size() > 0) { - printf("startinst_ids:"); - for (size_t i = 0; i < _startinst_ids.size(); i++) + printf("startinst_ids: ["); + for (size_t i = 0; i < _startinst_ids.size(); i++) { printf(" %d", _startinst_ids[i]); - printf("\n"); + } + printf("]\n"); } int count = static_cast(_classes.size()); printf("\nClasses %d\n", count); for (int i = 0; i < count; i++) { const reclass& cls = _classes[i]; - int len = static_cast(cls.literals.size()); + auto const size = static_cast(cls.literals.size()); printf("%2d: ", i); - for (int j = 0; j < len; j += 2) { + for (int j = 0; j < size; j += 2) { char32_t c1 = cls.literals[j]; char32_t c2 = cls.literals[j + 1]; - if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) + if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) { printf("0x%02x-0x%02x", static_cast(c1), static_cast(c2)); - else + } else { printf("%c-%c", static_cast(c1), static_cast(c2)); - if ((j + 2) < len) printf(", "); + } + if ((j + 2) < size) { printf(", "); } } printf("\n"); if (cls.builtins) { @@ -1024,7 +1085,7 @@ void reprog::print(regex_flags const flags) } printf("\n"); } - if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); + if (_num_capturing_groups) { printf("Number of capturing groups: %d\n", _num_capturing_groups); } } #endif diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 3131767de59..18735d0f980 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,14 +116,19 @@ class reprog { void optimize1(); void optimize2(); + void check_for_errors(); +#ifndef NDEBUG void print(regex_flags const flags); +#endif private: std::vector _insts; std::vector _classes; int32_t _startinst_id; std::vector _startinst_ids; // short-cut to speed-up ORs - int32_t _num_capturing_groups; + int32_t _num_capturing_groups{}; + + void check_for_errors(int32_t id, int32_t next_id); }; } // namespace detail diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/search/find.cu similarity index 100% rename from cpp/src/strings/find.cu rename to cpp/src/strings/search/find.cu diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu similarity index 67% rename from cpp/src/strings/find_multiple.cu rename to cpp/src/strings/search/find_multiple.cu index 72e7081cb7a..5756c239f1c 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -31,37 +33,32 @@ namespace cudf { namespace strings { namespace detail { std::unique_ptr find_multiple( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::INT32); - auto targets_count = targets.size(); + auto const strings_count = input.size(); + auto const targets_count = targets.size(); CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; auto targets_column = column_device_view::create(targets.parent(), stream); auto d_targets = *targets_column; + auto const total_count = strings_count * targets_count; + // create output column - auto total_count = strings_count * targets_count; - auto results = make_numeric_column(data_type{type_id::INT32}, - total_count, - rmm::device_buffer{0, stream, mr}, - 0, - stream, - mr); // no nulls - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); + auto results = make_numeric_column( + data_type{type_id::INT32}, total_count, rmm::device_buffer{0, stream, mr}, 0, stream, mr); + // fill output column with position values thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_count), - d_results, + results->mutable_view().begin(), [d_strings, d_targets, targets_count] __device__(size_type idx) { size_type str_idx = idx / targets_count; if (d_strings.is_null(str_idx)) return -1; @@ -70,18 +67,30 @@ std::unique_ptr find_multiple( return d_str.find(d_tgt); }); results->set_null_count(0); - return results; + + auto offsets = cudf::detail::sequence(strings_count + 1, + numeric_scalar(0), + numeric_scalar(targets_count), + stream, + mr); + return make_lists_column(strings_count, + std::move(offsets), + std::move(results), + 0, + rmm::device_buffer{0, stream, mr}, + stream, + mr); } } // namespace detail // external API -std::unique_ptr find_multiple(strings_column_view const& strings, +std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(strings, targets, rmm::cuda_stream_default, mr); + return detail::find_multiple(input, targets, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/search/findall.cu similarity index 95% rename from cpp/src/strings/findall.cu rename to cpp/src/strings/search/findall.cu index c82ab4f81c3..8fb754848d4 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -109,11 +109,11 @@ struct findall_count_fn : public findall_fn { } // namespace // -std::unique_ptr
findall_re( +std::unique_ptr<column>
findall( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = strings.size(); auto const d_strings = column_device_view::create(strings.parent(), stream); @@ -205,12 +205,12 @@ std::unique_ptr<column>
findall_re( // external API -std::unique_ptr<column>
findall_re(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr<column>
findall(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall_re(strings, pattern, mr); + return detail::findall(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu new file mode 100644 index 00000000000..9ffdb33f5f2 --- /dev/null +++ b/cpp/src/strings/search/findall_record.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief This functor handles extracting matched strings by applying the compiled regex pattern + * and creating string_index_pairs for all the substrings. + */ +template +struct findall_fn { + column_device_view const d_strings; + reprog_device prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type const idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + auto const spos = d_str.byte_offset(begin); // convert + auto const epos = d_str.byte_offset(end); // to bytes + + d_output[output_idx++] = string_index_pair{d_str.data() + spos, (epos - spos)}; + + begin = end + (begin == end); + end = d_str.length(); + } + } +}; + +} // namespace + +// +std::unique_ptr findall_record( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // compile regex into device object + auto const d_prog = + reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + + // Create lists offsets column + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, + d_offsets + strings_count, + [] __device__(auto const v) { return v > 0; }, + stream, + mr); + + auto const valid_count = strings_count - null_count; + // Return an empty lists column if there are no valid rows + if (valid_count == 0) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets + 
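// For illustration (hypothetical counts): per-row match counts [1, 3, 1, 0]
// exclusive-scan in place to offsets [0, 1, 4, 5, 5]; row i's matches then
// occupy slots [offsets[i], offsets[i+1]) of the child strings column, and
// the final element is the total number of matches read back below.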
thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Create indices vector with the total number of groups that will be extracted + auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); + + rmm::device_uvector indices(total_matches, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Build the string indices + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the resulting indices + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr findall_record(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall_record(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index c6dd11c1d82..62fd98d2027 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
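A short usage sketch for the findall_record API added above (illustrative only; it assumes the declaration lives in the public cudf/strings/findall.hpp header next to findall):

#include <cudf/column/column.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

// One LIST<STRING> row per input row, holding every non-overlapping match of
// the pattern; rows whose string is null or has no match come back as null
// rows, per the valid_if computation in the detail implementation above.
std::unique_ptr<cudf::column> dashed_tokens(cudf::strings_column_view const& input)
{
  return cudf::strings::findall_record(input, "(\\d+)-(\\w+)");
}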
@@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -199,12 +200,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - data_normalizer normalizer(stream, do_lower_case); - auto result = [&strings, &normalizer, stream] { - auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto result = [&] { + auto const cp_metadata = get_codepoint_metadata(stream); + auto const aux_table = get_aux_codepoint_data(stream); + auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + auto const offsets = strings.offsets(); + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index f3b642132e3..5af87f4de0e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -261,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) - : do_lower_case(do_lower_case) +data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case) + : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case} { - d_cp_metadata = detail::get_codepoint_metadata(stream); - d_aux_table = detail::get_aux_codepoint_data(stream); } uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream) const { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 1a9eb5ba997..927de5a74f9 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,17 @@ namespace detail { class data_normalizer { public: /** - * @brief Transfer to the GPU the metadata needed to normalize characters. + * @brief Create instance of the normalizer. * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param cp_metadata The code point metadata table to use for normalization. + * @param aux_table The auxiliary code point table. * @param do_lower_case If true, the normalizer will convert uppercase characters in the * input stream to lower case and strip accents from those characters. 
* If false, accented and uppercase characters are not transformed. */ - data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); + data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -84,7 +87,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream) const; private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index dcd241fc045..5e8de1ba244 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -57,22 +58,16 @@ struct update_strings_lengths_fn { /** * @brief Retrieve the code point metadata table. * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. - * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); /** - * @brief Retrieve the aux code point metadata table. - * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. + * @brief Retrieve the auxiliary code point metadata table. * * @param stream CUDA stream used for device memory operations and kernel launches. */ -aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 0259e8ce4f4..b5ad9724d72 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,6 @@ class wordpiece_tokenizer { * @param do_lower_case If true, the tokenizer will convert uppercase characters in the * input stream to lowercase and strip accents from those characters. * If false, accented and uppercase characters are not transformed. - * @param stream CUDA stream used for device memory operations and kernel launches. * @param max_word_length The length of the longest word that will be tokenized. Words * longer than this will simply be replaced by the unknown token * specified in the `vocab_file`. 
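The practical effect of these header changes, sketched below: the caller builds the metadata tables once, and the normalizer merely borrows device pointers, so constructing it no longer launches kernels or takes a stream. Names are taken from the surrounding diffs; this is a sketch, not a verbatim call site:

// Build the tables up front, then hand the normalizer raw device pointers.
auto const cp_metadata = nvtext::detail::get_codepoint_metadata(stream);
auto const aux_table   = nvtext::detail::get_aux_codepoint_data(stream);
data_normalizer const normalizer(cp_metadata.data(), aux_table.data(), /*do_lower_case=*/true);
auto normalized = normalizer.normalize(d_chars, d_offsets, num_strings, stream);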
@@ -72,7 +71,6 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 75c79381032..7cfdb4dea96 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -37,87 +36,32 @@ namespace nvtext { namespace detail { -namespace { -struct get_codepoint_metadata_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(codepoint_metadata_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + cp_section1_end, - table + codepoint_metadata_size, - codepoint_metadata_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - codepoint_metadata, - cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + cp_section2_begin, - cp_metadata_917505_917999, - (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - }; -}; - -struct get_aux_codepoint_data_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(aux_codepoint_data_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + aux_section1_end, - table + aux_codepoint_data_size, - aux_codepoint_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - aux_codepoint_data, - aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section2_begin, - aux_cp_data_44032_55203, - (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section3_begin, - aux_cp_data_70475_71099, - (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section4_begin, - aux_cp_data_119134_119232, - (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - } -}; -} // namespace - /** * @brief Retrieve the code point metadata table. 
* * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_codepoint_metadata; - - return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); + auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + cp_section1_end, + table + codepoint_metadata_size, + codepoint_metadata_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + codepoint_metadata, + cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + cp_section2_begin, + cp_metadata_917505_917999, + (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } /** @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_aux_codepoint_data; - - return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); + auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + aux_section1_end, + table + aux_codepoint_data_size, + aux_codepoint_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + aux_codepoint_data, + aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section2_begin, + aux_cp_data_44032_55203, + (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section3_begin, + aux_cp_data_70475_71099, + (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section4_begin, + aux_cp_data_119134_119232, + (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } namespace { @@ -293,10 +262,15 @@ std::unique_ptr load_vocabulary_file( cudaMemcpyHostToDevice, stream.value())); - // this just initializes some constant tables into device memory - // to help speed up the runtime - detail::get_codepoint_metadata(stream); - detail::get_aux_codepoint_data(stream); + auto cp_metadata = detail::get_codepoint_metadata(stream); + auto const cp_metadata_size = static_cast(cp_metadata.size()); + result.cp_metadata = std::make_unique( + cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release()); + + auto aux_cp_table = detail::get_aux_codepoint_data(stream); + auto const aux_cp_table_size = static_cast(aux_cp_table.size()); + 
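+  // (Note: release() hands the uvector's device buffer to the new
+  //  cudf::column, so each table's lifetime is now tied to this
+  //  hashed_vocabulary object rather than to the process-wide singleton
+  //  cache that the deleted code above maintained.)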
result.aux_cp_table = std::make_unique( + cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release()); return std::make_unique(std::move(result)); } diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 193cd80d9a6..1ac7dd0d8a1 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Create tokenizer wordpiece_tokenizer tokenizer( - vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream); + vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case); // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 00798e7e4e2..afd82f0bb5d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), - normalizer(stream, do_lower_case), + normalizer(vocab_table.cp_metadata->view().data(), + vocab_table.aux_cp_table->view().data(), + do_lower_case), max_sequence_length{max_sequence_length}, stride(stride), do_truncate(do_truncate), diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index dadeaf7d1e0..405c83ab872 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
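Taken together with the tokenizer changes above, vocabulary load is now the only place the tables are materialized. A construction sketch using the calls visible in this diff:

// The vocabulary carries the cp_metadata and aux_cp_table columns, so the
// tokenizer can be constructed without a stream and without touching the GPU.
wordpiece_tokenizer tokenizer(
  vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);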
@@ -30,7 +30,10 @@ #include #include +#include #include +#include +#include namespace cudf { namespace detail { @@ -38,29 +41,23 @@ namespace detail { std::pair, std::unique_ptr> encode( table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector drop_keys(input_table.num_columns()); + auto const num_cols = input_table.num_columns(); + + std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - // side effects of this function we are now dependent on: - // - resulting column elements are sorted ascending - // - nulls are sorted to the beginning - auto keys_table = cudf::detail::drop_duplicates(input_table, - drop_keys, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::AFTER, - stream, - mr); + auto unique_keys = cudf::detail::unordered_drop_duplicates( + input_table, drop_keys, null_equality::EQUAL, stream, mr); + + std::vector column_order(num_cols, order::ASCENDING); + std::vector null_precedence(num_cols, null_order::AFTER); + auto sorted_unique_keys = + cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr); - auto indices_column = - cudf::detail::lower_bound(keys_table->view(), - input_table, - std::vector(input_table.num_columns(), order::ASCENDING), - std::vector(input_table.num_columns(), null_order::AFTER), - stream, - mr); + auto indices_column = cudf::detail::lower_bound( + sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(keys_table), std::move(indices_column)); + return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e23403e68e4..6b5670630ec 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..78b52db5255 --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + // Removing 2 from expected to remove count for `XXX` and `YYY` + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected + 1, // +1 since `NAN` is not in std::set + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, 
null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type const expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 
1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,254 +14,98 @@ * limitations under the License. 
*/ -#include -#include -#include #include +#include #include #include #include #include + #include #include #include #include #include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); -} - -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) -{ - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansAndNull) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = std::set(input.begin(), 
input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansOnly) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNull) -{ - using T = float; + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, 
StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, col2}}; - - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicatesCommon, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; 
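+  // (For reference: KEEP_FIRST retains the first row of each group of
+  //  duplicate keys, KEEP_LAST the last, and KEEP_NONE drops every row whose
+  //  key appears more than once; the three expected tables below exercise
+  //  those options in that order.)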
cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -271,7 +115,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -284,7 +128,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -296,7 +140,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -309,88 +153,145 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, 
null_equality::UNEQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(DropDuplicates, KeepLastWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(DropDuplicates, KeepNoneWithNull) { - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, 
cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { +}; + +TEST_F(UnorderedDropDuplicates, NonNullTable) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyKeys) +TEST_F(UnorderedDropDuplicates, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 
1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 48c4aac9e8a..12a00aa35ab 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -274,6 +274,15 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } +TEST_F(StringsContainsTests, Errors) +{ + cudf::test::strings_column_wrapper input({"3", "33"}); + auto strings_view = cudf::strings_column_view(input); + + EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index a4cb27b7a9f..7b9f639f965 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,13 +41,16 @@ TEST_F(StringsFindMultipleTest, FindMultiple) cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); auto targets_view = cudf::strings_column_view(targets); - auto results = cudf::strings::find_multiple(strings_view, targets_view); - cudf::size_type total_count = static_cast(h_strings.size() * h_targets.size()); - EXPECT_EQ(total_count, results->size()); + auto results = cudf::strings::find_multiple(strings_view, targets_view); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{1, -1, -1, -1, 4, -1, -1}, + LCW{4, -1, 2, -1, -1, -1, 2}, + LCW{-1, -1, -1, -1, -1, -1, -1}, + LCW{-1, 2, 1, -1, -1, -1, -1}, + LCW{-1, -1, 1, 8, -1, -1, 1}, + LCW{-1, -1, -1, -1, -1, -1, -1}}); - cudf::test::fixed_width_column_wrapper expected( - {1, -1, -1, -1, 4, -1, -1, 4, -1, 2, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, - -1, 2, 1, -1, -1, -1, -1, -1, -1, 1, 8, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index d7bf162d36f..4b1305a870a 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
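A sketch of the shape the reworked find_multiple test above now expects (illustrative; names follow the diff): the result is a LIST column with one row per input string instead of a flat fixed-width column of strings.size() * targets.size() entries.

#include <cudf/column/column.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Each output row has targets.size() entries; entry i is the character
// position of the first occurrence of targets[i] in that row's string,
// or -1 when the target does not occur.
std::unique_ptr<cudf::column> first_positions(cudf::strings_column_view const& strings,
                                              cudf::strings_column_view const& targets)
{
  return cudf::strings::find_multiple(strings, targets);
}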
@@ -56,7 +56,7 @@ TEST_F(StringsFindallTests, FindallTest) nullptr}; std::string pattern = "(\\w+)"; - auto results = cudf::strings::findall_re(strings_view, pattern); + auto results = cudf::strings::findall(strings_view, pattern); EXPECT_TRUE(results->num_columns() == 2); cudf::test::strings_column_wrapper expected1( @@ -75,6 +75,28 @@ TEST_F(StringsFindallTests, FindallTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsFindallTests, FindallRecord) +{ + cudf::test::strings_column_wrapper input( + {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, + {1, 1, 1, 1, 1, 0, 1, 1}); + + auto results = cudf::strings::findall_record(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"3-A"}, + LCW{"4-May", "5-Day", "6-Hay"}, + LCW{"12-Dec", "2021-Jan"}, + LCW{}, + LCW{}, + LCW{}, + LCW{}, + LCW{"25-9000"}}, + valids); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. @@ -87,7 +109,7 @@ TEST_F(StringsFindallTests, MediumRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, medium_regex); + auto results = cudf::strings::findall(strings_view, medium_regex); EXPECT_TRUE(results->num_columns() == 2); std::vector h_expected1{"first words 1234", nullptr}; @@ -115,9 +137,11 @@ TEST_F(StringsFindallTests, LargeRegex) std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz", - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" + "12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "34" "5678901234567890", - "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop" + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn" + "op" "qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), @@ -125,7 +149,7 @@ TEST_F(StringsFindallTests, LargeRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, large_regex); + auto results = cudf::strings::findall(strings_view, large_regex); EXPECT_TRUE(results->num_columns() == 1); std::vector h_expected{large_regex.c_str(), nullptr, nullptr}; diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a4dd5ff9c8..d705504cc0c 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -22,7 +22,6 @@ Properties :toctree: api/ Index.empty - Index.gpu_values Index.has_duplicates Index.is_monotonic Index.is_monotonic_increasing @@ -93,9 +92,9 @@ Conversion :toctree: api/ Index.astype - Index.to_array Index.to_arrow Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 891bb3a1e61..cf5dd4a2a1d 100644 --- 
a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -408,13 +408,13 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - Series.to_array Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame - Series.to_gpu_array Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index a7e959a05a7..0034584a6f7 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -4550,7 +4550,7 @@ } ], "source": [ - "df['a'].to_array()" + "df['a'].to_numpy()" ] }, { @@ -4571,7 +4571,7 @@ } ], "source": [ - "ddf['a'].compute().to_array()" + "ddf['a'].compute().to_numpy()" ] }, { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 61981b34615..cb3234bf706 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -45,7 +45,6 @@ public final class ColumnVector extends ColumnView { NativeDepsLoader.loadNativeDeps(); } - private final OffHeapState offHeap; private Optional nullCount = Optional.empty(); private int refCount; @@ -56,14 +55,23 @@ public final class ColumnVector extends ColumnView { * owned by this instance. */ public ColumnVector(long nativePointer) { - super(getColumnViewFromColumn(nativePointer)); + super(new OffHeapState(nativePointer)); assert nativePointer != 0; - offHeap = new OffHeapState(nativePointer); MemoryCleaner.register(this, offHeap); this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer) { + long viewHandle = initViewHandle( + type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, + offsetBuffer, null, viewHandle); + } + /** * Create a new column vector based off of data already on the device. 
* @param type the type of the vector @@ -81,24 +89,29 @@ public ColumnVector(long nativePointer) { public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer) { - super(ColumnVector.initViewHandle( - type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, offsetBuffer, null)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer)); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (!type.equals(DType.STRING)) { assert offsetBuffer == null : "offsets are only supported for STRING"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, - offsetBuffer, null, viewHandle); MemoryCleaner.register(this, offHeap); this.nullCount = nullCount; - this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { + long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, + offsetBuffer, childHandles); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, + toClose, viewHandle); + } + /** * Create a new column vector based off of data already on the device with child columns. * @param type the type of the vector, typically a nested type @@ -118,16 +131,12 @@ public ColumnVector(DType type, long rows, Optional nullCount, public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { - super(initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, - offsetBuffer, childHandles)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer, toClose, childHandles)); if (!type.equals(DType.STRING) && !type.equals(DType.LIST)) { assert offsetBuffer == null : "offsets are only supported for STRING, LISTS"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, - toClose, viewHandle); MemoryCleaner.register(this, offHeap); this.refCount = 0; @@ -143,8 +152,7 @@ public ColumnVector(DType type, long rows, Optional nullCount, * @param contiguousBuffer the buffer that this is based off of. */ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { - super(viewAddress); - offHeap = new OffHeapState(viewAddress, contiguousBuffer); + super(new OffHeapState(viewAddress, contiguousBuffer)); MemoryCleaner.register(this, offHeap); // TODO we may want to ask for the null count anyways... this.nullCount = Optional.empty(); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index a2e080e02f6..8155fe79080 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ public class ColumnView implements AutoCloseable, BinaryOperable { protected final DType type; protected final long rows; protected final long nullCount; + protected final ColumnVector.OffHeapState offHeap; /** * Constructs a Column View given a native view address @@ -50,6 +51,22 @@ public class ColumnView implements AutoCloseable, BinaryOperable { this.type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); this.rows = ColumnView.getNativeRowCount(viewHandle); this.nullCount = ColumnView.getNativeNullCount(viewHandle); + this.offHeap = null; + } + + + /** + * Intended to be called from ColumnVector when it is being constructed. Because state creates a + * cudf::column_view instance and will close it in all cases, we don't want to have to double + * close it. + * @param state the state this view is based off of. + */ + protected ColumnView(ColumnVector.OffHeapState state) { + offHeap = state; + viewHandle = state.getViewHandle(); + type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); + rows = ColumnView.getNativeRowCount(viewHandle); + nullCount = ColumnView.getNativeNullCount(viewHandle); } /** @@ -265,7 +282,10 @@ public long getDeviceMemorySize() { @Override public void close() { - ColumnView.deleteColumnView(viewHandle); + // close the view handle so long as offHeap is not going to do it for us. + if (offHeap == null) { + ColumnView.deleteColumnView(viewHandle); + } viewHandle = 0; } @@ -2331,13 +2351,27 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * Null string entries return corresponding null output columns. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter) { + public final Table stringSplit(Scalar delimiter, int maxSplit) { assert type.equals(DType.STRING) : "column type must be a String"; assert delimiter != null : "delimiter may not be null"; assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle())); + return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + } + + /** + * Returns a list of columns by splitting each string using the specified delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + * @return New table of strings columns. 
+ */ + public final Table stringSplit(Scalar delimiter) { + return stringSplit(delimiter, -1); } /** @@ -2349,7 +2383,7 @@ public final Table stringSplit(Scalar delimiter) { */ public final Table stringSplit() { try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString); + return stringSplit(emptyString, -1); } } @@ -2362,7 +2396,7 @@ public final ColumnVector stringSplitRecord() { /** * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ public final ColumnVector stringSplitRecord(int maxSplit) { try (Scalar emptyString = Scalar.fromString("")) { @@ -2384,7 +2418,7 @@ public final ColumnVector stringSplitRecord(Scalar delimiter) { * string using the specified delimiter. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { @@ -3234,7 +3268,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. - * @param key ColumnView of search keys. + * @param keys ColumnView of search keys. * @param findOption Whether to find the first index of the key, or the last. * @return The resultant column of int32 indices */ @@ -3270,6 +3304,17 @@ public final Scalar getScalarElement(int index) { return new Scalar(getType(), getElement(getNativeView(), index)); } + /** + * Get the number of bytes needed to allocate a validity buffer for the given number of rows. + * According to cudf::bitmask_allocation_size_bytes, the padding boundary for the null mask is 64 bytes. + */ + static long getValidityBufferSize(int numRows) { + // number of bytes required = Math.ceil(number of bits / 8) + long actualBytes = ((long) numRows + 7) >> 3; + // pad to a multiple of the padding boundary (64 bytes) + return ((actualBytes + 63) >> 6) << 6; + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -3490,8 +3535,9 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle * delimiter. * @param columnView native handle of the cudf::column_view being operated on. * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ - private static native long[] stringSplit(long columnView, long delimiter); + private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); @@ -3686,7 +3732,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat * Native method to find the first (or last) index of each search key in the specified column, * in each row of a list column.
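The new getValidityBufferSize helper mirrors the C++ function its comment cites: one validity bit per row, rounded up to whole bytes, then padded to a 64-byte boundary. A quick worked example against the C++ side, assuming cudf::bitmask_allocation_size_bytes from <cudf/null_mask.hpp> with its default 64-byte padding:

    #include <cudf/null_mask.hpp>
    #include <cstddef>

    void validity_size_example() {
      // 100 rows -> ceil(100 / 8) = 13 bytes -> padded up to the 64-byte boundary
      std::size_t const bytes_100 = cudf::bitmask_allocation_size_bytes(100);  // 64
      // 600 rows -> ceil(600 / 8) = 75 bytes -> padded up to 128
      std::size_t const bytes_600 = cudf::bitmask_allocation_size_bytes(600);  // 128
    }

The Java bit-shift version computes the same values: ((100 + 7) >> 3) = 13, then ((13 + 63) >> 6) << 6 = 64.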
* @param nativeView the column view handle of the list - * @param scalarColumnHandle handle to the search key column + * @param keyColumnHandle handle to the search key column * @param isFindFirst Whether to find the first index of the key, or the last. * @return column handle of the resultant column of int32 indices */ @@ -3866,11 +3912,6 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; - /** - * Get the number of bytes needed to allocate a validity buffer for the given number of rows. - */ - static native long getNativeValidPointerSize(int size); - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. @@ -3960,7 +4001,7 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da DeviceMemoryBuffer mainValidDevBuff = null; DeviceMemoryBuffer mainOffsetsDevBuff = null; if (mainColValid != null) { - long validLen = getNativeValidPointerSize(mainColRows); + long validLen = getValidityBufferSize(mainColRows); mainValidDevBuff = DeviceMemoryBuffer.allocate(validLen); mainValidDevBuff.copyFromHostBuffer(mainColValid, 0, validLen); } @@ -4069,7 +4110,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows data.copyFromHostBuffer(dataBuffer, 0, dataLen); } if (validityBuffer != null) { - long validLen = getNativeValidPointerSize((int)rows); + long validLen = getValidityBufferSize((int)rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(validityBuffer, 0, validLen); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 742501be375..2e5b0202dc5 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,7 +307,7 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); } /** diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 0fe7d7a5df8..3abc6db385d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -199,7 +199,7 @@ public ColumnVector copyToDevice() { } HostMemoryBuffer hvalid = this.offHeap.valid; if (hvalid != null) { - long validLen = ColumnView.getNativeValidPointerSize((int) rows); + long validLen = ColumnView.getValidityBufferSize((int) rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(hvalid, 0, validLen); } @@ -858,7 +858,7 @@ public static HostColumnVector timestampNanoSecondsFromBoxedLongs(Long... 
values * Build */ - public static final class ColumnBuilder implements AutoCloseable { private DType type; private HostMemoryBuffer data; @@ -869,28 +869,78 @@ public static final class ColumnBuilder implements AutoCloseable { private boolean nullable; private long rows; private long estimatedRows; + private long rowCapacity = 0L; + private long validCapacity = 0L; private boolean built = false; private List childBuilders = new ArrayList<>(); + private Runnable nullHandler; - private int currentIndex = 0; - private int currentByteIndex = 0; - + // The value of currentIndex can't exceed Int32.Max. It is stored as a long to + // match HostMemoryBuffer.setXXX, which requires a long offset. + private long currentIndex = 0; + // Only for strings: the current position in the byte (data) buffer + private int currentStringByteIndex = 0; + // Use a bit shift instead of a multiply to transform a row offset into a byte offset + private int bitShiftBySize = 0; + private static final int bitShiftByOffset = (int)(Math.log(OFFSET_SIZE) / Math.log(2)); public ColumnBuilder(HostColumnVector.DataType type, long estimatedRows) { this.type = type.getType(); this.nullable = type.isNullable(); this.rows = 0; - this.estimatedRows = estimatedRows; + this.estimatedRows = Math.max(estimatedRows, 1L); + this.bitShiftBySize = (int)(Math.log(this.type.getSizeInBytes()) / Math.log(2)); + + // initialize the null handler according to the data type + this.setupNullHandler(); + for (int i = 0; i < type.getNumChildren(); i++) { childBuilders.add(new ColumnBuilder(type.getChild(i), estimatedRows)); } } + private void setupNullHandler() { + if (this.type == DType.LIST) { + this.nullHandler = () -> { + this.growListBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); + }; + } else if (this.type == DType.STRING) { + this.nullHandler = () -> { + this.growStringBuffersAndRows(0); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, currentStringByteIndex); + }; + } else if (this.type == DType.STRUCT) { + this.nullHandler = () -> { + this.growStructBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + for (ColumnBuilder childBuilder : childBuilders) { + childBuilder.appendNull(); + } + }; + } else { + this.nullHandler = () -> { + this.growFixedWidthBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + }; + } + } + public HostColumnVector build() { List hostColumnVectorCoreList = new ArrayList<>(); for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row size, because the valid buffer grows lazily. + if (valid != null) { + growValidBuffer(); + } HostColumnVector hostColumnVector = new HostColumnVector(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); built = true; @@ -902,6 +952,10 @@ private HostColumnVectorCore buildNestedInternal() { for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row size, because the valid buffer grows lazily.
+ if (valid != null) { + growValidBuffer(); + } return new HostColumnVectorCore(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); } @@ -929,71 +983,113 @@ public ColumnBuilder appendStructValues(StructData... inputList) { } /** - * A method that is responsible for growing the buffers as needed - * and incrementing the row counts when we append values or nulls. - * @param hasNull indicates whether the validity buffer needs to be considered, as the - * nullcount may not have been fully calculated yet - * @param length used for strings + * Grows the valid buffer lazily. The valid buffer won't be materialized until the first null + * value is appended. This method reuses rowCapacity to track the size of the column. + * Therefore, call the specific grow-buffer method to update rowCapacity before calling + * this method. + */ + private void growValidBuffer() { + if (valid == null) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + valid = HostMemoryBuffer.allocate(maskBytes); + valid.setMemory(0, valid.length, (byte) 0xFF); + validCapacity = rowCapacity; + return; + } + if (validCapacity < rowCapacity) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + HostMemoryBuffer newValid = HostMemoryBuffer.allocate(maskBytes); + newValid.setMemory(0, newValid.length, (byte) 0xFF); + valid = copyBuffer(newValid, valid); + validCapacity = rowCapacity; + } + } + + /** + * Automatically grows the data buffer for fixed-width columns as needed and increments + * the row count. Call this method before appending any value or null. */ - private void growBuffersAndRows(boolean hasNull, int length) { assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; rows++; - long targetDataSize = 0; - if (!type.isNestedType()) { - if (type.equals(DType.STRING)) { - targetDataSize = data == null ? length : currentByteIndex + length; - } else { - targetDataSize = data == null ? estimatedRows * type.getSizeInBytes() : rows * type.getSizeInBytes(); - } + private void growFixedWidthBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + if (data == null) { + data = HostMemoryBuffer.allocate(estimatedRows << bitShiftBySize); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); + data = copyBuffer(HostMemoryBuffer.allocate(newCap << bitShiftBySize), data); + rowCapacity = newCap; } + } - if (targetDataSize > 0) { - if (data == null) { - data = HostMemoryBuffer.allocate(targetDataSize); - } else { - long maxLen; - if (type.equals(DType.STRING)) { - maxLen = Integer.MAX_VALUE; - } else { - maxLen = Integer.MAX_VALUE * (long) type.getSizeInBytes(); - } - long oldLen = data.getLength(); - long newDataLen = Math.max(1, oldLen); - while (targetDataSize > newDataLen) { - newDataLen = newDataLen * 2; - } - if (newDataLen != oldLen) { - newDataLen = Math.min(newDataLen, maxLen); - if (newDataLen < targetDataSize) { - throw new IllegalStateException("A data buffer for strings is not supported over 2GB in size"); - } - HostMemoryBuffer newData = HostMemoryBuffer.allocate(newDataLen); - data = copyBuffer(newData, data); - } - } + /** + * Automatically grows the offsets buffer for list columns as needed and increments + * the row count. Call this method before appending any value or null.
+ */ + private void growListBuffersAndRows() { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; } - if (type.equals(DType.LIST) || type.equals(DType.STRING)) { - if (offsets == null) { - offsets = HostMemoryBuffer.allocate((estimatedRows + 1) * OFFSET_SIZE); - offsets.setInt(0, 0); - } else if ((rows +1) * OFFSET_SIZE > offsets.length) { - long newOffsetLen = offsets.length * 2; - HostMemoryBuffer newOffsets = HostMemoryBuffer.allocate(newOffsetLen); - offsets = copyBuffer(newOffsets, offsets); - } + } + + /** + * Automatically grows the offsets and data buffers for string columns as needed and + * increments the row count. Call this method before appending any value or null. + * + * @param stringLength number of bytes required by the next row + */ + private void growStringBuffersAndRows(int stringLength) { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + // Initialize data buffer with at least 1 byte in case the first appended value is null. + data = HostMemoryBuffer.allocate(Math.max(1, stringLength)); + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + return; } - if (hasNull || nullCount > 0) { - if (valid == null) { - long targetValidSize = ColumnView.getNativeValidPointerSize((int)estimatedRows); - valid = HostMemoryBuffer.allocate(targetValidSize); - valid.setMemory(0, targetValidSize, (byte) 0xFF); - } else if (valid.length < ColumnView.getNativeValidPointerSize((int)rows)) { - long newValidLen = valid.length * 2; - HostMemoryBuffer newValid = HostMemoryBuffer.allocate(newValidLen); - newValid.setMemory(0, newValidLen, (byte) 0xFF); - valid = copyBuffer(newValid, valid); - } + + if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; + } + + long currentLength = currentStringByteIndex + stringLength; + if (currentLength > data.length) { + long requiredLength = data.length; + do { + requiredLength = requiredLength * 2; + } while (currentLength > requiredLength); + data = copyBuffer(HostMemoryBuffer.allocate(requiredLength), data); + } + } + + /** + * For struct columns, we only need to update rows and rowCapacity (for the growth of the + * valid buffer), because struct columns hold no buffers themselves. + * Call this method before appending any value or null.
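All of the grow*BuffersAndRows methods above share one amortized-doubling policy: when the next row would exceed rowCapacity, the capacity doubles (capped just under Integer.MAX_VALUE) and the old buffer is copied into the new one; only the string data buffer loops, since a single long value can outgrow one doubling. A minimal sketch of that policy as a hypothetical helper (not cudf API):

    #include <algorithm>
    #include <cstddef>

    // Double `capacity` until it covers `needed`, never exceeding `cap`.
    // Mirrors `newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1)` and the
    // do/while loop in growStringBuffersAndRows; callers guarantee capacity >= 1.
    std::size_t grow_capacity(std::size_t capacity, std::size_t needed, std::size_t cap) {
      while (capacity < needed && capacity < cap) {
        capacity = std::min(capacity * 2, cap);
      }
      return capacity;  // caller must still check capacity >= needed if cap was hit
    }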
+ */ + private void growStructBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (rowCapacity == 0) { + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + rowCapacity = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); } } @@ -1015,29 +1111,13 @@ private HostMemoryBuffer copyBuffer(HostMemoryBuffer targetBuffer, HostMemoryBuf * Method that sets the null bit in the validity vector * @param index the row index at which the null is marked */ - private void setNullAt(int index) { + private void setNullAt(long index) { assert index < rows : "Index for null value should fit the column with " + rows + " rows"; nullCount += BitVectorHelper.setNullAt(valid, index); } public final ColumnBuilder appendNull() { - growBuffersAndRows(true, 0); - setNullAt(currentIndex); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); - if (type.hasOffsets()) { - if (type.equals(DType.LIST)) { - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); - } else { - // It is a String - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); - } - } else if (type.equals(DType.STRUCT)) { - // structs propagate nulls to children and even further down if needed - for (ColumnBuilder childBuilder : childBuilders) { - childBuilder.appendNull(); - } - } + nullHandler.run(); return this; } @@ -1081,7 +1161,7 @@ public ColumnBuilder endStruct() { assert type.equals(DType.STRUCT) : "This only works for structs"; assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " + childBuilders + " should all have the same currentIndex " + type; - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growStructBuffersAndRows(); currentIndex++; return this; } @@ -1095,9 +1175,8 @@ assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " */ public ColumnBuilder endList() { assert type.equals(DType.LIST); - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); - currentIndex++; - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); + growListBuffersAndRows(); + offsets.setInt(++currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); return this; } @@ -1155,80 +1234,67 @@ public void incrCurrentIndex() { } public int getCurrentIndex() { - return currentIndex; + return (int) currentIndex; } + @Deprecated public int getCurrentByteIndex() { - return currentByteIndex; + return currentStringByteIndex; } public final ColumnBuilder append(byte value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByByte(); assert currentIndex < rows; - data.setByte(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setByte(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(short value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByShort(); assert currentIndex < rows; - data.setShort(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setShort(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(int value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + 
type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByInt(); assert currentIndex < rows; - data.setInt(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setInt(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(long value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByLong(); assert currentIndex < rows; - data.setLong(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setLong(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(float value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT32); assert currentIndex < rows; - data.setFloat(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setFloat(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(double value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT64); assert currentIndex < rows; - data.setDouble(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setDouble(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(boolean value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.BOOL8); assert currentIndex < rows; - data.setBoolean(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setBoolean(currentIndex++ << bitShiftBySize, value); return this; } @@ -1237,22 +1303,19 @@ public ColumnBuilder append(BigDecimal value) { } public ColumnBuilder append(BigInteger unscaledVal) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert currentIndex < rows; if (type.typeId == DType.DTypeEnum.DECIMAL32) { - data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); + data.setInt(currentIndex++ << bitShiftBySize, unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { - data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); + data.setLong(currentIndex++ << bitShiftBySize, unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { - assert currentIndex < rows; byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); - data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); - } else { + data.setBytes(currentIndex++ << bitShiftBySize, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } - currentIndex++; - currentByteIndex += type.getSizeInBytes(); return this; } @@ -1271,14 +1334,13 @@ public ColumnBuilder appendUTF8String(byte[] value, int srcOffset, int length) { assert length >= 0; assert value.length + srcOffset <= length; assert type.equals(DType.STRING) : " type " + type + " is not 
String"; - currentIndex++; - growBuffersAndRows(false, length); - assert currentIndex < rows + 1; + growStringBuffersAndRows(length); + assert currentIndex < rows; if (length > 0) { - data.setBytes(currentByteIndex, value, srcOffset, length); + data.setBytes(currentStringByteIndex, value, srcOffset, length); } - currentByteIndex += length; - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); + currentStringByteIndex += length; + offsets.setInt(++currentIndex << bitShiftByOffset, currentStringByteIndex); return this; } @@ -1822,7 +1884,7 @@ public final Builder append(HostColumnVector columnVector) { } private void allocateBitmaskAndSetDefaultValues() { - long bitmaskSize = ColumnView.getNativeValidPointerSize((int) rows); + long bitmaskSize = ColumnView.getValidityBufferSize((int) rows); valid = HostMemoryBuffer.allocate(bitmaskSize); valid.setMemory(0, bitmaskSize, (byte) 0xFF); } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 317ef152492..a45716a89b3 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -395,6 +395,9 @@ template class native_jpointerArray { T **data() { return reinterpret_cast(wrapped.data()); } + T *const *begin() const { return data(); } + T *const *end() const { return data() + size(); } + const jlongArray get_jArray() const { return wrapped.get_jArray(); } jlongArray get_jArray() { return wrapped.get_jArray(); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 38c6bb3740e..63247eb0066 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -561,17 +561,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong column_view, - jlong delimiter) { + jlong delimiter_ptr, + jint max_split) { JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); + cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; + auto delimiter = reinterpret_cast(delimiter_ptr); - std::unique_ptr table_result = cudf::strings::split(scv, *ss_scalar); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return(env, + cudf::strings::split(scv, *delimiter, max_split)); } CATCH_STD(env, 0); } @@ -1409,13 +1409,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *en try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); + cudf::strings_column_view const strings_column{ + *reinterpret_cast(j_view_handle)}; cudf::jni::native_jstring pattern(env, patternObj); - std::unique_ptr table_result = - cudf::strings::extract(strings_column, pattern.get()); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return( + env, cudf::strings::extract(strings_column, pattern.get())); } CATCH_STD(env, 0); } @@ -1791,16 +1790,6 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidPointerSize(JNIEnv *env, - jobject j_object, - jint size) { - try { - cudf::jni::auto_set_device(env); - return static_cast(cudf::bitmask_allocation_size_bytes(size)); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 10f295e27bf..aeac1856db0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -599,37 +599,27 @@ class native_arrow_ipc_reader_handle final { void close() { source->Close(); } }; -/** - * Take a table returned by some operation and turn it into an array of column* so we can track them - * ourselves in java instead of having their life tied to the table. - * @param table_result the table to convert for return - * @param extra_columns columns not in the table that will be added to the result at the end. - */ -static jlongArray -convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, - std::vector> &extra_columns) { +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns) { std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - for (int i = 0; i < table_cols; i++) { - outcol_handles[i] = release_as_jlong(ret[i]); - } - for (size_t i = 0; i < extra_columns.size(); i++) { - outcol_handles[i + table_cols] = release_as_jlong(extra_columns[i]); - } + std::transform(ret.begin(), ret.end(), outcol_handles.begin(), + [](auto &col) { return release_as_jlong(col); }); + std::transform(extra_columns.begin(), extra_columns.end(), outcol_handles.begin() + table_cols, + [](auto &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result) { - std::vector> extra; - return convert_table_for_return(env, table_result, extra); +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns) { + return convert_table_for_return(env, std::move(table_result), std::move(extra_columns)); } jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &first_table, std::unique_ptr &second_table) { - std::vector> second_tmp = second_table->release(); - return convert_table_for_return(env, first_table, second_tmp); + return convert_table_for_return(env, first_table, second_table->release()); } // Convert the JNI boolean array of key column sort order to a vector of cudf::order @@ -1068,6 +1058,7 @@ cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { } // namespace jni } // namespace cudf +using cudf::jni::convert_table_for_return; using cudf::jni::ptr_as_jlong; using cudf::jni::release_as_jlong; @@ -1223,9 +1214,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv *env, jcla std::vector sort_keys = n_sort_keys_columns.get_dereferenced(); auto sorted_col = cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order); - cudf::table_view *input_table = reinterpret_cast(j_input_table); - std::unique_ptr result = cudf::gather(*input_table, sorted_col->view()); - return 
cudf::jni::convert_table_for_return(env, result); + auto const input_table = reinterpret_cast(j_input_table); + return convert_table_for_return(env, cudf::gather(*input_table, sorted_col->view())); } CATCH_STD(env, NULL); } @@ -1267,8 +1257,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER); std::vector tables = n_table_handles.get_dereferenced(); - std::unique_ptr result = cudf::merge(tables, indexes, order, null_order); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::merge(tables, indexes, order, null_order)); } CATCH_STD(env, NULL); } @@ -1344,8 +1333,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .comment(comment) .build(); - cudf::io::table_with_metadata result = cudf::io::read_csv(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); } CATCH_STD(env, NULL); } @@ -1425,7 +1413,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( // there is no need to re-order columns when inferring schema if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { // json reader will not return the correct column order, // so we need to re-order the column of table according to table meta. @@ -1453,11 +1441,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( if (!match) { // can't find some input column names in table meta, return what json reader reads. - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { auto tbv = result.tbl->view().select(std::move(indices)); auto table = std::make_unique(tbv); - return cudf::jni::convert_table_for_return(env, table); + return convert_table_for_return(env, table); } } } @@ -1501,8 +1489,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, .convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); } CATCH_STD(env, NULL); } @@ -1672,8 +1659,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( .timestamp_type(cudf::data_type(static_cast(unit))) .decimal128_columns(n_dec128_col_names.as_cpp_vector()) .build(); - cudf::io::table_with_metadata result = cudf::io::read_orc(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); } CATCH_STD(env, NULL); } @@ -1956,8 +1942,7 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arr try { cudf::jni::auto_set_device(env); - std::unique_ptr result = cudf::from_arrow(*(handle->get())); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::from_arrow(*(handle->get()))); } CATCH_STD(env, 0) } @@ -2142,7 +2127,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2171,7 +2156,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2706,12 +2691,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_left_table = reinterpret_cast(left_table); - cudf::table_view *n_right_table = reinterpret_cast(right_table); - - std::unique_ptr result = cudf::cross_join(*n_left_table, *n_right_table); - - return cudf::jni::convert_table_for_return(env, result); + auto const left = reinterpret_cast(left_table); + auto const right = reinterpret_cast(right_table); + return convert_table_for_return(env, cudf::cross_join(*left, *right)); } CATCH_STD(env, NULL); } @@ -2734,18 +2716,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, try { cudf::jni::auto_set_device(env); cudf::jni::native_jpointerArray tables(env, table_handles); - - int num_tables = tables.size(); - // There are some issues with table_view and std::vector. We cannot give the - // vector a size or it will not compile. - std::vector to_concat; - to_concat.reserve(num_tables); - for (int i = 0; i < num_tables; i++) { - JNI_NULL_CHECK(env, tables[i], "input table included a null", NULL); - to_concat.push_back(*tables[i]); - } - std::unique_ptr table_result = cudf::concatenate(to_concat); - return cudf::jni::convert_table_for_return(env, table_result); + std::vector const to_concat = tables.get_dereferenced(); + return convert_table_for_return(env, cudf::concatenate(to_concat)); } CATCH_STD(env, NULL); } @@ -2763,20 +2735,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); - cudf::column_view *n_part_column = reinterpret_cast(partition_column); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + auto const n_input_table = reinterpret_cast(input_table); + auto const n_part_column = reinterpret_cast(partition_column); - auto result = cudf::partition(*n_input_table, *n_part_column, number_of_partitions); + auto [partitioned_table, partition_offsets] = + cudf::partition(*n_input_table, *n_part_column, number_of_partitions); - for (size_t i = 0; i < result.second.size() - 1; i++) { - // for what ever reason partition returns the length of the result at then - // end and hash partition/round robin do not, so skip the last entry for - // consistency - n_output_offsets[i] = result.second[i]; - } + // For whatever reason, partition returns the length of the result at the + // end while hash partition/round robin do not, so skip the last entry for + // consistency. + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end() - 1, n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2792,26 +2763,21 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition( try { cudf::jni::auto_set_device(env); 
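The Table_partition refactor above swaps pair.first/.second access for structured bindings, which names both results at the call site. The core pattern in isolation, as a sketch with assumed parameters (`input`, `parts`, `num_partitions` are placeholders, not names from the patch):

    #include <cudf/partitioning.hpp>

    // `parts` is an INT32 column mapping each row of `input` to a partition.
    void partition_example(cudf::table_view const& input, cudf::column_view const& parts,
                           cudf::size_type num_partitions) {
      // cudf::partition returns std::pair<std::unique_ptr<cudf::table>,
      // std::vector<cudf::size_type>>; structured bindings unpack both pieces.
      auto [partitioned_table, partition_offsets] =
          cudf::partition(input, parts, num_partitions);
      // As the comment in the patch notes, the offsets vector carries a trailing
      // total-length entry, hence copying only begin() .. end() - 1 to the caller.
    }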
- cudf::hash_id hash_func = static_cast(hash_function); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const hash_func = static_cast(hash_function); + auto const n_input_table = reinterpret_cast(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - JNI_ARG_CHECK(env, n_columns_to_hash.size() > 0, "columns_to_hash is zero", NULL); - std::vector columns_to_hash_vec(n_columns_to_hash.size()); - for (int i = 0; i < n_columns_to_hash.size(); i++) { - columns_to_hash_vec[i] = n_columns_to_hash[i]; - } + std::vector columns_to_hash_vec(n_columns_to_hash.begin(), + n_columns_to_hash.end()); - std::pair, std::vector> result = + auto [partitioned_table, partition_offsets] = cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2827,15 +2793,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition( try { cudf::jni::auto_set_device(env); auto n_input_table = reinterpret_cast(input_table); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - auto result = cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); + auto [partitioned_table, partition_offsets] = + cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2905,7 +2870,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -2975,7 +2940,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -3020,10 +2985,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls( std::vector policies = n_is_preceding.transform_if_else( cudf::replace_policy::PRECEDING, cudf::replace_policy::FOLLOWING); - std::pair, std::unique_ptr> result = - grouper.replace_nulls(n_replace_table, policies); - - return cudf::jni::convert_table_for_return(env, result.first, result.second); + auto [keys, results] = grouper.replace_nulls(n_replace_table, policies); + return convert_table_for_return(env, keys, results); } CATCH_STD(env, NULL); } @@ -3034,10 +2997,9 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas JNI_NULL_CHECK(env, mask_jcol, "mask column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *mask = reinterpret_cast(mask_jcol); - std::unique_ptr result = cudf::apply_boolean_mask(*input, *mask); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const mask = reinterpret_cast(mask_jcol); + return convert_table_for_return(env, cudf::apply_boolean_mask(*input, *mask)); } CATCH_STD(env, 0); } @@ -3063,7 +3025,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER, rmm::mr::get_current_device_resource()); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3074,12 +3036,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_map, "map column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); - cudf::column_view *map = reinterpret_cast(j_map); + auto const input = reinterpret_cast(j_input); + auto const map = reinterpret_cast(j_map); auto bounds_policy = check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK; - std::unique_ptr result = cudf::gather(*input, *map, bounds_policy); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::gather(*input, *map, bounds_policy)); } CATCH_STD(env, 0); } @@ -3090,7 +3051,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); @@ -3114,8 +3075,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, auto const input = reinterpret_cast(j_input); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(*input, *map, *target, check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(*input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3131,13 +3091,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en cudf::jni::auto_set_device(env); auto const scalars_array = cudf::jni::native_jpointerArray(env, j_input); std::vector> input; - for (int i = 0; i < scalars_array.size(); ++i) { - input.emplace_back(*scalars_array[i]); - } + std::transform(scalars_array.begin(), scalars_array.end(), std::back_inserter(input), + [](auto &scalar) { return std::ref(*scalar); }); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(input, *map, *target, check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3148,7 +3106,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); - 
cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); @@ -3166,8 +3124,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { @@ -3179,7 +3136,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3193,8 +3150,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { @@ -3205,7 +3161,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3216,9 +3172,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv JNI_NULL_CHECK(env, input_jtable, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - std::unique_ptr result = cudf::repeat(*input, count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + return convert_table_for_return(env, cudf::repeat(*input, count)); } CATCH_STD(env, 0); } @@ -3231,10 +3186,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatColumnCount(JNIEnv JNI_NULL_CHECK(env, count_jcol, "count column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *count = reinterpret_cast(count_jcol); - std::unique_ptr result = cudf::repeat(*input, *count, check_count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const count = reinterpret_cast(count_jcol); + return convert_table_for_return(env, cudf::repeat(*input, *count, check_count)); } CATCH_STD(env, 0); } @@ -3351,7 +3305,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return 
convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3444,7 +3398,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3455,10 +3409,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jcla JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3469,10 +3422,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3483,10 +3435,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3497,10 +3448,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIE JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3509,8 +3459,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_table, "table is null", 0); try { cudf::jni::auto_set_device(env); - auto t = reinterpret_cast(j_table); - return release_as_jlong(cudf::row_bit_count(*t)); + auto const input_table = reinterpret_cast(j_table); + return 
release_as_jlong(cudf::row_bit_count(*input_table)); } CATCH_STD(env, 0); } @@ -3528,7 +3478,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( try { cudf::jni::auto_set_device(env); cudf::jni::native_jintArray n_key_indices(env, jkey_indices); - cudf::table_view *input_table = reinterpret_cast(jinput_table); + auto const input_table = reinterpret_cast(jinput_table); // Prepares arguments for the groupby: // (keys, null_handling, keys_are_sorted, column_order, null_precedence) @@ -3622,11 +3572,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_input, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); + auto const input = reinterpret_cast(j_input); auto sample_with_replacement = replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; - std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::sample(*input, n, sample_with_replacement, seed)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index fbcca0c82ee..12fd45b831a 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -23,7 +23,28 @@ namespace cudf { namespace jni { -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result); +/** + * @brief Detach all columns from the specified table, and pointers to them as an array. + * + * This function takes a table (presumably returned by some operation), and turns it into an + * array of column* (as jlongs). + * The lifetime of the columns is decoupled from that of the table, and is managed by the caller. + * + * @param env The JNI environment + * @param table_result the table to convert for return + * @param extra_columns columns not in the table that will be appended to the result. + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns = {}); + +/** + * @copydoc convert_table_for_return(JNIEnv*, std::unique_ptr&, + * std::vector>&&) + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns = {}); // // ContiguousTable APIs diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index a26dbec4907..7b476c31b95 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -39,21 +40,34 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertEquals(byteColumnVector.getByte(0), 2); - assertEquals(byteColumnVector.getByte(1), 3); - assertEquals(byteColumnVector.getByte(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getByte(0), 2); + assertEquals(cv.getByte(1), 3); + assertEquals(cv.getByte(2), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { - assertFalse(v.hasNulls()); - assertEquals(0xff, Byte.toUnsignedInt(v.getByte(0)), 0xff); - assertEquals(128, Byte.toUnsignedInt(v.getByte(1)), 128); - assertEquals(5, Byte.toUnsignedInt(v.getByte(2)), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xff, Byte.toUnsignedInt(cv.getByte(0)), 0xff); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(1)), 128); + assertEquals(5, Byte.toUnsignedInt(cv.getByte(2)), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(false, + new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); } } @@ -70,47 +84,73 @@ public void testAppendRepeatingValues() { @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(3)); - assertFalse(byteColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getByte(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getByte(-1)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedBytes( - new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i 
= 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedBytes( + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(true, + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedUnsignedBytes( - new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertEquals(128, Byte.toUnsignedInt(byteColumnVector.getByte(4))); - assertEquals(254, Byte.toUnsignedInt(byteColumnVector.getByte(5))); - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(4))); + assertEquals(254, Byte.toUnsignedInt(cv.getByte(5))); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedUnsignedBytes( + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(false, + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java new file mode 100644 index 00000000000..263244b2413 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * ColumnBuilderHelper helps to test ColumnBuilder with existed ColumnVector tests. 
+ */ +public class ColumnBuilderHelper { + + public static HostColumnVector build( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.build(); + } + } + + public static ColumnVector buildOnDevice( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.buildAndPutOnDevice(); + } + } + + public static HostColumnVector fromBoxedBytes(boolean signed, Byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Byte v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedDoubles(Double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.FLOAT64), + values.length, + (b) -> { + for (Double v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedInts(boolean signed, Integer... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Integer v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedLongs(boolean signed, Long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Long v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBytes(boolean signed, byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (byte v : values) b.append(v); + }); + } + + public static HostColumnVector fromDecimals(BigDecimal... values) { + // Simply copy from HostColumnVector.fromDecimals + BigDecimal maxDec = Arrays.stream(values).filter(Objects::nonNull) + .max(Comparator.comparingInt(BigDecimal::precision)) + .orElse(BigDecimal.ZERO); + int maxScale = Arrays.stream(values).filter(Objects::nonNull) + .map(decimal -> decimal.scale()) + .max(Comparator.naturalOrder()) + .orElse(0); + maxDec = maxDec.setScale(maxScale, RoundingMode.UNNECESSARY); + + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.fromJavaBigDecimal(maxDec)), + values.length, + (b) -> { + for (BigDecimal v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromDoubles(double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, DType.FLOAT64), + values.length, + (b) -> { + for (double v : values) b.append(v); + }); + } + + public static HostColumnVector fromInts(boolean signed, int... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (int v : values) b.append(v); + }); + } + + public static HostColumnVector fromLongs(boolean signed, long... values) { + DType dt = signed ? 
DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (long v : values) b.append(v); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 8d4bbff1542..8f39c3c51ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3559,7 +3559,6 @@ void testCastDecimal64ToString() { for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { for (int i = 0; i < strDecimalValues.length; i++) { strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); - System.out.println(strDecimalValues[i]); } testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), @@ -4703,13 +4702,21 @@ void testStringSplitRecord() { @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", null, "", "ARé some", "test strings"); - Table expected = new Table.TestBuilder().column("Héllo", "thésé", null, "", "ARé", "test") + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); + Table expectedSplitOnce = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some things", "strings here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") .build(); Scalar pattern = Scalar.fromString(" "); - Table result = v.stringSplit(pattern)) { - assertTablesAreEqual(expected, result); + Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitAll = v.stringSplit(pattern)) { + assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index c2772520f57..994066c5df0 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,10 +22,12 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.Objects; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -33,9 +35,11 @@ public class DecimalColumnVectorTest extends CudfTestBase { private static final Random rdSeed = new Random(1234); private static final int dec32Scale = 4; private static final int dec64Scale = 10; + private static final int dec128Scale = 30; private static final BigDecimal[] decimal32Zoo = new BigDecimal[20]; private static final BigDecimal[] decimal64Zoo = new BigDecimal[20]; + private static final BigDecimal[] decimal128Zoo = new BigDecimal[20]; private static final int[] unscaledDec32Zoo = new int[decimal32Zoo.length]; private static final long[] unscaledDec64Zoo = new long[decimal64Zoo.length]; @@ -45,6 +49,9 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] boundaryDecimal64 = new BigDecimal[]{ new BigDecimal("999999999999999999"), new BigDecimal("-999999999999999999")}; + private final BigDecimal[] boundaryDecimal128 = new BigDecimal[]{ + new BigDecimal("99999999999999999999999999999999999999"), new BigDecimal("-99999999999999999999999999999999999999")}; + private final BigDecimal[] overflowDecimal32 = new BigDecimal[]{ BigDecimal.valueOf(Integer.MAX_VALUE), BigDecimal.valueOf(Integer.MIN_VALUE)}; @@ -72,6 +79,12 @@ public static void setup() { } else { decimal64Zoo[i] = null; } + if (rdSeed.nextBoolean()) { + BigInteger unscaledVal = BigInteger.valueOf(rdSeed.nextLong()).multiply(BigInteger.valueOf(rdSeed.nextLong())); + decimal128Zoo[i] = new BigDecimal(unscaledVal, dec128Scale); + } else { + decimal128Zoo[i] = null; + } } } @@ -190,27 +203,44 @@ public void testDecimalGeneral() { @Test public void testDecimalFromDecimals() { - DecimalColumnVectorTest.testDecimalImpl(false, dec32Scale, decimal32Zoo); - DecimalColumnVectorTest.testDecimalImpl(true, dec64Scale, decimal64Zoo); - DecimalColumnVectorTest.testDecimalImpl(false, 0, boundaryDecimal32); - DecimalColumnVectorTest.testDecimalImpl(true, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, dec32Scale, decimal32Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, dec64Scale, decimal64Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, dec128Scale, decimal128Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, 0, boundaryDecimal32); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, 0, boundaryDecimal128); } - private static void testDecimalImpl(boolean isInt64, int scale, BigDecimal[] decimalZoo) { - try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { - try (HostColumnVector hcv = cv.copyToHost()) { - assertEquals(-scale, hcv.getType().getScale()); - assertEquals(isInt64, hcv.getType().typeId == DType.DTypeEnum.DECIMAL64); - assertEquals(decimalZoo.length, hcv.rows); - for (int i = 0; i < decimalZoo.length; i++) { - assertEquals(decimalZoo[i] == null, hcv.isNull(i)); - if (decimalZoo[i] != null) { - assertEquals(decimalZoo[i].floatValue(), hcv.getBigDecimal(i).floatValue()); - long backValue = isInt64 ? 
hcv.getLong(i) : hcv.getInt(i); - assertEquals(decimalZoo[i].setScale(scale, RoundingMode.UNNECESSARY), BigDecimal.valueOf(backValue, scale)); + private static void testDecimalImpl(DType.DTypeEnum decimalType, int scale, BigDecimal[] decimalZoo) { + Consumer assertions = (hcv) -> { + assertEquals(-scale, hcv.getType().getScale()); + assertEquals(hcv.getType().typeId, decimalType); + assertEquals(decimalZoo.length, hcv.rows); + for (int i = 0; i < decimalZoo.length; i++) { + assertEquals(decimalZoo[i] == null, hcv.isNull(i)); + if (decimalZoo[i] != null) { + BigDecimal actual; + switch (decimalType) { + case DECIMAL32: + actual = BigDecimal.valueOf(hcv.getInt(i), scale); + break; + case DECIMAL64: + actual = BigDecimal.valueOf(hcv.getLong(i), scale); + break; + default: + actual = hcv.getBigDecimal(i); } + assertEquals(decimalZoo[i].subtract(actual).longValueExact(), 0L); } } + }; + try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { + try (HostColumnVector hcv = cv.copyToHost()) { + assertions.accept(hcv); + } + } + try (HostColumnVector hcv = ColumnBuilderHelper.fromDecimals(decimalZoo)) { + assertions.accept(hcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java index d82565e1d2d..fa34429685e 100644 --- a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -40,34 +41,51 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(0), 2.1, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(1), 3.02, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(2), 5.003, 0.001); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEqualsWithinPercentage(cv.getDouble(0), 2.1, 0.01); + assertEqualsWithinPercentage(cv.getDouble(1), 3.02, 0.01); + assertEqualsWithinPercentage(cv.getDouble(2), 5.003, 0.001); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(3)); - assertFalse(doubleColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getDouble(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void 
testLowerIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getDouble(-1)); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = - HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -75,6 +93,14 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector dcv = + HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromBoxedDoubles( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index 2fb8164534b..7d6311fb24c 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -34,47 +35,75 @@ public void testCreateColumnVectorBuilder() { try (ColumnVector intColumnVector = ColumnVector.build(DType.INT32, 3, (b) -> b.append(1))) { assertFalse(intColumnVector.hasNulls()); } + try (ColumnVector intColumnVector = ColumnBuilderHelper.buildOnDevice( + new HostColumnVector.BasicType(true, DType.INT32), 3, (b) -> b.append(1))) { + assertFalse(intColumnVector.hasNulls()); + } } @Test public void testArrayAllocation() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertEquals(intColumnVector.getInt(0), 2); - assertEquals(intColumnVector.getInt(1), 3); - assertEquals(intColumnVector.getInt(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getInt(0), 2); + assertEquals(cv.getInt(1), 3); + assertEquals(cv.getInt(2), 5); + }; + try (HostColumnVector cv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(cv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { - assertFalse(v.hasNulls()); - assertEquals(0xfedcba98L, Integer.toUnsignedLong(v.getInt(0))); - assertEquals(0x80000000L, Integer.toUnsignedLong(v.getInt(1))); - assertEquals(5, Integer.toUnsignedLong(v.getInt(2))); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xfedcba98L, Integer.toUnsignedLong(cv.getInt(0))); + assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(1))); + assertEquals(5, Integer.toUnsignedLong(cv.getInt(2))); + }; + try (HostColumnVector cv = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(false, 0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertThrows(AssertionError.class, () -> intColumnVector.getInt(3)); - assertFalse(intColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getInt(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> intColumnVector.getInt(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getInt(-1)); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) 
{ @@ -82,13 +111,18 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(true, 2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( - 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -98,6 +132,14 @@ public void testAddingUnsignedNullValues() { assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(false, + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); } } diff --git a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java index 43c2b5a99c2..193992f5304 100644 --- a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -38,46 +39,71 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertEquals(longColumnVector.getLong(0), 2); - assertEquals(longColumnVector.getLong(1), 3); - assertEquals(longColumnVector.getLong(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getLong(0), 2); + assertEquals(cv.getLong(1), 3); + assertEquals(cv.getLong(2), 5); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true,2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromUnsignedLongs( - 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); assertEquals(Long.toUnsignedString(0xfedcba9876543210L), - Long.toUnsignedString(longColumnVector.getLong(0))); + Long.toUnsignedString(cv.getLong(0))); assertEquals(Long.toUnsignedString(0x8000000000000000L), - Long.toUnsignedString(longColumnVector.getLong(1))); - assertEquals(5L, longColumnVector.getLong(2)); + Long.toUnsignedString(cv.getLong(1))); + assertEquals(5L, cv.getLong(2)); + }; + try (HostColumnVector lcv = HostColumnVector.fromUnsignedLongs( + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = 
ColumnBuilderHelper.fromLongs(false, + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertThrows(AssertionError.class, () -> longColumnVector.getLong(3)); - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getLong(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> longColumnVector.getLong(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getLong(-1)); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -85,13 +111,19 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(true, + 2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedLongs( - 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -103,6 +135,14 @@ public void testAddingUnsignedNullValues() { Long.toUnsignedString(cv.getLong(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedUnsignedLongs( + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(false, + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); } } diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index e5a8bb926c1..0c2f971a26c 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -37,7 +37,6 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -59,7 +58,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git 
a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/findall.pxd b/python/cudf/cudf/_lib/cpp/strings/findall.pxd index 189d0770b81..5533467d72a 100644 --- a/python/cudf/cudf/_lib/cpp/strings/findall.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/findall.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,6 +10,10 @@ from cudf._lib.cpp.table.table cimport table cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: - cdef unique_ptr[table] findall_re( - column_view source_strings, - string pattern) except + + cdef unique_ptr[table] findall( + const column_view& source_strings, + const string& pattern) except + + + cdef unique_ptr[column] findall_record( + const column_view& source_strings, + const string& pattern) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -190,7 +190,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 80af18e7c71..b17988018a6 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,7 +8,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re +from cudf._lib.cpp.strings.findall cimport ( + findall as cpp_findall, + findall_record as cpp_findall_record, +) from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr @@ -25,7 +28,7 @@ def findall(Column source_strings, pattern): cdef string pattern_string = str(pattern).encode() with nogil: - c_result = move(cpp_findall_re( + c_result = move(cpp_findall( source_view, pattern_string )) @@ -34,3 +37,22 @@ def findall(Column source_strings, pattern): move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) + + +def findall_record(Column source_strings, pattern): + """ + Returns data with all non-overlapping matches of `pattern` + in each string of `source_strings` as a lists column. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_findall_record( + source_view, + pattern_string + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b1335c7c076..6569184e90b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -569,17 +569,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - warnings.warn( - "The gpu_values property is deprecated and will be removed.", - FutureWarning, - ) - return self._values.data_array_view - def append(self, other): """ Append a collection of Index options together. @@ -1254,10 +1243,6 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) - # TODO: This method is deprecated and can be removed. - def to_array(self, fillna=None): - return self._values.to_array(fillna=fillna) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1536,14 +1521,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): "`allow_fill` and `fill_value` are unsupported." ) - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _apply_boolean_mask(self, boolean_mask): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de06e62cbb1..24f9dc83ca9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -46,6 +46,9 @@ ) +_DEFAULT_CATEGORICAL_VALUE = -1 + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. 
@@ -946,7 +949,11 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.codes.astype(signed_dtype).fillna(-1).to_array() + codes = ( + col.codes.astype(signed_dtype) + .fillna(_DEFAULT_CATEGORICAL_VALUE) + .values_host + ) if is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. @@ -1015,13 +1022,10 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self._default_na_value(): + if value == _DEFAULT_CATEGORICAL_VALUE: return None return self.categories.element_indexing(value) - def _default_na_value(self) -> ScalarLike: - return -1 - def find_and_replace( self, to_replace: ColumnLike, @@ -1178,7 +1182,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self._default_na_value(): + if fill_value == _DEFAULT_CATEGORICAL_VALUE: fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1582,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column._default_na_value(), + _DEFAULT_CATEGORICAL_VALUE, categorical_column.size, categorical_column.codes.dtype, ) @@ -1601,7 +1605,7 @@ def pandas_categorical_as_column( codes = categorical.codes if codes is None else codes codes = column.as_column(codes) - valid_codes = codes != codes.dtype.type(-1) + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) mask = None if not valid_codes.all(): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7999fa9039b..82641d83b07 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -314,51 +314,6 @@ def memory_usage(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def _default_na_value(self) -> Any: - raise NotImplementedError() - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - if fillna: - return self.fillna(self._default_na_value()).data_array_view - else: - return self.dropna(drop_nan=False).data_array_view - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_array(self, fillna=None) -> np.ndarray: - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - - return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill( self, fill_value: ScalarLike, @@ -1031,7 +986,7 @@ def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed. 
To explicitly construct a host array, consider using " - ".to_array()" + ".to_numpy()" ) @property @@ -1316,6 +1271,12 @@ def column_empty( column_empty(row_count, field_dtype) for field_dtype in dtype.fields.values() ) + elif is_list_dtype(dtype): + data = None + children = ( + full(row_count + 1, 0, dtype="int32"), + column_empty(row_count, dtype=dtype.element_type), + ) elif is_categorical_dtype(dtype): data = None children = ( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b763790986a..c72fb66addc 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -199,7 +199,7 @@ def to_pandas( # Pandas supports only `datetime64[ns]`, hence the cast. return pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), + self.astype("datetime64[ns]").fillna("NaT").values_host, copy=False, index=index, ) @@ -346,10 +346,6 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column""" - return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), @@ -488,15 +484,6 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False -def binop_offset(lhs, rhs, op): - if rhs._is_no_op: - return lhs - else: - rhs = rhs._generate_column(len(lhs), op) - out = libcudf.datetime.add_months(lhs, rhs) - return out - - def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a7481ce62a3..9b54c4d9acd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -355,20 +355,6 @@ def _process_for_reduction( skipna=skipna, min_count=min_count ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - dkind = self.dtype.kind - if dkind == "f": - return self.dtype.type(np.nan) - elif dkind == "i": - return np.iinfo(self.dtype).min - elif dkind == "u": - return np.iinfo(self.dtype).max - elif dkind == "b": - return self.dtype.type(False) - else: - raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b44b4e6831..6467fd39ddd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5218,26 +5218,6 @@ def values(self) -> cupy.ndarray: """ raise TypeError("String Arrays is not yet implemented in cudf") - # TODO: This method is deprecated and should be removed when the associated - # Frame methods are removed. - def to_array(self, fillna: bool = None) -> np.ndarray: - """Get a dense numpy array for the data. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. 
- - Raises - ------ - ``NotImplementedError`` if there are nulls - """ - if fillna is not None: - warnings.warn("fillna parameter not supported for string arrays") - - return self.to_arrow().to_pandas().values - def to_pandas( self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": @@ -5402,9 +5382,6 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def _default_na_value(self) -> ScalarLike: - return None - def binary_operator( self, op: builtins.str, rhs, reflect: bool = False ) -> "column.ColumnBase": diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4b7a3bcc197..6c8c904e13c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,13 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - BinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) +from cudf._typing import BinaryOperand, DatetimeLikeScalar, Dtype, DtypeObj from cudf.api.types import is_scalar from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string @@ -123,7 +117,8 @@ def to_pandas( # Pandas supports only `timedelta64[ns]`, hence the cast. pd_series = pd.Series( - self.astype("timedelta64[ns]").to_array("NAT"), copy=False + self.astype("timedelta64[ns]").fillna("NaT").values_host, + copy=False, ) if index is not None: @@ -304,10 +299,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - return np.timedelta64("nat", self.time_unit) - @property def time_unit(self) -> str: return self._time_unit diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c2ea9d756f7..67976ac27d4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from __future__ import annotations @@ -523,14 +523,19 @@ def rename_column(x): raise IndexError( f"Too many levels: Index has only 1 level, not {level+1}" ) + if isinstance(mapper, Mapping): - new_names = ( + new_col_names = [ mapper.get(col_name, col_name) for col_name in self.keys() - ) + ] else: - new_names = (mapper(col_name) for col_name in self.keys()) + new_col_names = [mapper(col_name) for col_name in self.keys()] + + if len(new_col_names) != len(set(new_col_names)): + raise ValueError("Duplicate column names are not allowed") + ca = ColumnAccessor( - dict(zip(new_names, self.values())), + dict(zip(new_col_names, self.values())), level_names=self.level_names, multiindex=self.multiindex, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e973468a322..0035a979760 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from nvtx import annotate from pandas._config import get_option from pandas.core.dtypes.common import is_float, is_integer @@ -64,6 +63,7 @@ from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series +from cudf.core.udf.row_function import _get_row_kernel from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -269,7 +269,9 @@ def _getitem_tuple_arg(self, arg): else: row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): - df.index = self._frame.index.take(row_selection) + df.index = self._frame.index._apply_boolean_mask( + row_selection + ) else: df.index = as_index(row_selection) # Step 4: Downcast @@ -3096,124 +3098,6 @@ def add_suffix(self, suffix): ] return out - def as_gpu_matrix(self, columns=None, order="F"): - warnings.warn( - "The as_gpu_matrix method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - if columns is None: - columns = self._data.names - - cols = [self._data[k] for k in columns] - ncol = len(cols) - nrow = len(self) - if ncol < 1: - # This is the case for empty dataframe - construct empty cupy array - matrix = cupy.empty( - shape=(0, 0), dtype=cudf.dtype("float64"), order=order - ) - return cuda.as_cuda_array(matrix) - - if any( - (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) - for c in cols - ): - raise TypeError("non-numeric data not yet supported") - - dtype = find_common_type([col.dtype for col in cols]) - for k, c in self._data.items(): - if c.has_nulls(): - raise ValueError( - f"column '{k}' has null values. " - f"hint: use .fillna() to replace null values" - ) - cupy_dtype = dtype - if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = cudf.dtype("int64") - - if order not in ("F", "C"): - raise ValueError( - "order parameter should be 'C' for row major or 'F' for" - "column major GPU matrix" - ) - - matrix = cupy.empty(shape=(nrow, ncol), dtype=cupy_dtype, order=order) - for colidx, inpcol in enumerate(cols): - dense = inpcol.astype(cupy_dtype) - matrix[:, colidx] = cupy.asarray(dense) - return cuda.as_cuda_array(matrix).view(dtype) - - def as_matrix(self, columns=None): - warnings.warn( - "The as_matrix method will be removed in a future cuDF " - "release. 
Consider using `to_numpy` instead.", - FutureWarning, - ) - return self.as_gpu_matrix(columns=columns).copy_to_host() - - def label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - """Encode labels in a column with label encoding. - - Parameters - ---------- - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; see Series.label_encoding - na_sentinel : number - Value to indicate missing category. - - Returns - ------- - A new DataFrame with a new column appended for the coded values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]}) - >>> df - a b - 0 1 10 - 1 2 10 - 2 3 20 - >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20]) - a b b_col_labels - 0 1 10 0 - 1 2 10 0 - 2 3 20 1 - """ - - warnings.warn( - "DataFrame.label_encoding is deprecated and will be removed in " - "the future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding( - column, prefix, cats, prefix_sep, dtype, na_sentinel - ) - - def _label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - # Private implementation of deprecated public label_encoding method - newname = prefix_sep.join([prefix, "labels"]) - newcol = self[column]._label_encoding( - cats=cats, dtype=dtype, na_sentinel=na_sentinel - ) - outdf = self.copy() - outdf.insert(len(outdf._data), newname, newcol) - return outdf - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -4001,10 +3885,8 @@ def apply( raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: raise ValueError("The `result_type` kwarg is not yet supported.") - if kwargs: - raise ValueError("UDFs using **kwargs are not yet supported.") - return self._apply(func, *args) + return self._apply(func, _get_row_kernel, *args, **kwargs) @applyutils.doc_apply() def apply_rows( @@ -5611,7 +5493,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) .fillna(np.nan) ) - arr = cupy.asarray(prepared.as_gpu_matrix()) + arr = prepared.to_cupy() if skipna is not False and method in _cupy_nan_methods_map: method = _cupy_nan_methods_map[method] @@ -6220,6 +6102,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. 
+ + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError("axis parameter is not supported yet.") + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 69dc5389e7a..7eabc39aa4b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import copy import pickle import warnings @@ -45,7 +46,6 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi -from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -1367,39 +1367,6 @@ def _quantiles( result._copy_type_metadata(self) return result - @annotate("APPLY", color="purple", domain="cudf_python") - def _apply(self, func, *args): - """ - Apply `func` across the rows of the frame. - """ - kernel, retty = compile_or_get(self, func, args) - - # Mask and data column preallocated - ans_col = cupy.empty(len(self), dtype=retty) - ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") - launch_args = [(ans_col, ans_mask), len(self)] - offsets = [] - - # if compile_or_get succeeds, it is safe to create a kernel that only - # consumes the columns that are of supported dtype - for col in supported_cols_from_frame(self).values(): - data = col.data - mask = col.mask - if mask is None: - launch_args.append(data) - else: - launch_args.append((data, mask)) - offsets.append(col.offset) - launch_args += offsets - launch_args += list(args) - kernel.forall(len(self))(*launch_args) - - col = as_column(ans_col) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self._index) - - return result - def rank( self, axis=0, @@ -6016,12 +5983,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.eq(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.eq(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True False False 2 True True False False @@ -6091,12 +6058,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ne(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.ne(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False True True 2 False False True True @@ -6166,12 +6133,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.lt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.lt(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False False True 2 False False False True @@ -6241,12 +6208,12 @@ def le(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... 
) >>> left.le(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.le(right, fill_value=7) - a b c d + a b c d 0 True True True True 1 True True False True 2 True True False True @@ -6316,12 +6283,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.gt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.gt(right, fill_value=7) - a b c d + a b c d 0 False False False False 1 False False True False 2 False False True False @@ -6391,12 +6358,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ge(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.ge(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True True False 2 True True True False @@ -6436,6 +6403,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a per column mapping with counts of unique values for + each column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Name and unique value counts of each column in frame. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 91c7a740699..fc59d15e264 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,6 +682,7 @@ def _intersection(self, other, sort=False): return new_index def _gather(self, gather_map, nullify=False, check_bounds=True): + gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @@ -771,23 +772,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - @classmethod - def deserialize(cls, header, frames): - if "index_column" in header: - warnings.warn( - "Index objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. 
Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("index_column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - - return super().deserialize(header, frames) - def _binaryop( self, other: T, @@ -2508,7 +2492,7 @@ def to_pandas(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self._values.to_array()}," + f"{self.__class__.__name__}({self._values.values_host}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9f2de1cb1c..8ecab2c7c65 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,11 +24,12 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange +from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex +from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame from cudf.utils.utils import cached_property doc_reset_index_template = """ @@ -756,6 +757,51 @@ def add_suffix(self, suffix): Use `Series.add_suffix` or `DataFrame.add_suffix`" ) + @annotate("APPLY", color="purple", domain="cudf_python") + def _apply(self, func, kernel_getter, *args, **kwargs): + """Apply `func` across the rows of the frame.""" + if kwargs: + raise ValueError("UDFs using **kwargs are not yet supported.") + + try: + kernel, retty = _compile_or_get( + self, func, args, kernel_getter=kernel_getter + ) + except Exception as e: + raise ValueError( + "user defined function compilation failed." + ) from e + + # Mask and data column preallocated + ans_col = cp.empty(len(self), dtype=retty) + ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") + launch_args = [(ans_col, ans_mask), len(self)] + offsets = [] + + # if _compile_or_get succeeds, it is safe to create a kernel that only + # consumes the columns that are of supported dtype + for col in _supported_cols_from_frame(self).values(): + data = col.data + mask = col.mask + if mask is None: + launch_args.append(data) + else: + launch_args.append((data, mask)) + offsets.append(col.offset) + launch_args += offsets + launch_args += list(args) + + try: + kernel.forall(len(self))(*launch_args) + except Exception as e: + raise RuntimeError("UDF kernel execution failed.") from e + + col = as_column(ans_col) + col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + result = cudf.Series._from_data({None: col}, self._index) + + return result + def sort_values( self, by, @@ -1298,9 +1344,7 @@ def dropna( 0 Alfred Batmobile 1940-04-25 """ if axis == 0: - result = self._drop_na_rows( - how=how, subset=subset, thresh=thresh, drop_nan=True - ) + result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -1308,9 +1352,7 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) - def _drop_na_rows( - self, how="any", subset=None, thresh=None, drop_nan=False - ): + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. @@ -1321,7 +1363,7 @@ def _drop_na_rows( *all* null values. subset : list, optional List of columns to consider when dropping rows. 
- thresh: int, optional + thresh : int, optional If specified, then drops every row containing less than `thresh` non-null values. """ @@ -1341,17 +1383,16 @@ def _drop_na_rows( if len(subset) == 0: return self.copy(deep=True) - if drop_nan: - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [ + col.nans_to_nulls() + if isinstance(col, cudf.core.column.NumericalColumn) + else col + for col in self._columns + ] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - list(self._index._data.columns) + data_columns, + [*self._index._data.columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -1418,18 +1459,9 @@ def take(self, indices, axis=0): 0 1.0 a 2 3.0 c """ - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: + if self._get_axis_from_axis_arg(axis) != 0: raise NotImplementedError("Only axis=0 is supported.") - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _reset_index(self, level, drop, col_level=0, col_fill=""): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index fa84889adea..adce3c24a83 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -859,28 +859,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "names" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["column_names"] = header["names"] - column_names = pickle.loads(header["column_names"]) - if "source_data" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.08 or older will no longer be deserializable " - "after version 21.10. Please load and resave any " - "pickles before upgrading to version 21.12.", - FutureWarning, - ) - df = cudf.DataFrame.deserialize(header["source_data"], frames) - return cls.from_frame(df)._set_names(column_names) - # Spoof the column names to construct the frame, then set manually. 
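
Since `take` above no longer routes boolean arrays through a deprecation path, positional gathers and boolean filtering are now spelled separately. A short sketch with hypothetical data:

    import cudf

    df = cudf.DataFrame({"x": [10, 20, 30]})

    taken = df.take([0, 2])                          # positional gather only
    filtered = df[cudf.Series([True, False, True])]  # booleans use regular selection
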
+ column_names = pickle.loads(header["column_names"]) header["column_names"] = pickle.dumps(range(0, len(column_names))) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0371c40274f..12a2538b776 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,7 +14,6 @@ import cupy import numpy as np import pandas as pd -from numba import cuda from pandas._config import get_option import cudf @@ -67,6 +66,7 @@ doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame +from cudf.core.udf.scalar_function import _get_scalar_kernel from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -559,19 +559,6 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "column" in header: - warnings.warn( - "Series objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - index_nframes = header["index_frame_count"] obj = super().deserialize( header, frames[header["index_frame_count"] :] @@ -965,15 +952,6 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - def set_mask(self, mask, null_count=None): - warnings.warn( - "Series.set_mask is deprecated and will be removed in the future.", - FutureWarning, - ) - return self._from_data( - {self.name: self._column.set_mask(mask)}, self._index - ) - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. @@ -1623,25 +1601,6 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - def fill(self, fill_value, begin=0, end=-1, inplace=False): - warnings.warn( - "The fill method will be removed in a future cuDF release.", - FutureWarning, - ) - fill_values = [fill_value] - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1665,15 +1624,6 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - # TODO: When this method is removed we can also remove ColumnBase.to_array. - def to_array(self, fillna=None): - warnings.warn( - "The to_array method will be removed in a future cuDF " - "release. 
Consider using `to_numpy` instead.", - FutureWarning, - ) - return self._column.to_array(fillna=fillna) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1782,27 +1732,6 @@ def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def as_mask(self): - """Convert booleans to bitmask - - Returns - ------- - device array - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([True, False, True]) - >>> s.as_mask() - - """ - if not is_bool_dtype(self.dtype): - raise TypeError( - f"Series must of boolean dtype, found: {self.dtype}" - ) - - return self._column.as_mask() - def astype(self, dtype, copy=False, errors="raise"): """ Cast the Series to the given dtype @@ -2243,76 +2172,6 @@ def update(self, other): self.mask(mask, other, inplace=True) - def reverse(self): - warnings.warn( - "Series.reverse is deprecated and will be removed in the future.", - FutureWarning, - ) - rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - return self._from_data( - {self.name: self._column[rinds]}, self.index._values[rinds] - ) - - def label_encoding(self, cats, dtype=None, na_sentinel=-1): - """Perform label encoding. - - Parameters - ---------- - values : sequence of input values - dtype : numpy.dtype; optional - Specifies the output dtype. If `None` is given, the - smallest possible integer dtype (starting with np.int8) - is used. - na_sentinel : number, default -1 - Value to indicate missing category. - - Returns - ------- - A sequence of encoded labels with value between 0 and n-1 classes(cats) - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4, 10]) - >>> s.label_encoding([2, 3]) - 0 -1 - 1 0 - 2 1 - 3 -1 - 4 -1 - dtype: int8 - - `na_sentinel` parameter can be used to - control the value when there is no encoding. - - >>> s.label_encoding([2, 3], na_sentinel=10) - 0 10 - 1 0 - 2 1 - 3 10 - 4 10 - dtype: int8 - - When none of `cats` values exist in s, entire - Series will be `na_sentinel`. - - >>> s.label_encoding(['a', 'b', 'c']) - 0 -1 - 1 -1 - 2 -1 - 3 -1 - 4 -1 - dtype: int8 - """ - - warnings.warn( - "Series.label_encoding is deprecated and will be removed in the " - "future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding(cats, dtype, na_sentinel) - def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2374,7 +2233,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): by numba based on the function logic and argument types. See examples for details. args : tuple - Not supported + Positional arguments passed to func after the series value. **kwargs Not supported @@ -2440,20 +2299,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 2 4.5 dtype: float64 """ - if args or kwargs: - raise ValueError( - "UDFs using *args or **kwargs are not yet supported." - ) - - # these functions are generally written as functions of scalar - # values rather than rows. 
Rather than writing an entirely separate - # numba kernel that is not built around a row object, its simpler - # to just turn this into the equivalent single column dataframe case - name = self.name or "__temp_srname" - df = cudf.DataFrame({name: self}) - f_ = cuda.jit(device=True)(func) - - return df.apply(lambda row: f_(row[name])) + if convert_dtype is not True: + raise ValueError("Series.apply only supports convert_dtype=True") + return self._apply(func, _get_scalar_kernel, *args, **kwargs) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -2908,7 +2756,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..ef479f19363 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,7 @@ from __future__ import annotations -import warnings +import builtins from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -143,16 +143,6 @@ def tolist(self): # noqa: D102 to_list = tolist - # TODO: When this method is removed we can also remove - # ColumnBase.to_gpu_array. - def to_gpu_array(self, fillna=None): # noqa: D102 - warnings.warn( - "The to_gpu_array method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - return self._column.to_gpu_array(fillna=fillna) - @classmethod def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. @@ -336,3 +326,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Return count of unique values for the column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + int + Number of unique values in the column. 
+ """ + return self._column.distinct_count(method=method, dropna=dropna) diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py deleted file mode 100644 index 2464906be04..00000000000 --- a/python/cudf/cudf/core/udf/pipeline.py +++ /dev/null @@ -1,388 +0,0 @@ -import math -from typing import Callable - -import cachetools -import numpy as np -from numba import cuda, typeof -from numba.np import numpy_support -from numba.types import Poison, Record, Tuple, boolean, int64, void -from nvtx import annotate - -from cudf.core.dtypes import CategoricalDtype -from cudf.core.udf.api import Masked, pack_return -from cudf.core.udf.typing import MaskedType -from cudf.utils import cudautils -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - -libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) -MASK_BITSIZE = np.dtype("int32").itemsize * 8 -precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) - -JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES -) - - -def _is_jit_supported_type(dtype): - # category dtype isn't hashable - if isinstance(dtype, CategoricalDtype): - return False - return str(dtype) in JIT_SUPPORTED_TYPES - - -def all_dtypes_from_frame(frame): - return { - colname: col.dtype - if _is_jit_supported_type(col.dtype) - else np.dtype("O") - for colname, col in frame._data.items() - } - - -def supported_dtypes_from_frame(frame): - return { - colname: col.dtype - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def supported_cols_from_frame(frame): - return { - colname: col - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def generate_cache_key(frame, func: Callable): - """Create a cache key that uniquely identifies a compilation. - - A new compilation is needed any time any of the following things change: - - The UDF itself as defined in python by the user - - The types of the columns utilized by the UDF - - The existence of the input columns masks - """ - return ( - *cudautils.make_cache_key(func, all_dtypes_from_frame(frame).values()), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), - ) - - -def get_frame_row_type(dtype): - """ - Get the numba `Record` type corresponding to a frame. - Models each column and its mask as a MaskedType and - models the row as a dictionary like data structure - containing these MaskedTypes. - - Large parts of this function are copied with comments - from the Numba internals and slightly modified to - account for validity bools to be present in the final - struct. - """ - - # Create the numpy structured type corresponding to the numpy dtype. - - fields = [] - offset = 0 - - sizes = [val[0].itemsize for val in dtype.fields.values()] - for i, (name, info) in enumerate(dtype.fields.items()): - # *info* consists of the element dtype, its offset from the beginning - # of the record, and an optional "title" containing metadata. - # We ignore the offset in info because its value assumes no masking; - # instead, we compute the correct offset based on the masked type. 
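
The `nunique` plumbing added above composes across three layers: the column computes a distinct count, `SingleColumnFrame.nunique` returns it as an int, `Frame.nunique` maps column names to counts, and `DataFrame.nunique` wraps that mapping in a Series. A sketch with hypothetical data:

    import cudf

    df = cudf.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]})

    df.nunique()        # Series: A -> 3, B -> 2
    df["A"].nunique()   # int: 3
    df.index.nunique()  # int: 3
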
- elemdtype = info[0] - title = info[2] if len(info) == 3 else None - ty = numpy_support.from_dtype(elemdtype) - infos = { - "type": MaskedType(ty), - "offset": offset, - "title": title, - } - fields.append((name, infos)) - - # increment offset by itemsize plus one byte for validity - offset += elemdtype.itemsize + 1 - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - next_itemsize = sizes[i + 1] - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return Record(fields, offset, _is_aligned_struct) - - -@annotate("NUMBA JIT", color="green", domain="cudf_python") -def get_udf_return_type(frame, func: Callable, args=()): - - """ - Get the return type of a masked UDF for a given set of argument dtypes. It - is assumed that the function consumes a dictionary whose keys are strings - and whose values are of MaskedType. Initially assume that the UDF may be - written to utilize any field in the row - including those containing an - unsupported dtype. If an unsupported dtype is actually used in the function - the compilation should fail at `compile_udf`. If compilation succeeds, one - can infer that the function does not use any of the columns of unsupported - dtype - meaning we can drop them going forward and the UDF will still end - up getting fed rows containing all the fields it actually needs to use to - compute the answer for that row. - """ - - # present a row containing all fields to the UDF and try and compile - row_type = get_frame_row_type( - np.dtype(list(all_dtypes_from_frame(frame).items())) - ) - compile_sig = (row_type, *(typeof(arg) for arg in args)) - - # Get the return type. The PTX is also returned by compile_udf, but is not - # needed here. - ptx, output_type = cudautils.compile_udf(func, compile_sig) - if not isinstance(output_type, MaskedType): - numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) - else: - numba_output_type = output_type - - return ( - numba_output_type - if not isinstance(numba_output_type, MaskedType) - else numba_output_type.value_type - ) - - -def masked_array_type_from_col(col): - """ - Return a type representing a tuple of arrays, - the first element an array of the numba type - corresponding to `dtype`, and the second an - array of bools representing a mask. - """ - nb_scalar_ty = numpy_support.from_dtype(col.dtype) - if col.mask is None: - return nb_scalar_ty[::1] - else: - return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) - - -def construct_signature(frame, return_type, args): - """ - Build the signature of numba types that will be used to - actually JIT the kernel itself later, accounting for types - and offsets. Skips columns with unsupported dtypes. 
- """ - - # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type[::1], boolean[::1])) - offsets = [] - sig = [return_type, int64] - for col in supported_cols_from_frame(frame).values(): - sig.append(masked_array_type_from_col(col)) - offsets.append(int64) - - # return_type, size, data, masks, offsets, extra args - sig = void(*(sig + offsets + [typeof(arg) for arg in args])) - - return sig - - -@cuda.jit(device=True) -def mask_get(mask, pos): - return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 - - -kernel_template = """\ -def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - # Create a structured array with the desired fields - rows = cuda.local.array(1, dtype=row_type) - - # one element of that array - row = rows[0] - -{masked_input_initializers} -{row_initializers} - - # pass the assembled row into the udf - ret = f_(row, {extra_args}) - - # pack up the return values and set them - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid -""" - -unmasked_input_initializer_template = """\ - d_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], True) -""" - -masked_input_initializer_template = """\ - d_{idx}, m_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) -""" - -row_initializer_template = """\ - row["{name}"] = masked_{idx} -""" - - -def _define_function(frame, row_type, args): - """ - The kernel we want to JIT compile looks something like the following, - which is an example for two columns that both have nulls present - - def _kernel(retval, input_col_0, input_col_1, offset_0, offset_1, size): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - rows = cuda.local.array(1, dtype=row_type) - row = rows[0] - - d_0, m_0 = input_col_0 - masked_0 = Masked(d_0[i], mask_get(m_0, i + offset_0)) - d_1, m_1 = input_col_1 - masked_1 = Masked(d_1[i], mask_get(m_1, i + offset_1)) - - row["a"] = masked_0 - row["b"] = masked_1 - - ret = f_(row) - - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid - - However we do not always have two columns and columns do not always have - an associated mask. Ideally, we would just write one kernel and make use - of `*args` - and then one function would work for any number of columns, - currently numba does not support `*args` and treats functions it JITs as - if `*args` is a singular argument. Thus we are forced to write the right - functions dynamically at runtime and define them using `exec`. 
- """ - # Create argument list for kernel - frame = supported_cols_from_frame(frame) - - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - row_initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) - if col.mask is not None: - template = masked_input_initializer_template - else: - template = unmasked_input_initializer_template - - initializer = template.format(idx=idx) - - initializers.append(initializer) - - row_initializer = row_initializer_template.format( - idx=idx, name=colname - ) - row_initializers.append(row_initializer) - - # Incorporate all of the above into the kernel code template - d = { - "input_columns": input_columns, - "input_offsets": input_offsets, - "extra_args": extra_args, - "masked_input_initializers": "\n".join(initializers), - "row_initializers": "\n".join(row_initializers), - "numba_rectype": row_type, # from global - } - - return kernel_template.format(**d) - - -@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") -def compile_or_get(frame, func, args): - """ - Return a compiled kernel in terms of MaskedTypes that launches a - kernel equivalent of `f` for the dtypes of `df`. The kernel uses - a thread for each row and calls `f` using that rows data / mask - to produce an output value and output validity for each row. - - If the UDF has already been compiled for this requested dtypes, - a cached version will be returned instead of running compilation. - - CUDA kernels are void and do not return values. Thus, we need to - preallocate a column of the correct dtype and pass it in as one of - the kernel arguments. This creates a chicken-and-egg problem where - we need the column type to compile the kernel, but normally we would - be getting that type FROM compiling the kernel (and letting numba - determine it as a return value). As a workaround, we compile the UDF - itself outside the final kernel to invoke a full typing pass, which - unfortunately is difficult to do without running full compilation. - we then obtain the return type from that separate compilation and - use it to allocate an output column of the right dtype. - """ - - # check to see if we already compiled this function - cache_key = generate_cache_key(frame, func) - if precompiled.get(cache_key) is not None: - kernel, masked_or_scalar = precompiled[cache_key] - return kernel, masked_or_scalar - - # precompile the user udf to get the right return type. - # could be a MaskedType or a scalar type. - scalar_return_type = get_udf_return_type(frame, func, args) - - # get_udf_return_type will throw a TypingError if the user tries to use - # a field in the row containing an unsupported dtype, except in the - # edge case where all the function does is return that element: - - # def f(row): - # return row[] - # In this case numba is happy to return MaskedType() - # because it relies on not finding overloaded operators for types to raise - # the exception, so we have to explicitly check for that case. 
- if isinstance(scalar_return_type, Poison): - raise TypeError(str(scalar_return_type)) - - # this is the signature for the final full kernel compilation - sig = construct_signature(frame, scalar_return_type, args) - - # this row type is used within the kernel to pack up the column and - # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype(list(supported_dtypes_from_frame(frame).items())) - row_type = get_frame_row_type(np_field_types) - - f_ = cuda.jit(device=True)(func) - # Dict of 'local' variables into which `_kernel` is defined - local_exec_context = {} - global_exec_context = { - "f_": f_, - "cuda": cuda, - "Masked": Masked, - "mask_get": mask_get, - "pack_return": pack_return, - "row_type": row_type, - } - exec( - _define_function(frame, row_type, args), - global_exec_context, - local_exec_context, - ) - # The python function definition representing the kernel - _kernel = local_exec_context["_kernel"] - kernel = cuda.jit(sig)(_kernel) - np_return_type = numpy_support.as_dtype(scalar_return_type) - precompiled[cache_key] = (kernel, np_return_type) - - return kernel, np_return_type diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py new file mode 100644 index 00000000000..5cda9fb8218 --- /dev/null +++ b/python/cudf/cudf/core/udf/row_function.py @@ -0,0 +1,151 @@ +import math + +import numpy as np +from numba import cuda +from numba.np import numpy_support +from numba.types import Record + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + row_initializer_template, + row_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _all_dtypes_from_frame, + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, + _supported_cols_from_frame, + _supported_dtypes_from_frame, +) + + +def _get_frame_row_type(dtype): + """ + Get the numba `Record` type corresponding to a frame. + Models each column and its mask as a MaskedType and + models the row as a dictionary like data structure + containing these MaskedTypes. + + Large parts of this function are copied with comments + from the Numba internals and slightly modified to + account for validity bools to be present in the final + struct. + + See numba.np.numpy_support.from_struct_dtype for details. + """ + + # Create the numpy structured type corresponding to the numpy dtype. + + fields = [] + offset = 0 + + sizes = [val[0].itemsize for val in dtype.fields.values()] + for i, (name, info) in enumerate(dtype.fields.items()): + # *info* consists of the element dtype, its offset from the beginning + # of the record, and an optional "title" containing metadata. + # We ignore the offset in info because its value assumes no masking; + # instead, we compute the correct offset based on the masked type. 
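
To make the offset arithmetic above concrete, here is a hypothetical walk-through for a frame with an int64 column followed by a float64 column (itemsize 8 each); this only traces the layout logic, not the numba Record construction itself:

    import math

    sizes = [8, 8]  # itemsizes of the two fields
    offset = 0
    for i, itemsize in enumerate(sizes):
        print(f"field {i}: offset {offset}")  # field 0: 0, field 1: 16
        offset += itemsize + 1                # data plus one validity byte
        if i < len(sizes) - 1:                # align to the next field's size
            offset = int(math.ceil(offset / sizes[i + 1]) * sizes[i + 1])
    print(f"record size: {offset}")           # 25
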
+ elemdtype = info[0] + title = info[2] if len(info) == 3 else None + ty = numpy_support.from_dtype(elemdtype) + infos = { + "type": MaskedType(ty), + "offset": offset, + "title": title, + } + fields.append((name, infos)) + + # increment offset by itemsize plus one byte for validity + offset += elemdtype.itemsize + 1 + + # Align the next member of the struct to be a multiple of the + # memory access size, per PTX ISA 7.4/5.4.5 + if i < len(sizes) - 1: + next_itemsize = sizes[i + 1] + offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + + # Numba requires that structures are aligned for the CUDA target + _is_aligned_struct = True + return Record(fields, offset, _is_aligned_struct) + + +def _row_kernel_string_from_template(frame, row_type, args): + """ + Function to write numba kernels for `DataFrame.apply` as a string. + Workaround until numba supports functions that use `*args` + + `DataFrame.apply` expects functions of a dict like row as well as + possibly one or more scalar arguments + + def f(row, c, k): + return (row['x'] + c) / k + + Both the number of input columns as well as their nullability and any + scalar arguments may vary, so the kernels vary significantly. See + templates.py for the full row kernel template and more details. + """ + # Create argument list for kernel + frame = _supported_cols_from_frame(frame) + + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + # Generate the initializers for each device function argument + initializers = [] + row_initializers = [] + for i, (colname, col) in enumerate(frame.items()): + idx = str(i) + template = ( + masked_input_initializer_template + if col.mask is not None + else unmasked_input_initializer_template + ) + initializers.append(template.format(idx=idx)) + row_initializers.append( + row_initializer_template.format(idx=idx, name=colname) + ) + + return row_kernel_template.format( + input_columns=input_columns, + input_offsets=input_offsets, + extra_args=extra_args, + masked_input_initializers="\n".join(initializers), + row_initializers="\n".join(row_initializers), + numba_rectype=row_type, + ) + + +def _get_row_kernel(frame, func, args): + row_type = _get_frame_row_type( + np.dtype(list(_all_dtypes_from_frame(frame).items())) + ) + scalar_return_type = _get_udf_return_type(row_type, func, args) + + # this is the signature for the final full kernel compilation + sig = _construct_signature(frame, scalar_return_type, args) + + # this row type is used within the kernel to pack up the column and + # mask data into the dict like data structure the user udf expects + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) + row_type = _get_frame_row_type(np_field_types) + + # Dict of 'local' variables into which `_kernel` is defined + global_exec_context = { + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + "row_type": row_type, + } + kernel_string = _row_kernel_string_from_template(frame, row_type, args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py new file mode 100644 index 00000000000..7f3b461a1f0 --- /dev/null +++ b/python/cudf/cudf/core/udf/scalar_function.py @@ -0,0 +1,64 @@ +from numba import cuda +from numba.np 
import numpy_support + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + scalar_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, +) + + +def _scalar_kernel_string_from_template(sr, args): + """ + Function to write numba kernels for `Series.apply` as a string. + Workaround until numba supports functions that use `*args` + + `Series.apply` expects functions of a single variable and possibly + one or more constants, such as: + + def f(x, c, k): + return (x + c) / k + + where the `x` are meant to be the values of the series. Since there + can be only one column, the only thing that varies in the kinds of + kernels that we want is the number of extra_args. See templates.py + for the full kernel template. + """ + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + masked_initializer = ( + masked_input_initializer_template + if sr._column.mask + else unmasked_input_initializer_template + ).format(idx=0) + + return scalar_kernel_template.format( + extra_args=extra_args, masked_initializer=masked_initializer + ) + + +def _get_scalar_kernel(sr, func, args): + sr_type = MaskedType(numpy_support.from_dtype(sr.dtype)) + scalar_return_type = _get_udf_return_type(sr_type, func, args) + + sig = _construct_signature(sr, scalar_return_type, args=args) + f_ = cuda.jit(device=True)(func) + global_exec_context = { + "f_": f_, + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + } + kernel_string = _scalar_kernel_string_from_template(sr, args=args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py new file mode 100644 index 00000000000..8cb11133323 --- /dev/null +++ b/python/cudf/cudf/core/udf/templates.py @@ -0,0 +1,52 @@ +unmasked_input_initializer_template = """\ + d_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], True) +""" + +masked_input_initializer_template = """\ + d_{idx}, m_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], _mask_get(m_{idx}, i + offset_{idx})) +""" + +row_initializer_template = """\ + row["{name}"] = masked_{idx} +""" + +row_kernel_template = """\ +def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: + # Create a structured array with the desired fields + rows = cuda.local.array(1, dtype=row_type) + + # one element of that array + row = rows[0] + +{masked_input_initializers} +{row_initializers} + + # pass the assembled row into the udf + ret = f_(row, {extra_args}) + + # pack up the return values and set them + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" + +scalar_kernel_template = """ +def _kernel(retval, size, input_col_0, offset_0, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + + if i < size: + +{masked_initializer} + + ret = f_(masked_0, {extra_args}) + + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py new file mode 100644 index 00000000000..a98ee40274e --- /dev/null +++ 
b/python/cudf/cudf/core/udf/utils.py @@ -0,0 +1,216 @@ +from typing import Callable + +import cachetools +import numpy as np +from numba import cuda, typeof +from numba.core.errors import TypingError +from numba.np import numpy_support +from numba.types import Poison, Tuple, boolean, int64, void +from nvtx import annotate + +from cudf.core.dtypes import CategoricalDtype +from cudf.core.udf.typing import MaskedType +from cudf.utils import cudautils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) + +JIT_SUPPORTED_TYPES = ( + NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES +) + +libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) +MASK_BITSIZE = np.dtype("int32").itemsize * 8 + +precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) + + +@annotate("NUMBA JIT", color="green", domain="cudf_python") +def _get_udf_return_type(argty, func: Callable, args=()): + """ + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that the function consumes a dictionary whose keys are strings + and whose values are of MaskedType. Initially assume that the UDF may be + written to utilize any field in the row - including those containing an + unsupported dtype. If an unsupported dtype is actually used in the function + the compilation should fail at `compile_udf`. If compilation succeeds, one + can infer that the function does not use any of the columns of unsupported + dtype - meaning we can drop them going forward and the UDF will still end + up getting fed rows containing all the fields it actually needs to use to + compute the answer for that row. + """ + + # present a row containing all fields to the UDF and try and compile + compile_sig = (argty, *(typeof(arg) for arg in args)) + + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. + ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type + + result = ( + numba_output_type + if not isinstance(numba_output_type, MaskedType) + else numba_output_type.value_type + ) + + # _get_udf_return_type will throw a TypingError if the user tries to use + # a field in the row containing an unsupported dtype, except in the + # edge case where all the function does is return that element: + + # def f(row): + # return row[] + # In this case numba is happy to return MaskedType() + # because it relies on not finding overloaded operators for types to raise + # the exception, so we have to explicitly check for that case. 
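
At the user level, the scalar path above backs `Series.apply`: the UDF is typed against a single masked value rather than a row, and extra positional arguments are forwarded just as in the row case. A minimal sketch with hypothetical data:

    import cudf

    s = cudf.Series([1, None, 3])

    def f(x, k):
        if x is cudf.NA:
            return 0
        return x + k

    s.apply(f, args=(2,))   # -> [3, 0, 5]
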
+ if isinstance(result, Poison): + raise TypingError(str(result)) + + return result + + +def _is_jit_supported_type(dtype): + # category dtype isn't hashable + if isinstance(dtype, CategoricalDtype): + return False + return str(dtype) in JIT_SUPPORTED_TYPES + + +def _all_dtypes_from_frame(frame): + return { + colname: col.dtype + if _is_jit_supported_type(col.dtype) + else np.dtype("O") + for colname, col in frame._data.items() + } + + +def _supported_dtypes_from_frame(frame): + return { + colname: col.dtype + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _supported_cols_from_frame(frame): + return { + colname: col + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _masked_array_type_from_col(col): + """ + Return a type representing a tuple of arrays, + the first element an array of the numba type + corresponding to `dtype`, and the second an + array of bools representing a mask. + """ + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + if col.mask is None: + return nb_scalar_ty[::1] + else: + return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + + +def _construct_signature(frame, return_type, args): + """ + Build the signature of numba types that will be used to + actually JIT the kernel itself later, accounting for types + and offsets. Skips columns with unsupported dtypes. + """ + + # Tuple of arrays, first the output data array, then the mask + return_type = Tuple((return_type[::1], boolean[::1])) + offsets = [] + sig = [return_type, int64] + for col in _supported_cols_from_frame(frame).values(): + sig.append(_masked_array_type_from_col(col)) + offsets.append(int64) + + # return_type, size, data, masks, offsets, extra args + sig = void(*(sig + offsets + [typeof(arg) for arg in args])) + + return sig + + +@cuda.jit(device=True) +def _mask_get(mask, pos): + """Return the validity of mask[pos] as a word.""" + return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 + + +def _generate_cache_key(frame, func: Callable): + """Create a cache key that uniquely identifies a compilation. + + A new compilation is needed any time any of the following things change: + - The UDF itself as defined in python by the user + - The types of the columns utilized by the UDF + - The existence of the input columns masks + """ + return ( + *cudautils.make_cache_key( + func, tuple(_all_dtypes_from_frame(frame).values()) + ), + *(col.mask is None for col in frame._data.values()), + *frame._data.keys(), + ) + + +@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +def _compile_or_get(frame, func, args, kernel_getter=None): + """ + Return a compiled kernel in terms of MaskedTypes that launches a + kernel equivalent of `f` for the dtypes of `df`. The kernel uses + a thread for each row and calls `f` using that rows data / mask + to produce an output value and output validity for each row. + + If the UDF has already been compiled for this requested dtypes, + a cached version will be returned instead of running compilation. + + CUDA kernels are void and do not return values. Thus, we need to + preallocate a column of the correct dtype and pass it in as one of + the kernel arguments. This creates a chicken-and-egg problem where + we need the column type to compile the kernel, but normally we would + be getting that type FROM compiling the kernel (and letting numba + determine it as a return value). 
As a workaround, we compile the UDF
+    itself outside the final kernel to invoke a full typing pass, which
+    unfortunately is difficult to do without running full compilation.
+    We then obtain the return type from that separate compilation and
+    use it to allocate an output column of the right dtype.
+    """
+
+    # check to see if we already compiled this function
+    cache_key = _generate_cache_key(frame, func)
+    if precompiled.get(cache_key) is not None:
+        kernel, masked_or_scalar = precompiled[cache_key]
+        return kernel, masked_or_scalar
+
+    # precompile the user udf to get the right return type.
+    # could be a MaskedType or a scalar type.
+
+    kernel, scalar_return_type = kernel_getter(frame, func, args)
+
+    np_return_type = numpy_support.as_dtype(scalar_return_type)
+    precompiled[cache_key] = (kernel, np_return_type)
+
+    return kernel, np_return_type
+
+
+def _get_kernel(kernel_string, globals_, sig, func):
+    """Template kernel compilation helper function."""
+    f_ = cuda.jit(device=True)(func)
+    globals_["f_"] = f_
+    exec(kernel_string, globals_)
+    _kernel = globals_["_kernel"]
+    kernel = cuda.jit(sig)(_kernel)
+
+    return kernel
diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py
index 8a31afab9cf..5d6f52c0307 100644
--- a/python/cudf/cudf/errors.py
+++ b/python/cudf/cudf/errors.py
@@ -1,9 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.


-class UnSupportedGPUError(Exception):
-    pass
-
-
-class UnSupportedCUDAError(Exception):
+class UnsupportedCUDAError(Exception):
     pass
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5c35d004ac0..a09fb1f8e12 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -291,7 +291,12 @@ def read_orc(
     **kwargs,
 ):
     """{docstring}"""
-
+    if decimal_cols_as_float is not None:
+        warnings.warn(
+            "`decimal_cols_as_float` is deprecated and will be removed in "
+            "the future",
+            FutureWarning,
+        )
     from cudf import DataFrame

 # Multiple sources are passed as a list. If a single source is passed,
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index cc5aec36853..41dac26edf8 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -40,6 +40,17 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))


+def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+    """Randomly nullify elements in series with the provided probability."""
+    probs = [null_probability, 1 - null_probability]
+    rng = np.random.default_rng(seed=seed)
+    mask = rng.choice([False, True], size=len(series), p=probs)
+    series[mask] = None
+
+
+# TODO: This function should be removed. Anywhere that it is being used should
+# instead generate a random boolean array (bytemask) and use the public
+# APIs to set those elements to None.
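
Usage of the `set_random_null_mask_inplace` helper added above, with an assumed length and seed; elements are nulled in place with the given probability:

    import numpy as np

    import cudf
    from cudf.testing._utils import set_random_null_mask_inplace

    s = cudf.Series(np.arange(100, dtype="float64"))
    set_random_null_mask_inplace(s, null_probability=0.3, seed=13)
    s.null_count  # roughly 30 elements are now null
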
diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py
index 8a31afab9cf..5d6f52c0307 100644
--- a/python/cudf/cudf/errors.py
+++ b/python/cudf/cudf/errors.py
@@ -1,9 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 
-class UnSupportedGPUError(Exception):
-    pass
-
-
-class UnSupportedCUDAError(Exception):
+class UnsupportedCUDAError(Exception):
     pass
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5c35d004ac0..a09fb1f8e12 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -291,7 +291,12 @@ def read_orc(
     **kwargs,
 ):
     """{docstring}"""
-
+    if decimal_cols_as_float is not None:
+        warnings.warn(
+            "`decimal_cols_as_float` is deprecated and will be removed in "
+            "the future",
+            FutureWarning,
+        )
     from cudf import DataFrame
 
     # Multiple sources are passed as a list. If a single source is passed,
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index cc5aec36853..41dac26edf8 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -40,6 +40,17 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))
 
 
+def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+    """Randomly nullify elements in series with the provided probability."""
+    probs = [1 - null_probability, null_probability]  # P(True) nullified below
+    rng = np.random.default_rng(seed=seed)
+    mask = rng.choice([False, True], size=len(series), p=probs)
+    series[mask] = None
+
+
+# TODO: This function should be removed. Anywhere that it is being used
+# should instead generate a random boolean array (bytemask) and use the
+# public APIs to set those elements to None.
 def random_bitmask(size):
     """
     Parameters
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index 925c9ef720c..ff6e79e7804 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -24,14 +24,10 @@ def test_applymap_round(nelem, masked):
         boolmask = np.asarray(
             utils.expand_bits_to_bytes(bitmask), dtype=np.bool_
         )[:nelem]
-        data[~boolmask] = np.nan
+        data[~boolmask] = None
 
     sr = Series(data)
 
-    if masked:
-        # Mask the Series
-        sr = sr.set_mask(bitmask)
-
     # Call applymap
     out = sr.applymap(
         lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index e01b952be94..748cf958ac3 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -437,7 +437,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]]
     got_mask = mask_to_bools(
         gd_data["a"]._column.base_mask, 0, len(gd_data)
-    ).to_array()
+    ).values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
 
@@ -475,7 +475,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     expect_mask = [True if x is not pd.NA else False for x in pd_data]
     got_mask = mask_to_bools(
         gd_data._column.base_mask, 0, len(gd_data)
-    ).to_array()
+    ).values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 194d838a04e..a3b7bd2373f 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -968,7 +968,7 @@ def test_dataframe_dir_and_getattr():
         df.not_a_column
 
 
-def test_empty_dataframe_to_array():
+def test_empty_dataframe_to_cupy():
     df = cudf.DataFrame()
 
     # Check fully empty dataframe.
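# A short sketch of the idioms the tests above migrate to, assuming cudf
# is importable and a GPU is available: rows are nullified with plain
# boolean-mask assignment instead of the removed `Series.set_mask`, and
# results are copied to host memory with `.values_host` instead of the
# removed `.to_array()`.
import numpy as np

import cudf

sr = cudf.Series([1.0, 2.0, 3.0, 4.0])
boolmask = np.array([True, False, True, False])
sr[~boolmask] = None  # rows 1 and 3 become <NA>
assert sr.null_count == 2

# `.values_host` copies the column's data into a host numpy array.
np.testing.assert_array_equal(
    sr.dropna().values_host, np.array([1.0, 3.0])
)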
@@ -1023,7 +1023,7 @@ def test_dataframe_to_cupy_null_values(): for k in "abcd": df[k] = data = np.random.random(nelem) bitmask = utils.random_bitmask(nelem) - df[k] = df[k].set_mask(bitmask) + df[k] = df[k]._column.set_mask(bitmask) boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ ) @@ -1194,7 +1194,7 @@ def test_dataframe_hash_partition_masked_value(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"].set_mask(bitmask) + gdf["val"] = gdf["val"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3) # Verify that the valid mask is correct for p in parted: @@ -1215,7 +1215,7 @@ def test_dataframe_hash_partition_masked_keys(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"].set_mask(bitmask) + gdf["key"] = gdf["key"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) # Verify that the valid mask is correct for p in parted: @@ -9150,3 +9150,37 @@ def test_dataframe_assign_cp_np_array(): gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_dataframe_nunique(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.nunique() + expected = pdf.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], +) +def test_dataframe_nunique_index(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.index.nunique() + expected = pdf.index.nunique() + + assert_eq(expected, actual) + + +def test_dataframe_rename_duplicate_column(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + with pytest.raises( + ValueError, match="Duplicate column names are not allowed" + ): + gdf.rename(columns={"a": "b"}, inplace=True) diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py deleted file mode 100644 index 224db2b39d1..00000000000 --- a/python/cudf/cudf/tests/test_fill.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -@pytest.mark.parametrize( - "fill_value,data", - [ - (7, [6, 3, 4]), - ("x", ["a", "b", "c", "d", "e", "f"]), - (7, [6, 3, 4, 2, 1, 7, 8, 5]), - (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), - ("b", pd.Categorical(["a", "b", "c"])), - (None, [0.0, 1.0, 2.0, 3.0]), - ], -) -@pytest.mark.parametrize( - "begin,end", - [ - (0, -1), - (0, 4), - (1, -1), - (1, 4), - (-2, 1), - (-2, -1), - (10, 12), - (8, 10), - (10, 8), - (-10, -8), - (-2, 6), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fill(data, fill_value, begin, end, inplace): - gs = cudf.Series(data) - ps = gs.to_pandas() - - if inplace: - actual = gs - gs[begin:end] = fill_value - else: - # private impl doesn't take care of rounding or bounds check - if begin < 0: - begin += len(gs) - - if end < 0: - end += len(gs) - - begin = max(0, min(len(gs), begin)) - end = max(0, min(len(gs), end)) - actual = gs.fill(fill_value, begin, end, False) - assert actual is not gs - - ps[begin:end] = fill_value - - assert_eq(ps, actual) - - -@pytest.mark.xfail(raises=ValueError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" diff --git 
a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e452dc5d7f7..102e5b57e8e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -783,8 +783,8 @@ def test_dataframe_masked_slicing(nelem, slice_start, slice_end): gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) + gdf["a"] = gdf["a"]._column.set_mask(utils.random_bitmask(nelem)) + gdf["b"] = gdf["b"]._column.set_mask(utils.random_bitmask(nelem)) def do_slice(x): return x[slice_start:slice_end] diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py deleted file mode 100644 index e5c6bacf1d1..00000000000 --- a/python/cudf/cudf/tests/test_label_encode.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. - -import random -from itertools import product - -import numpy as np -import pytest - -import cudf -from cudf import DataFrame, Series - - -def _random_float(nelem, dtype): - return np.random.random(nelem).astype(dtype) - - -def _random_int(nelem, dtype): - return np.random.randint(low=0, high=nelem, size=nelem, dtype=dtype) - - -def _random(nelem, dtype): - dtype = cudf.dtype(dtype) - if dtype.kind in {"i", "u"}: - return _random_int(nelem, dtype) - elif dtype.kind == "f": - return _random_float(nelem, dtype) - - -_param_sizes = [1, 7, 10, 100, 1000] -_param_dtypes = [np.int32, np.float32] - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "nelem,dtype", list(product(_param_sizes, _param_dtypes)) -) -def test_label_encode(nelem, dtype): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = _random(nelem, dtype) - vals = df["cats"].unique() - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_numpy() - - # verify labels of new column - for i in range(arr.size): - np.testing.assert_equal(arr[i], lab.get(df.cats[i], None)) - - # label encode data frame - df2 = df.label_encoding(column="cats", prefix="cats", cats=vals) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -def test_label_encode_drop_one(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = np.random.randint(7, size=10, dtype=np.int32) - vals = df["cats"].unique() - # drop 1 randomly - vals = vals[vals.index != random.randrange(len(vals))].reset_index( - drop=True - ) - - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_numpy() - - # verify labels of new column - - for i in range(arr.size): - # assuming -1 is used for missing value - np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1)) - - # label encode data frame - df2 = df.label_encoding( - column="cats", prefix="cats", cats=vals, dtype="float32" - ) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -def 
test_label_encode_float_output(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = arr = np.random.randint(7, size=10, dtype=np.int32) - cats = [1, 2, 3, 4] - encoder = {c: i for i, c in enumerate(cats)} - df2 = df.label_encoding( - column="cats", - prefix="cats", - cats=cats, - dtype=np.float32, - na_sentinel=np.nan, - ) - - got = df2["cats_labels"].to_numpy(na_value=np.nan) - - handcoded = np.array([encoder.get(v, np.nan) for v in arr]) - np.testing.assert_equal(got, handcoded) - - -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "ncats,cat_dtype", [(10, np.int8), (127, np.int8), (128, np.int16)] -) -def test_label_encode_dtype(ncats, cat_dtype): - s = Series([str(i % ncats) for i in range(ncats + 1)]) - cats = s.unique().astype(s.dtype) - encoded_col = s.label_encoding(cats=cats) - np.testing.assert_equal(encoded_col.dtype, cat_dtype) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 44749103b54..fc9ad9711d1 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + import functools import operator @@ -586,3 +587,18 @@ def test_listcol_setitem_error_cases(data, item, error): sr = cudf.Series(data) with pytest.raises(BaseException, match=error): sr[1] = item + + +def test_listcol_setitem_retain_dtype(): + df = cudf.DataFrame( + {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} + ) + df1 = df.head(0) + # Performing a setitem on `b` triggers a `column.column_empty_like` call + # which tries to create an empty ListColumn. + df1["b"] = df1["c"] + # Performing a copy to trigger a copy dtype which is obtained by accessing + # `ListColumn.children` that would have been corrupted in previous call + # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ + df2 = df1.copy() + assert df2["a"].dtype == df["a"].dtype diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 21556aad1eb..80ab0671a0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -28,7 +28,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - random_bitmask, + set_random_null_mask_inplace, ) @@ -2124,7 +2124,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): gdf = cudf.from_pandas(pdf) if add_nulls: for col in gdf: - gdf[col] = gdf[col].set_mask(random_bitmask(len(gdf))) + set_random_null_mask_inplace(gdf[col]) gdf.to_parquet(file_path, index=False) # Read back from pyarrow diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 82020f30f7c..ca02ee55df0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -20,10 +20,8 @@ @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): size = 5 - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(1, 9, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view.copy_to_host(), @@ -62,10 +60,8 @@ def test_null_dataframe(ncols): size = 20 gdf = cudf.DataFrame() for idx, dtype in 
enumerate(dtype_categories): - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(0, 128, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffdd53c58ac..358484d79b9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -561,7 +561,9 @@ def test_series_value_counts(dropna, normalize): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series.from_masked_array( + arr, cudf.Series(mask)._column.as_mask() + ) sr.name = "col" expect = ( @@ -1517,3 +1519,74 @@ def test_series_transpose(data): assert_eq(pd_transposed, cudf_transposed) assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.nunique() + expected = pd_s.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique_index(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.index.nunique() + expected = pd_s.index.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "fill_value,data", + [ + (7, [6, 3, 4]), + ("x", ["a", "b", "c", "d", "e", "f"]), + (7, [6, 3, 4, 2, 1, 7, 8, 5]), + (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), + ("b", pd.Categorical(["a", "b", "c"])), + (None, [0.0, 1.0, 2.0, 3.0]), + ], +) +@pytest.mark.parametrize( + "begin,end", + [ + (0, -1), + (0, 4), + (1, -1), + (1, 4), + (-2, 1), + (-2, -1), + (10, 12), + (8, 10), + (10, 8), + (-10, -8), + (-2, 6), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fill(data, fill_value, begin, end, inplace): + gs = cudf.Series(data) + ps = gs.to_pandas() + + actual = gs + gs[begin:end] = fill_value + ps[begin:end] = fill_value + + assert_eq(ps, actual) + + +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 142ca6c6831..cb3a369d067 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -32,7 +32,8 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None psr = sr.to_pandas() psr[~mask] = np.nan @@ -83,7 +84,8 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -298,7 +300,8 @@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = cudf.Series.from_masked_array(arr, 
cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 935c3868a68..c5cd8f8b717 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -49,4 +49,4 @@ def generic_function(a, b): result = lhs_arr ** 3 + rhs_arr - np.testing.assert_almost_equal(result, out_col.to_array()) + np.testing.assert_almost_equal(result, out_col.values_host) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index c9c2c440632..56090c8eacf 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -13,6 +13,7 @@ comparison_ops, unary_ops, ) +from cudf.core.udf.utils import precompiled from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq @@ -485,7 +486,7 @@ def outer(row): {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} ) - with pytest.raises(AttributeError): + with pytest.raises(ValueError): gdf.apply(outer, axis=1) pdf = gdf.to_pandas(nullable=True) @@ -538,7 +539,7 @@ def func(row): return row["unsupported_col"] # check that we fail when an unsupported type is used within a function - with pytest.raises(TypeError): + with pytest.raises(ValueError): data.apply(func, axis=1) # also check that a DF containing unsupported dtypes can still run a @@ -595,6 +596,44 @@ def func(row, c, k): run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_mask_udf_scalar_args_binops_series(data, op): + data = cudf.Series(data) + + def func(x, c): + return x + c + + run_masked_udf_series(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_scalar_args_binops_multiple_series(data, op): + data = cudf.Series(data) + + def func(data, c, k): + x = op(data, c) + y = op(x, k) + return y + + run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) + + def test_masked_udf_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -612,3 +651,16 @@ def test_masked_udf_caching(): expect = data ** 3 got = data.applymap(lambda x: x ** 3) assert_eq(expect, got, check_dtype=False) + + # make sure we get a hit when reapplying + def f(x): + return x + 1 + + precompiled.clear() + assert precompiled.currsize == 0 + data.apply(f) + + assert precompiled.currsize == 1 + data.apply(f) + + assert precompiled.currsize == 1 diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index fa5cde76524..3cbbc1e1ce7 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -173,7 +173,9 @@ def run(self, df, **launch_params): outputs[k], index=outdf.index, nan_as_null=False ) if out_mask is not None: - outdf[k] = outdf[k].set_mask(out_mask.data_array_view) + outdf._data[k] = outdf[k]._column.set_mask( + out_mask.data_array_view + ) return outdf diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 8947760e052..bd3da4ea2ba 100644 --- 
a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -26,16 +26,7 @@ def validate_setup(): runtimeGetVersion, ) - def _try_get_old_or_new_symbols(): - try: - # CUDA 10.2+ symbols - return [ - cudaError_t.cudaErrorDeviceUninitialized, - cudaError_t.cudaErrorTimeout, - ] - except AttributeError: - # CUDA 10.1 symbols - return [cudaError_t.cudaErrorDeviceUninitilialized] + from cudf.errors import UnsupportedCUDAError notify_caller_errors = { cudaError_t.cudaErrorInitializationError, @@ -51,7 +42,8 @@ def _try_get_old_or_new_symbols(): cudaError_t.cudaErrorSystemNotReady, cudaError_t.cudaErrorSystemDriverMismatch, cudaError_t.cudaErrorCompatNotSupportedOnDevice, - *_try_get_old_or_new_symbols(), + cudaError_t.cudaErrorDeviceUninitialized, + cudaError_t.cudaErrorTimeout, cudaError_t.cudaErrorUnknown, cudaError_t.cudaErrorApiFailureBase, } @@ -72,43 +64,38 @@ def _try_get_old_or_new_symbols(): cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0 ) - if major_version >= 6: - # You have a GPU with NVIDIA Pascal™ architecture or better + if major_version < 6: + # A GPU with NVIDIA Pascal™ architecture or newer is required. + # Reference: https://developer.nvidia.com/cuda-gpus # Hardware Generation Compute Capability + # Ampere 8.x # Turing 7.5 - # Volta 7.x + # Volta 7.0, 7.2 # Pascal 6.x - # Maxwell 5.x + # Maxwell 5.x # Kepler 3.x # Fermi 2.x - pass - else: device_name = deviceGetName(0) minor_version = getDeviceAttribute( cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0 ) warnings.warn( - f"You will need a GPU with NVIDIA Pascal™ or " - f"newer architecture" - f"\nDetected GPU 0: {device_name} \n" - f"Detected Compute Capability: " - f"{major_version}.{minor_version}" + "A GPU with NVIDIA Pascal™ (Compute Capability 6.0) " + "or newer architecture is required.\n" + f"Detected GPU 0: {device_name}\n" + f"Detected Compute Capability: {major_version}.{minor_version}" ) cuda_runtime_version = runtimeGetVersion() - if cuda_runtime_version >= 10000: - # CUDA Runtime Version Check: Runtime version is greater than 10000 - pass - else: - from cudf.errors import UnSupportedCUDAError - - minor_version = cuda_runtime_version % 100 - major_version = (cuda_runtime_version - minor_version) // 1000 - raise UnSupportedCUDAError( - f"Detected CUDA Runtime version is " - f"{major_version}.{str(minor_version)[0]}" - f"Please update your CUDA Runtime to 10.0 or above" + if cuda_runtime_version < 11000: + # Require CUDA Runtime version 11.0 or greater. + major_version = cuda_runtime_version // 1000 + minor_version = (cuda_runtime_version % 1000) // 10 + raise UnsupportedCUDAError( + "Detected CUDA Runtime version is " + f"{major_version}.{minor_version}. " + "Please update your CUDA Runtime to 11.0 or above." ) cuda_driver_supported_rt_version = driverGetVersion() @@ -124,15 +111,12 @@ def _try_get_old_or_new_symbols(): # https://docs.nvidia.com/deploy/cuda-compatibility/index.html if cuda_driver_supported_rt_version == 0: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - "We couldn't detect the GPU driver " - "properly. Please follow the linux installation guide to " - "ensure your driver is properly installed " - ": https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" + raise UnsupportedCUDAError( + "We couldn't detect the GPU driver properly. 
Please follow " + "the installation guide to ensure your driver is properly " + "installed: " + "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" ) - elif cuda_driver_supported_rt_version >= cuda_runtime_version: # CUDA Driver Version Check: # Driver Runtime version is >= Runtime version @@ -147,17 +131,12 @@ def _try_get_old_or_new_symbols(): # version 450.80.02 supports. pass else: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - f"Please update your NVIDIA GPU Driver to support CUDA " - f"Runtime.\n" - f"Detected CUDA Runtime version : {cuda_runtime_version}" - f"\n" - f"Latest version of CUDA supported by current " + raise UnsupportedCUDAError( + "Please update your NVIDIA GPU Driver to support CUDA " + "Runtime.\n" + f"Detected CUDA Runtime version : {cuda_runtime_version}\n" + "Latest version of CUDA supported by current " f"NVIDIA GPU Driver : {cuda_driver_supported_rt_version}" ) - else: - warnings.warn("No NVIDIA GPU detected") diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index d23094ef3f9..add4ecd8f01 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -242,7 +242,6 @@ def _fillna_natwise(col): return column.build_column( data=result.base_data, dtype=result.dtype, - mask=col.base_mask, size=result.size, offset=result.offset, children=result.base_children, diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 149d98ebfb9..1bc270a5b9f 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + import math from operator import getitem from typing import Set @@ -42,19 +43,11 @@ def __init__(self, *args, **kwargs): def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( - self.obj, - by=self.index, - slice=key, - sort=self.sort, - **self.dropna, + self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, ) else: g = CudfSeriesGroupBy( - self.obj, - by=self.index, - slice=key, - sort=self.sort, - **self.dropna, + self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, ) g._meta = g._meta[key] @@ -63,8 +56,8 @@ def __getitem__(self, key): def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, - {c: "mean" for c in self.obj.columns if c not in self.index}, + self.by, + {c: "mean" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, dropna=self.dropna, @@ -76,8 +69,8 @@ def mean(self, split_every=None, split_out=1): def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, - {c: "collect" for c in self.obj.columns if c not in self.index}, + self.by, + {c: "collect" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, dropna=self.dropna, @@ -94,10 +87,10 @@ def aggregate(self, arg, split_every=None, split_out=1): if ( isinstance(self.obj, DaskDataFrame) and ( - isinstance(self.index, str) + isinstance(self.by, str) or ( - isinstance(self.index, list) - and all(isinstance(x, str) for x in self.index) + isinstance(self.by, list) + and all(isinstance(x, str) for x in self.by) ) ) and _is_supported(arg, SUPPORTED_AGGS) @@ -133,7 +126,7 @@ def __init__(self, *args, **kwargs): def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "mean"}, 
split_every=split_every, split_out=split_out, @@ -146,7 +139,7 @@ def mean(self, split_every=None, split_out=1): def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "std"}, split_every=split_every, split_out=split_out, @@ -159,7 +152,7 @@ def std(self, split_every=None, split_out=1): def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "var"}, split_every=split_every, split_out=split_out, @@ -172,7 +165,7 @@ def var(self, split_every=None, split_out=1): def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "collect"}, split_every=split_every, split_out=split_out, @@ -192,12 +185,12 @@ def aggregate(self, arg, split_every=None, split_out=1): if ( isinstance(self.obj, DaskDataFrame) - and isinstance(self.index, (str, list)) + and isinstance(self.by, (str, list)) and _is_supported(arg, SUPPORTED_AGGS) ): return groupby_agg( self.obj, - self.index, + self.by, arg, split_every=split_every, split_out=split_out, diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 1521ce41806..c7342818610 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -41,7 +41,7 @@ def test_series(data): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) - np.testing.assert_equal(np.array(pdsr), dsr.compute().to_array()) + np.testing.assert_equal(np.array(pdsr), dsr.compute().values_host) @pytest.mark.parametrize("data", [data_dt_1()]) @@ -114,7 +114,7 @@ def test_categorical_basic(data): sr = Series(cat) dsr = dgd.from_cudf(sr, npartitions=2) result = dsr.compute() - np.testing.assert_array_equal(cat.codes, result.to_array()) + np.testing.assert_array_equal(cat.codes, result.cat.codes.values_host) assert dsr.dtype.to_pandas() == pdsr.dtype # Test attributes @@ -122,7 +122,9 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal(pdsr.cat.codes.values, result.to_array()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, result.cat.codes.values_host + ) string = str(result) expect_str = """ @@ -207,12 +209,12 @@ def test_categorical_compare_ordered(data): # Test equality out = dsr1 == dsr1 assert out.dtype == np.bool_ - assert np.all(out.compute().to_array()) + assert np.all(out.compute().values_host) assert np.all(pdsr1 == pdsr1) # Test inequality out = dsr1 != dsr1 - assert not np.any(out.compute().to_array()) + assert not np.any(out.compute().values_host) assert not np.any(pdsr1 != pdsr1) assert dsr1.cat.ordered @@ -220,10 +222,10 @@ def test_categorical_compare_ordered(data): # Test ordered operators np.testing.assert_array_equal( - pdsr1 < pdsr2, (dsr1 < dsr2).compute().to_array() + pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host ) np.testing.assert_array_equal( - pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array() + pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ace9701b677..67fed62c582 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -284,7 +284,7 @@ def test_assign(): got = dgf.assign(z=newcol) dd.assert_eq(got.loc[:, ["x", "y"]], df) - np.testing.assert_array_equal(got["z"].compute().to_array(), pdcol) + 
np.testing.assert_array_equal(got["z"].compute().values_host, pdcol) @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"])