diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3623db5a283..69f6634b5c2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,15 @@ repos: language: system files: \.(cu|cuh|h|hpp|cpp|inl)$ args: ['-fallback-style=none'] + - repo: local + hooks: + - id: mypy + name: mypy + description: mypy + pass_filenames: false + entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf + language: system + types: [python] default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index a3c84ba1b72..3b027220032 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,8 @@ - PR #6929 Add `Index.set_names` api - PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support - PR #6885 Share `factorize` implementation with Index and cudf module - - PR #6775 Implement cudf.DateOffset for months +- PR #7039 Support contains() on lists of primitives ## Improvements diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 2534f857ee4..17599c6d74d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -29,6 +29,10 @@ FLAKE_RETVAL=$? FLAKE_CYTHON=`flake8 --config=python/.flake8.cython` FLAKE_CYTHON_RETVAL=$? +# Run mypy and get results/return code +MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf` +MYPY_CUDF_RETVAL=$? + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -66,6 +70,14 @@ else echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n" fi +if [ "$MYPY_CUDF_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n" + echo -e "$MYPY_CUDF" + echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: mypy style check\n\n" +fi + if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" echo -e "$CLANG_FORMAT" @@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh` HEADER_META_RETVAL=$? 
echo -e "$HEADER_META" -RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL) +RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 24882d9b3e2..b810b87111a 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 49675fe2154..b4e95bc6730 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 2917c2c3ce0..3b21f00ab16 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ea93c5eb279..c5f7bd34c25 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -34,6 +34,7 @@ requirements: run: - protobuf - python + - typing_extensions - pandas >=1.0,<1.2.0dev0 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0da16cd83b8..1d660e2cd74 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -123,7 +123,9 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp + - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp - test -f $PREFIX/include/cudf/merge.hpp @@ -170,6 +172,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp - test -f $PREFIX/include/cudf/strings/substring.hpp @@ -200,7 +203,6 @@ test: - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp - - test -f $PREFIX/include/cudf_test/scalar_utilities.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 19bde0519db..073f0d62c0a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -164,8 +164,8 @@ 
ConfigureBench(SEARCH_BENCH "${SEARCH_BENCH_SRC}") # - sort benchmark -------------------------------------------------------------------------------- set(SORT_BENCH_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cpp") ConfigureBench(SORT_BENCH "${SORT_BENCH_SRC}") diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 6006be505bc..d17e7b126c7 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -63,9 +63,9 @@ void BM_parq_write_varying_inout(benchmark::State& state) void BM_parq_write_varying_options(benchmark::State& state) { - auto const compression = static_cast(state.range(0)); - auto const enable_stats = static_cast(state.range(1)); - auto const output_metadata = state.range(2) != 0; + auto const compression = static_cast(state.range(0)); + auto const enable_stats = static_cast(state.range(1)); + auto const file_path = state.range(2) != 0 ? "unused_path.parquet" : ""; auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), @@ -82,8 +82,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .return_filemetadata(output_metadata) - .column_chunks_file_path("dummy_path.parquet"); + .column_chunks_file_path(file_path); cudf_io::write_parquet(options); } diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp index 3dd2c3782fa..b38dda4d17e 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,12 +72,11 @@ void PQ_write_chunked(benchmark::State& state) cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::chunked_parquet_writer_options opts = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info()); - auto writer_state = cudf_io::write_parquet_chunked_begin(opts); - std::for_each( - tables.begin(), tables.end(), [&writer_state](std::unique_ptr const& tbl) { - cudf_io::write_parquet_chunked(*tbl, writer_state); - }); - cudf_io::write_parquet_chunked_end(writer_state); + cudf_io::parquet_chunked_writer writer(opts); + std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr const& tbl) { + writer.write(*tbl); + }); + writer.close(); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); diff --git a/cpp/benchmarks/sort/sort_benchmark.cu b/cpp/benchmarks/sort/sort_benchmark.cpp similarity index 68% rename from cpp/benchmarks/sort/sort_benchmark.cu rename to cpp/benchmarks/sort/sort_benchmark.cpp index 2ba99eb53d9..89eea0f0ce9 100644 --- a/cpp/benchmarks/sort/sort_benchmark.cu +++ b/cpp/benchmarks/sort/sort_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include @@ -24,18 +22,17 @@ #include #include -#include - -#include "../common/generate_benchmark_input.hpp" -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include +#include +#include template class Sort : public cudf::benchmark { }; template -static void BM_sort(benchmark::State& state) +static void BM_sort(benchmark::State& state, bool nulls) { using Type = int; using column_wrapper = cudf::test::fixed_width_column_wrapper; @@ -44,16 +41,16 @@ static void BM_sort(benchmark::State& state) const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; - auto type_size = cudf::size_of(cudf::data_type(cudf::type_to_id())); // Create columns with values in the range [0,100) std::vector columns; columns.reserve(n_cols); std::generate_n(std::back_inserter(columns), n_cols, [&, n_rows]() { - auto valids = cudf::test::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? false : true; }); auto elements = cudf::test::make_counting_transform_iterator( 0, [&](auto row) { return distribution(generator); }); + if (!nulls) return column_wrapper(elements, elements + n_rows); + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); return column_wrapper(elements, elements + n_rows, valids); }); @@ -70,14 +67,16 @@ static void BM_sort(benchmark::State& state) } } -#define SORT_BENCHMARK_DEFINE(name, stable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \ - (::benchmark::State & st) { BM_sort(st); } \ - BENCHMARK_REGISTER_F(Sort, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ - ->UseManualTime() \ +#define SORT_BENCHMARK_DEFINE(name, stable, nulls) \ + BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \ + (::benchmark::State & st) { BM_sort(st, nulls); } \ + BENCHMARK_REGISTER_F(Sort, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -SORT_BENCHMARK_DEFINE(sort_stable, true) -SORT_BENCHMARK_DEFINE(sort_unstable, false) +SORT_BENCHMARK_DEFINE(unstable_no_nulls, false, false) +SORT_BENCHMARK_DEFINE(stable_no_nulls, true, false) +SORT_BENCHMARK_DEFINE(unstable, false, true) +SORT_BENCHMARK_DEFINE(stable, true, true) diff --git a/cpp/benchmarks/sort/sort_strings_benchmark.cu b/cpp/benchmarks/sort/sort_strings_benchmark.cpp similarity index 100% rename from cpp/benchmarks/sort/sort_strings_benchmark.cu rename to cpp/benchmarks/sort/sort_strings_benchmark.cpp diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 5ae1eaa2b9d..fbf68a20364 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -99,7 +99,6 @@ __launch_bounds__(block_size) __global__ { T* __restrict__ output_data = output_view.data(); cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - constexpr cudf::size_type leader_lane{0}; static_assert(block_size <= 1024, "Maximum thread block size exceeded"); int tid = threadIdx.x + per_thread * block_size * blockIdx.x; @@ -109,8 +108,8 @@ __launch_bounds__(block_size) __global__ __shared__ bool temp_valids[has_validity ? 
block_size + cudf::detail::warp_size : 1]; __shared__ T temp_data[block_size]; - cudf::size_type warp_valid_counts{0}; - cudf::size_type block_sum = 0; + cudf::size_type warp_valid_counts{0}; // running count of valid values across the `per_thread` loop below + cudf::size_type block_sum = 0; // running count of values passing the filter across the `per_thread` loop below // Note that since the maximum gridDim.x on all supported GPUs is as big as // cudf::size_type, this loop is sufficient to cover our maximum column size @@ -160,6 +159,8 @@ const int wid = threadIdx.x / cudf::detail::warp_size; const int lane = threadIdx.x % cudf::detail::warp_size; + cudf::size_type tmp_warp_valid_counts{0}; + if (tmp_block_sum > 0 && wid <= last_warp) { int valid_index = (block_offset / cudf::detail::warp_size) + wid; @@ -168,9 +169,8 @@ // Note the atomicOr's below assume that output_valid has been set to // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - warp_valid_counts = __popc(valid_warp); + tmp_warp_valid_counts = __popc(valid_warp); if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { @@ -182,19 +182,22 @@ if ((wid == 0) && (last_warp == num_warps)) { uint32_t valid_warp = __ballot_sync(0xffffffff, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { - warp_valid_counts += __popc(valid_warp); + tmp_warp_valid_counts += __popc(valid_warp); atomicOr(&output_valid[valid_index + num_warps], valid_warp); } } } + warp_valid_counts += tmp_warp_valid_counts; } block_offset += tmp_block_sum; tid += block_size; } // Compute total null_count for this block and add it to global count + constexpr cudf::size_type leader_lane{0}; cudf::size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); + if (threadIdx.x == 0) { // one thread computes and adds to null count atomicAdd(output_null_count, block_sum - block_valid_count); } diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 75a710d1d5c..e95d932920e 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -174,6 +174,21 @@ auto inline make_validity_iterator(column_device_view const& column) validity_accessor{column}); } +/** + * @brief Constructs a constant device iterator over a scalar's validity. + * + * Dereferencing the returned iterator returns a `bool`. + * + * For `p = *(iter + i)`, `p` is the validity of the scalar. + * + * @param scalar_value The scalar to iterate + * @return auto Iterator that returns scalar validity + */ +auto inline make_validity_iterator(scalar const& scalar_value) +{ + return thrust::make_constant_iterator(scalar_value.is_valid()); +} + /** * @brief value accessor for scalar with valid data. * The unary functor returns data of Element type of the scalar.
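The `make_validity_iterator(scalar const&)` overload added above is a thin wrapper over `thrust::make_constant_iterator`: it broadcasts a single validity flag so a scalar operand can flow through the same element-wise code path as a column operand. A minimal host-side sketch of the idea, standalone, with a plain `bool` standing in for `scalar_value.is_valid()`:

#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform.h>

#include <cassert>

int main()
{
  bool const scalar_is_valid = true;  // stands in for scalar_value.is_valid()
  auto scalar_validity       = thrust::make_constant_iterator(scalar_is_valid);

  bool column_validity[] = {true, false, true};  // per-row validity of a column operand
  bool out[3];

  // A binary op's output row is valid only when both inputs are valid; the
  // constant iterator lets the scalar side reuse the column-oriented loop.
  thrust::transform(thrust::seq,
                    column_validity,
                    column_validity + 3,
                    scalar_validity,  // every dereference yields the same flag
                    out,
                    [](bool lhs, bool rhs) { return lhs && rhs; });

  assert(out[0] && !out[1] && out[2]);
  return 0;
}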
diff --git a/cpp/include/cudf/detail/utilities/trie.cuh b/cpp/include/cudf/detail/utilities/trie.cuh index 5370c8678cf..77b184a4874 100644 --- a/cpp/include/cudf/detail/utilities/trie.cuh +++ b/cpp/include/cudf/detail/utilities/trie.cuh @@ -135,24 +135,22 @@ inline thrust::host_vector createSerializedTrie( * @return Boolean value, true if string is found, false otherwise */ __host__ __device__ inline bool serialized_trie_contains(device_span trie, - char const *key, - size_t key_len) + device_span key) { if (trie.data() == nullptr || trie.empty()) return false; - if (key_len == 0) return trie[0].is_leaf; - int curr_node = 1; - for (size_t i = 0; i < key_len; ++i) { + if (key.empty()) return trie.front().is_leaf; + auto curr_node = trie.begin() + 1; + for (auto curr_key = key.begin(); curr_key < key.end(); ++curr_key) { // Don't jump away from root node - if (i != 0) { curr_node += trie[curr_node].children_offset; } + if (curr_key != key.begin()) { curr_node += curr_node->children_offset; } // Search for the next character in the array of children nodes // Nodes are sorted - terminate search if the node is larger or equal - while (trie[curr_node].character != trie_terminating_character && - trie[curr_node].character < key[i]) { + while (curr_node->character != trie_terminating_character && curr_node->character < *curr_key) { ++curr_node; } // Could not find the next character, done with the search - if (trie[curr_node].character != key[i]) { return false; } + if (curr_node->character != *curr_key) { return false; } } // Even if the node is present, return true only if that node is at the end of a word - return trie[curr_node].is_leaf; + return curr_node->is_leaf; } diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 163d8c9d735..2c946dae748 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,14 +20,26 @@ #pragma once -#include - +#include +#include +#include #include +#include + +#include +#include namespace cudf { namespace io { + +// Forward declaration +class parquet_reader_options; +class parquet_writer_options; +class chunked_parquet_writer_options; + namespace detail { namespace parquet { + /** * @brief Class to read Parquet dataset data into columns. */ @@ -90,63 +102,54 @@ class writer { * * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior + * @param mode Option to write at once or in chunks * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ explicit writer(std::unique_ptr sink, parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); + SingleWriteMode mode = SingleWriteMode::YES, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Writes the dataset as per options provided. + * @brief Constructor for writer to handle chunked parquet options. 
* - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param return_filemetadata If true, return the raw file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @param int96_timestamps If true, write timestamps as INT96 values - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param sink The data sink to write the data to + * @param options Settings for controlling writing behavior for chunked writer + * @param mode Option to write at once or in chunks + * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ - std::unique_ptr> write( - table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - std::vector const& decimal_precision = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + explicit writer(std::unique_ptr<data_sink> sink, + chunked_parquet_writer_options const& options, + SingleWriteMode mode = SingleWriteMode::NO, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Begins the chunked/streamed write process. - * - * @param[in] pq_chunked_state Internal state maintained between chunks. + * @brief Destructor explicitly declared to avoid being inlined in the header */ - void write_chunked_begin(struct pq_chunked_state& state); + ~writer(); /** * @brief Writes a single subtable as part of a larger parquet file/table write. * * @param[in] table The table information to be written - * @param[in] pq_chunked_state Internal state maintained between chunks. */ - void write_chunk(table_view const& table, struct pq_chunked_state& state); + void write(table_view const& table); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] pq_chunked_state Internal state maintained between chunks. - * @param[in] return_filemetadata If true, return the raw file metadata * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata * - * @return A parquet-compatible blob that contains the data for all rowgroups in the list + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided; otherwise null.
*/ - std::unique_ptr> write_chunked_end( - struct pq_chunked_state& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + std::unique_ptr> close(std::string const& column_chunks_file_path = ""); /** * @brief Merges multiple metadata blobs returned by write_all into a single metadata blob diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp index 3c674985ef9..adb7078d96d 100644 --- a/cpp/include/cudf/io/detail/utils.hpp +++ b/cpp/include/cudf/io/detail/utils.hpp @@ -20,7 +20,7 @@ namespace cudf { namespace io { namespace detail { /** - * @brief Whether writer writes in chunks or at once + * @brief Whether writer writes in chunks or all at once */ enum class SingleWriteMode : bool { YES, NO }; } // namespace detail diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index a602fb2cfcf..cd3b7bf27da 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -395,8 +396,6 @@ class parquet_writer_options { table_view _table; // Optional associated metadata const table_metadata* _metadata = nullptr; - // Optionally return the raw parquet file metadata output - bool _return_filemetadata = false; // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata @@ -473,11 +472,6 @@ class parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } - /** - * @brief Returns `true` if metadata is required, `false` otherwise. - */ - bool is_enabled_return_filemetadata() const { return _return_filemetadata; } - /** * @brief Returns Column chunks file path to be set in the raw output metadata. */ @@ -509,13 +503,6 @@ class parquet_writer_options { */ void set_compression(compression_type compression) { _compression = compression; } - /** - * @brief Sets whether filemetadata is required or not. - * - * @param req Boolean value to enable/disable return of file metadata. - */ - void enable_return_filemetadata(bool req) { _return_filemetadata = req; } - /** * @brief Sets timestamp writing preferences. INT96 timestamps will be written * if `true` and TIMESTAMP_MICROS will be written if `false`. @@ -598,18 +585,6 @@ class parquet_writer_options_builder { return *this; } - /** - * @brief Sets whether filemetadata is required or not in parquet_writer_options. - * - * @param req Boolean value to enable/disable return of file metadata. - * @return this for chaining. - */ - parquet_writer_options_builder& return_filemetadata(bool req) - { - options._return_filemetadata = req; - return *this; - } - /** * @brief Sets column chunks file path to be set in the raw output metadata. * @@ -899,82 +874,77 @@ class chunked_parquet_writer_options_builder { }; /** - * @brief Forward declaration of anonymous chunked-writer state struct. + * @brief Merges multiple raw metadata blobs that were previously created by write_parquet + * into a single metadata blob + * + * @ingroup io_writers + * + * @param[in] metadata_list List of input file metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ -struct pq_chunked_state; +std::unique_ptr> merge_rowgroup_metadata( + const std::vector>>& metadata_list); /** - * @brief Begin the process of writing a parquet file in a chunked/stream form. 
+ * @brief Chunked parquet writer class to handle options and write tables in chunks. * - * The intent of the write_parquet_chunked_ path is to allow writing of an + * The intent of the parquet_chunked_writer is to allow writing of an * arbitrarily large / arbitrary number of rows to a parquet file in multiple passes. * * The following code snippet demonstrates how to write a single parquet file containing * one logical table by writing a series of individual cudf::tables. + * * @code * ... * std::string filepath = "dataset.parquet"; * cudf::io::chunked_parquet_writer_options options = * cudf::io::chunked_parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); * ... - * auto state = cudf::write_parquet_chunked_begin(options); - * cudf::write_parquet_chunked(table0, state); - * cudf::write_parquet_chunked(table1, state); - * ... - * cudf_write_parquet_chunked_end(state); - * @endcode - * - * @param[in] options Settings for controlling writing behavior. - * @param[in] mr Device memory resource to use for device memory allocation. - * - * @return pointer to an anonymous state structure storing information about the chunked write. - * this pointer must be passed to all subsequent write_parquet_chunked() and - * write_parquet_chunked_end() calls. + * cudf::io::parquet_chunked_writer writer(options); + * writer.write(table0); + * writer.write(table1); + * ... + * writer.close(); + * @endcode */ -std::shared_ptr write_parquet_chunked_begin( - chunked_parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +class parquet_chunked_writer { + public: + /** + * @brief Default constructor; this should never be used. + * It exists only to satisfy Cython. + */ + parquet_chunked_writer() = default; -/** - * @brief Write a single table as a subtable of a larger logical parquet file/table. - * - * All tables passed into multiple calls of this function must contain the same # of columns and - * have columns of the same type. - * - * @param[in] table The table data to be written. - * @param[in] state Opaque state information about the writer process. Must be the same pointer - * returned from write_parquet_chunked_begin(). - * @param[in] int96_timestamps Write out timestamps as INT96 type - */ -void write_parquet_chunked(table_view const& table, std::shared_ptr state); + /** + * @brief Constructor with chunked writer options + * + * @param[in] op Options used to write the tables + * @param[in] mr Device memory resource to use for device memory allocation + */ + parquet_chunked_writer( + chunked_parquet_writer_options const& op, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Finish writing a chunked/stream parquet file. - * - * @param[in] state Opaque state information about the writer process. Must be the same pointer - * returned from write_parquet_chunked_begin(). - * @param[in] return_filemetadata If true, return the raw file metadata. - * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata. - * - * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if - * requested in parquet_writer_options (empty blob otherwise). - */ -std::unique_ptr> write_parquet_chunked_end( - std::shared_ptr& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + /** + * @brief Writes a table to the output.
+ * + * @param[in] table Table to be written + * @return Reference to this writer, to allow chaining of write calls + */ + parquet_chunked_writer& write(table_view const& table); -/** - * @brief Merges multiple raw metadata blobs that were previously created by write_parquet - * into a single metadata blob - * - * @ingroup io_writers - * - * @param[in] metadata_list List of input file metadata - * @return A parquet-compatible blob that contains the data for all rowgroups in the list - */ -std::unique_ptr> merge_rowgroup_metadata( - const std::vector>>& metadata_list); + /** + * @brief Finishes the chunked/streamed write process. + * + * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided; otherwise null. + */ + std::unique_ptr<std::vector<uint8_t>> close(std::string const& column_chunks_file_path = ""); + + // Unique pointer to impl writer class + std::unique_ptr<cudf::io::detail::parquet::writer> writer; }; /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp new file mode 100644 index 00000000000..7cd40bb2f86 --- /dev/null +++ b/cpp/include/cudf/lists/contains.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_contains + * @{ + * @file + */ + +/** + * @brief Create a column of bool values indicating whether the specified scalar + * is an element of each row of a list column. + * + * The output column has as many elements as the input `lists` column. + * Output `column[i]` is set to true if the lists row `lists[i]` contains the value + * specified in `search_key`. Otherwise, it is set to false. + * + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The search key `search_key` is null + * 2. The list row `lists[i]` is null + * 3. The list row `lists[i]` does not contain the search key, and contains at least + * one null. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_key The scalar key to be looked up in each list row + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup + */ +std::unique_ptr<column> contains( + cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a column of bool values indicating whether the list rows of the first + * column contain the corresponding values in the second column + * + * The output column has as many elements as the input `lists` column. + * Output `column[i]` is set to true if the lists row `lists[i]` contains the value + * in `search_keys[i]`. Otherwise, it is set to false.
+ * + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The row `search_keys[i]` is null + * 2. The list row `lists[i]` is null + * 3. The list row `lists[i]` does not contain `search_keys[i]`, and contains at least + * one null. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_keys Column of elements to be looked up in each list row + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup + */ +std::unique_ptr<column> contains( + cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp new file mode 100644 index 00000000000..6b802d2ad5e --- /dev/null +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_elements + * @{ + * @file + */ + +/** + * @brief Returns a numeric column containing the number of rows in + * each list element in the given lists column. + * + * The output column will have the same number of rows as the + * input lists column. Each `output[i]` will be `input[i].size()`. + * + * @code{.pseudo} + * l = { {1, 2, 3}, {4}, {5, 6} } + * r = count_elements(l) + * r is now {3, 1, 2} + * @endcode + * + * Any null input element will result in a corresponding null entry + * in the output column. + * + * @param input Input lists column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New INT32 column with the number of elements for each row. + */ +std::unique_ptr<column> count_elements( + lists_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of lists_elements group + +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 38708d4878e..824b10ced83 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -112,12 +112,82 @@ class list_device_view { */ CUDA_DEVICE_CALLABLE lists_column_device_view const& get_column() const { return lists_column; } + template <typename T> + struct pair_accessor; + + template <typename T> + using const_pair_iterator = + thrust::transform_iterator<pair_accessor<T>, thrust::counting_iterator<size_type>>; + + /** + * @brief Fetcher for a pair iterator to the first element in the list_device_view. + * + * Dereferencing the returned iterator yields a `thrust::pair<T, bool>`. + * + * If the element at index `i` is valid, then for `p = iter[i]`, + * 1. `p.first` is the value of the element at `i` + * 2. `p.second == true` + * + * If the element at index `i` is null, + * 1. `p.first` is undefined + * 2. `p.second == false` + */ + template <typename T> + CUDA_DEVICE_CALLABLE const_pair_iterator<T> pair_begin() const + { + return const_pair_iterator<T>{thrust::counting_iterator<size_type>(0), pair_accessor<T>{*this}}; + } + + /** + * @brief Fetcher for a pair iterator to one position past the last element in the + * list_device_view. + */ + template <typename T> + CUDA_DEVICE_CALLABLE const_pair_iterator<T> pair_end() const + { + return const_pair_iterator<T>{thrust::counting_iterator<size_type>(size()), + pair_accessor<T>{*this}}; + } + private: lists_column_device_view const& lists_column; size_type _row_index{}; // Row index in the Lists column vector. size_type _size{}; // Number of elements in *this* list row. size_type begin_offset; // Offset in list_column_device_view where this list begins. + + /** + * @brief Pair accessor for elements in a `list_device_view` + * + * This unary functor returns a pair of: + * 1. data element at a specified index + * 2. boolean validity flag for that element + * + * @tparam T The element-type of the list row + */ + template <typename T> + struct pair_accessor { + list_device_view const& list; + + /** + * @brief Constructor + * + * @param _list The `list_device_view` whose rows are being accessed. + */ + explicit CUDA_HOST_DEVICE_CALLABLE pair_accessor(list_device_view const& _list) : list{_list} {} + + /** + * @brief Accessor for the {data, validity} pair at the specified index + * + * @param i Index into the list_device_view + * @return A pair of data element and its validity flag. + */ + CUDA_DEVICE_CALLABLE + thrust::pair<T, bool> operator()(cudf::size_type i) const + { + return {list.element<T>(i), !list.is_null(i)}; + } + }; }; } // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 4561554a0f5..29c9fa2e720 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,6 +97,48 @@ std::unique_ptr<column> byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that null lists are completely removed from the output, while nulls and + * empty lists that occur inside a list row are pulled out as rows and remain. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with the column at `explode_column_idx` exploded.
+ */ +std::unique_ptr<table> explode( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 2f4a54e8143..ded833f4ca0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -430,11 +430,7 @@ class string_scalar : public scalar { string_scalar(value_type const& source, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar(data_type(type_id::STRING), is_valid), - _data(source.data(), source.size_bytes(), stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object from string_view in device memory @@ -448,10 +444,7 @@ class string_scalar : public scalar { string_scalar(rmm::device_scalar<value_type>& data, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : string_scalar(data.value(stream), is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Implicit conversion operator to get the value of the scalar in a host std::string @@ -470,10 +463,7 @@ * * @param stream CUDA stream used for device memory operations. */ - value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - return value_type{data(), size()}; - } + value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns the size of the string in bytes diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index aa3cd932f4f..d1b542a6cf2 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include /** diff --git a/cpp/include/cudf/strings/detail/sorting.cuh b/cpp/include/cudf/strings/detail/sorting.cuh deleted file mode 100644 index d23c6d3d4f4..00000000000 --- a/cpp/include/cudf/strings/detail/sorting.cuh +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * @brief Comparator for sorting strings column rows. - */ -struct sort_strings_comparator { - __device__ bool operator()(size_type lhs, size_type rhs) - { - if (has_nulls) { - bool lhs_null{d_column.is_null(lhs)}; - bool rhs_null{d_column.is_null(rhs)}; - if (lhs_null || rhs_null) { - if (!ascending) thrust::swap(lhs_null, rhs_null); - return null_prec == cudf::null_order::BEFORE ? !rhs_null : !lhs_null; - } - } - auto const lhs_str = d_column.element(lhs); - auto const rhs_str = d_column.element(rhs); - auto const cmp = lhs_str.compare(rhs_str); - return ascending ? (cmp < 0) : (cmp > 0); - } - column_device_view const d_column; - bool has_nulls; - bool ascending; - cudf::null_order null_prec; -}; - -/** - * @brief Returns an indices column that is the sorted rows of the - * input strings column. - * - * @param strings Strings instance for this operation. - * @param sort_order Sort strings in ascending or descending order. - * @param null_precedence Sort nulls to the beginning or the end of the new column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Indices of the sorted rows. - */ -template -std::unique_ptr sorted_order( - strings_column_view const strings, - cudf::order sort_order = cudf::order::ASCENDING, - cudf::null_order null_precedence = cudf::null_order::BEFORE, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - std::unique_ptr sorted_indices = cudf::make_numeric_column( - data_type(type_to_id()), strings.size(), mask_state::UNALLOCATED, stream, mr); - auto d_indices = sorted_indices->mutable_view(); - thrust::sequence( - rmm::exec_policy(stream), d_indices.begin(), d_indices.end(), 0); - - sort_strings_comparator comparator{ - d_column, strings.has_nulls(), sort_order == cudf::order::ASCENDING, null_precedence}; - if (stable) { - thrust::stable_sort(rmm::exec_policy(stream), - d_indices.begin(), - d_indices.end(), - comparator); - } else { - thrust::sort(rmm::exec_policy(stream), - d_indices.begin(), - d_indices.end(), - comparator); - } - return sorted_indices; -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index aca719ad978..ba903c87485 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 802312d91b1..9a57ac1e20d 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,316 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -#include -#include -#include -#include +#include -/** - * @file - * @brief Class definition for cudf::string_view. - */ - -namespace cudf { +#include +#include +#include -using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes - -/** - * @brief A non-owning, immutable view of device data that is a variable length - * char array representing a UTF-8 string. - * - * @ingroup strings_classes - * - * The caller must maintain the device memory for the lifetime of this instance. - * - * It provides a simple wrapper and string operations for an individual string - * within a column of strings. - */ -class string_view { - public: - /** - * @brief Default constructor represents an empty string. - */ - __host__ __device__ string_view(); - - /** - * @brief Create instance from existing device char array. - * - * @param data Device char array encoded in UTF8. - * @param bytes Number of bytes in data array. - */ - __host__ __device__ string_view(const char* data, size_type bytes); - - string_view(const string_view&) = default; - string_view(string_view&&) = default; - ~string_view() = default; - string_view& operator=(const string_view&) = default; - string_view& operator=(string_view&&) = default; - - /** - * @brief Return the number of bytes in this string - */ - __host__ __device__ size_type size_bytes() const; - /** - * @brief Return the number of characters in this string - */ - __device__ size_type length() const; - /** - * @brief Return a pointer to the internal device array - */ - __host__ __device__ const char* data() const; - - /** - * @brief Return true if string has no characters - */ - __host__ __device__ bool empty() const; - - /** - * @brief Handy iterator for navigating through encoded characters. 
- */ - class const_iterator { - public: - using difference_type = ptrdiff_t; - using value_type = char_utf8; - using reference = char_utf8&; - using pointer = char_utf8*; - using iterator_category = std::input_iterator_tag; - __device__ const_iterator(const string_view& str, size_type pos); - const_iterator(const const_iterator& mit) = default; - const_iterator(const_iterator&& mit) = default; - const_iterator& operator=(const const_iterator&) = default; - const_iterator& operator=(const_iterator&&) = default; - __device__ const_iterator& operator++(); - __device__ const_iterator operator++(int); - __device__ const_iterator& operator+=(difference_type); - __device__ const_iterator operator+(difference_type); - __device__ const_iterator& operator--(); - __device__ const_iterator operator--(int); - __device__ const_iterator& operator-=(difference_type); - __device__ const_iterator operator-(difference_type); - __device__ bool operator==(const const_iterator&) const; - __device__ bool operator!=(const const_iterator&) const; - __device__ bool operator<(const const_iterator&) const; - __device__ bool operator<=(const const_iterator&) const; - __device__ bool operator>(const const_iterator&) const; - __device__ bool operator>=(const const_iterator&) const; - __device__ char_utf8 operator*() const; - __device__ size_type position() const; - __device__ size_type byte_offset() const; - - private: - const char* p{}; - size_type bytes{}; - size_type char_pos{}; - size_type byte_pos{}; - }; - - /** - * @brief Return new iterator pointing to the beginning of this string - */ - __device__ const_iterator begin() const; - /** - * @brief Return new iterator pointing past the end of this string - */ - __device__ const_iterator end() const; - - /** - * @brief Return single UTF-8 character at the given character position - * - * @param pos Character position - */ - __device__ char_utf8 operator[](size_type pos) const; - /** - * @brief Return the byte offset from data() for a given character position - * - * @param pos Character position - */ - __device__ size_type byte_offset(size_type pos) const; - - /** - * @brief Comparing target string with this string. Each character is compared - * as a UTF-8 code-point value. - * - * @param str Target string to compare with this string. - * @return 0 If they compare equal. - * <0 Either the value of the first character of this string that does - * not match is lower in the arg string, or all compared characters - * match but the arg string is shorter. - * >0 Either the value of the first character of this string that does - * not match is greater in the arg string, or all compared characters - * match but the arg string is longer. - */ - __device__ int compare(const string_view& str) const; - /** - * @brief Comparing target string with this string. Each character is compared - * as a UTF-8 code-point value. - * - * @param str Target string to compare with this string. - * @param bytes Number of bytes in str. - * @return 0 If they compare equal. - * <0 Either the value of the first character of this string that does - * not match is lower in the arg string, or all compared characters - * match but the arg string is shorter. - * >0 Either the value of the first character of this string that does - * not match is greater in the arg string, or all compared characters - * match but the arg string is longer. - */ - __device__ int compare(const char* str, size_type bytes) const; - - /** - * @brief Returns true if rhs matches this string exactly. 
- */ - __device__ bool operator==(const string_view& rhs) const; - /** - * @brief Returns true if rhs does not match this string. - */ - __device__ bool operator!=(const string_view& rhs) const; - /** - * @brief Returns true if this string is ordered before rhs. - */ - __device__ bool operator<(const string_view& rhs) const; - /** - * @brief Returns true if rhs is ordered before this string. - */ - __device__ bool operator>(const string_view& rhs) const; - /** - * @brief Returns true if this string matches or is ordered before rhs. - */ - __device__ bool operator<=(const string_view& rhs) const; - /** - * @brief Returns true if rhs matches or is ordered before this string. - */ - __device__ bool operator>=(const string_view& rhs) const; - - /** - * @brief Returns the character position of the first occurrence where the - * argument str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search within this string. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if str is not found in this string. - */ - __device__ size_type find(const string_view& str, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the first occurrence where the - * array str is found in this string within the character range [pos,pos+n). - * - * @param str Target array to search within this string. - * @param bytes Number of bytes in str. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type find(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; - /** - * @brief Returns the character position of the first occurrence where - * character is found in this string within the character range [pos,pos+n). - * - * @param character Single encoded character. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where the - * argument str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search within this string. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(const string_view& str, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where the - * array str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search with this string. - * @param bytes Number of bytes in str. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. 
- * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where - * character is found in this string within the character range [pos,pos+n). - * - * @param character Single encoded character. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const; - - /** - * @brief Return a sub-string of this string. The original string and device - * memory must still be maintained for the lifetime of the returned instance. - * - * @param start Character position to start the sub-string. - * @param length Number of characters from start to include in the sub-string. - * @return New instance pointing to a subset of the characters within this instance. - */ - __device__ string_view substr(size_type start, size_type length) const; - - private: - const char* _data{}; ///< Pointer to device memory contain char array for this string - size_type _bytes{}; ///< Number of bytes in _data for this string - mutable size_type _length{}; ///< Number of characters in this string (computed) - mutable int8_t _char_width{}; ///< Number of bytes per character if uniform width (computed) - - /** - * @brief Return the character position of the given byte offset. - * - * @param bytepos Byte position from start of _data. - * @return The character position for the specified byte. - */ - __device__ size_type character_offset(size_type bytepos) const; -}; +// This file should only include device code logic. +// Host-only or host/device code should be defined in the string_view.hpp header file. +namespace cudf { namespace strings { namespace detail { -/** - * @brief Returns the number of bytes in the specified character. - * - * @param character Single character - * @return Number of bytes - */ -__host__ __device__ size_type bytes_in_char_utf8(char_utf8 character); - -/** - * @brief Convert a char array into a char_utf8 value. - * - * @param str String containing encoded char bytes. - * @param[out] character Single char_utf8 value. - * @return The number of bytes in the character - */ -__host__ __device__ size_type to_char_utf8(const char* str, char_utf8& character); - -/** - * @brief Place a char_utf8 value into a char array. - * - * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded characer. - * @return The number of bytes in the character - */ -__host__ __device__ size_type from_char_utf8(char_utf8 character, char* str); /** * @brief Return the number of UTF-8 characters in this provided char array. @@ -331,22 +36,338 @@ __host__ __device__ size_type from_char_utf8(char_utf8 character, char* str); * @param bytes Number of bytes in str. * @return The number of characters in the array. */ -__host__ __device__ size_type characters_in_string(const char* str, size_type bytes); - -/** - * @brief This will return true if passed the first byte of a UTF-8 character. 
- *
- * @param byte Any byte from a valid UTF-8 character
- * @return true if this the first byte of the character
- */
-constexpr bool is_begin_utf8_char(uint8_t byte)
+__device__ inline size_type characters_in_string(const char* str, size_type bytes)
 {
-  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
-  return (byte & 0xC0) != 0x80;
+  if ((str == 0) || (bytes == 0)) return 0;
+  auto ptr = reinterpret_cast<uint8_t const*>(str);
+  return thrust::count_if(
+    thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); });
 }
-
 }  // namespace detail
 }  // namespace strings
-}  // namespace cudf
-#include "./string_view.inl"
+
+__device__ inline size_type string_view::length() const
+{
+  if (_length == UNKNOWN_STRING_LENGTH)
+    _length = strings::detail::characters_in_string(_data, _bytes);
+  if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) {
+    uint8_t const* ptr = reinterpret_cast<uint8_t const*>(data());
+    auto const first   = strings::detail::bytes_in_utf8_byte(*ptr);
+    // see if they are all the same width
+    _char_width = (thrust::find_if(thrust::seq,
+                                   ptr,
+                                   ptr + size_bytes(),
+                                   [first](auto ch) {
+                                     auto width = strings::detail::bytes_in_utf8_byte(ch);
+                                     return (width != 0) && (width != first);
+                                   })) == (ptr + size_bytes())
+                    ? first
+                    : VARIABLE_CHAR_WIDTH;
+  }
+  return _length;
+}
+
+// this custom iterator knows about UTF8 encoding
+__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
+  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
+{
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
+{
+  if (byte_pos < bytes)
+    byte_pos += strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos]));
+  ++char_pos;
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
+{
+  string_view::const_iterator tmp(*this);
+  operator++();
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
+  string_view::const_iterator::difference_type offset)
+{
+  const_iterator tmp(*this);
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? ++tmp : --tmp;
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
+  string_view::const_iterator::difference_type offset)
+{
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? operator++() : operator--();
+  return *this;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
+{
+  if (byte_pos > 0)
+    while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
+      ;
+  --char_pos;
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
+{
+  string_view::const_iterator tmp(*this);
+  operator--();
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
+  string_view::const_iterator::difference_type offset)
+{
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? operator--() : operator++();
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
+  string_view::const_iterator::difference_type offset)
+{
+  const_iterator tmp(*this);
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? --tmp : ++tmp;
+  return tmp;
+}
+
+__device__ inline bool string_view::const_iterator::operator==(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos == rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator!=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p != rhs.p) || (char_pos != rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator<(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos < rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator<=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos <= rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator>(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos > rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator>=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos >= rhs.char_pos);
+}
+
+__device__ inline char_utf8 string_view::const_iterator::operator*() const
+{
+  char_utf8 chr = 0;
+  strings::detail::to_char_utf8(p + byte_offset(), chr);
+  return chr;
+}
+
+__device__ inline size_type string_view::const_iterator::position() const { return char_pos; }
+
+__device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
+
+__device__ inline string_view::const_iterator string_view::begin() const
+{
+  return const_iterator(*this, 0);
+}
+
+__device__ inline string_view::const_iterator string_view::end() const
+{
+  return const_iterator(*this, length());
+}
+
+__device__ inline char_utf8 string_view::operator[](size_type pos) const
+{
+  size_type offset = byte_offset(pos);
+  if (offset >= _bytes) return 0;
+  char_utf8 chr = 0;
+  strings::detail::to_char_utf8(data() + offset, chr);
+  return chr;
+}
+
+__device__ inline size_type string_view::byte_offset(size_type pos) const
+{
+  size_type offset = 0;
+  const char* sptr = _data;
+  const char* eptr = sptr + _bytes;
+  if (_char_width > 0) return pos * _char_width;
+  while ((pos > 0) && (sptr < eptr)) {
+    size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*sptr++));
+    if (charbytes) --pos;
+    offset += charbytes;
+  }
+  return offset;
+}
+
+__device__ inline int string_view::compare(const string_view& in) const
+{
+  return compare(in.data(), in.size_bytes());
+}
+
+__device__ inline int string_view::compare(const char* data, size_type bytes) const
+{
+  size_type const len1      = size_bytes();
+  const unsigned char* ptr1 = reinterpret_cast<const unsigned char*>(this->data());
+  const unsigned char* ptr2 = reinterpret_cast<const unsigned char*>(data);
+  size_type idx             = 0;
+  for (; (idx < len1) && (idx < bytes); ++idx) {
+    if (*ptr1 != *ptr2) return static_cast<int>(*ptr1) - static_cast<int>(*ptr2);
+    ++ptr1;
+    ++ptr2;
+  }
+  if (idx < len1) return 1;
+  if (idx < bytes) return -1;
+  return 0;
+}
+
+__device__ inline bool string_view::operator==(const string_view& rhs) const
+{
+  return compare(rhs) == 0;
+}
+
+__device__ inline bool string_view::operator!=(const string_view& rhs) const
+{
+  return compare(rhs) != 0;
+}
+
+__device__ inline bool string_view::operator<(const string_view& rhs) const
+{
+  return compare(rhs) < 0;
+}
+
+__device__ inline bool string_view::operator>(const string_view& rhs) const
+{
+  return compare(rhs) > 0;
+}
+
+__device__ inline bool string_view::operator<=(const string_view& rhs) const
+{
+  int rc = compare(rhs);
+  return (rc == 0) || (rc < 0);
+}
+
+__device__ inline bool string_view::operator>=(const string_view& rhs) const
+{
+  int rc = compare(rhs);
+  return (rc == 0) || (rc > 0);
+}
+
+__device__ inline size_type string_view::find(const string_view& str,
+                                              size_type pos,
+                                              size_type count) const
+{
+  return find(str.data(), str.size_bytes(), pos, count);
+}
+
+__device__ inline size_type string_view::find(const char* str,
+                                              size_type bytes,
+                                              size_type pos,
+                                              size_type count) const
+{
+  const char* sptr = data();
+  if (!str || !bytes) return -1;
+  size_type nchars = length();
+  if (count < 0) count = nchars;
+  size_type end = pos + count;
+  if (end < 0 || end > nchars) end = nchars;
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(end);
+
+  size_type len2 = bytes;
+  size_type len1 = (epos - spos) - len2 + 1;
+
+  const char* ptr1 = sptr + spos;
+  const char* ptr2 = str;
+  for (size_type idx = 0; idx < len1; ++idx) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
+    if (match) return character_offset(idx + spos);
+    ptr1++;
+  }
+  return -1;
+}
+
+__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
+{
+  char str[sizeof(char_utf8)];
+  size_type chwidth = strings::detail::from_char_utf8(chr, str);
+  return find(str, chwidth, pos, count);
+}
+
+__device__ inline size_type string_view::rfind(const string_view& str,
+                                               size_type pos,
+                                               size_type count) const
+{
+  return rfind(str.data(), str.size_bytes(), pos, count);
+}
+
+__device__ inline size_type string_view::rfind(const char* str,
+                                               size_type bytes,
+                                               size_type pos,
+                                               size_type count) const
+{
+  const char* sptr = data();
+  if (!str || !bytes) return -1;
+  size_type nchars = length();
+  size_type end    = pos + count;
+  if (end < 0 || end > nchars) end = nchars;
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(end);
+
+  size_type len2 = bytes;
+  size_type len1 = (epos - spos) - len2 + 1;
+
+  const char* ptr1 = sptr + epos - len2;
+  const char* ptr2 = str;
+  for (int idx = 0; idx < len1; ++idx) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
+    if (match) return character_offset(epos - len2 - idx);
+    ptr1--;  // go backwards
+  }
+  return -1;
+}
+
+__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
+{
+  char str[sizeof(char_utf8)];
+  size_type chwidth = strings::detail::from_char_utf8(chr, str);
+  return rfind(str, chwidth, pos, count);
+}
+
+// parameters are character position values
+__device__ inline string_view string_view::substr(size_type pos, size_type length) const
+{
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(pos + length);
+  if (epos > size_bytes()) epos = size_bytes();
+  if (spos >= epos) return string_view("", 0);
+  return string_view(data() + spos, epos - spos);
+}
+
+__device__ inline size_type string_view::character_offset(size_type bytepos) const
+{
+  if (_char_width > 0) return bytepos / _char_width;
+  return strings::detail::characters_in_string(data(), bytepos);
+}
+
+}  // namespace cudf
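Editor's note: the refactored characters_in_string above counts the bytes that start a UTF-8 character (any byte whose top two bits are not the 10xxxxxx continuation pattern). The following is a minimal, standalone host-side C++ sketch of that counting rule; it is not part of the patch, and count_characters is a local name for illustration only.

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // A byte begins a UTF-8 character unless it matches 10xxxxxx (a continuation byte).
    constexpr bool is_begin_utf8_char(uint8_t byte) { return (byte & 0xC0) != 0x80; }

    // Count characters by counting lead bytes, mirroring the device-side logic above.
    int count_characters(const char* str, size_t bytes)
    {
      int chars = 0;
      for (size_t i = 0; i < bytes; ++i)
        chars += is_begin_utf8_char(static_cast<uint8_t>(str[i])) ? 1 : 0;
      return chars;
    }

    int main()
    {
      const char* s = "caf\xC3\xA9";  // "café": 5 bytes, 4 characters
      std::cout << count_characters(s, std::strlen(s)) << '\n';  // prints 4
    }
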
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
new file mode 100644
index 00000000000..9c42c216791
--- /dev/null
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
+/**
+ * @file
+ * @brief Class definition for cudf::string_view.
+ */
+
+namespace cudf {
+
+using char_utf8 = uint32_t;  ///< UTF-8 characters are 1-4 bytes
+
+/**
+ * @brief The string length is initialized to this value as a place-holder.
+ *
+ * The number of characters in a string is computed on-demand.
+ */
+constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};
+
+/**
+ * @brief The char width is initialized to this value as a place-holder.
+ *
+ * The byte-width of the characters in a string is computed on-demand.
+ */
+constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};
+
+/**
+ * @brief This value is assigned to the _char_width member if the string
+ * contains characters of different widths.
+ */
+constexpr int8_t VARIABLE_CHAR_WIDTH{0};
+
+/**
+ * @brief A non-owning, immutable view of device data that is a variable length
+ * char array representing a UTF-8 string.
+ *
+ * @ingroup strings_classes
+ *
+ * The caller must maintain the device memory for the lifetime of this instance.
+ *
+ * This may be used to wrap a device pointer and size but any member function
+ * that requires accessing the device memory must be called from a kernel.
+ */
+class string_view {
+ public:
+  /**
+   * @brief Return the number of bytes in this string
+   */
+  CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const { return _bytes; }
+  /**
+   * @brief Return the number of characters in this string
+   */
+  CUDA_DEVICE_CALLABLE size_type length() const;
+  /**
+   * @brief Return a pointer to the internal device array
+   */
+  CUDA_HOST_DEVICE_CALLABLE const char* data() const { return _data; }
+
+  /**
+   * @brief Return true if string has no characters
+   */
+  CUDA_HOST_DEVICE_CALLABLE bool empty() const { return size_bytes() == 0; }
+
+  /**
+   * @brief Handy iterator for navigating through encoded characters.
+   */
+  class const_iterator {
+   public:
+    using difference_type   = ptrdiff_t;
+    using value_type        = char_utf8;
+    using reference         = char_utf8&;
+    using pointer           = char_utf8*;
+    using iterator_category = std::input_iterator_tag;
+    CUDA_DEVICE_CALLABLE const_iterator(const string_view& str, size_type pos);
+    const_iterator(const const_iterator& mit) = default;
+    const_iterator(const_iterator&& mit)      = default;
+    const_iterator& operator=(const const_iterator&) = default;
+    const_iterator& operator=(const_iterator&&) = default;
+    CUDA_DEVICE_CALLABLE const_iterator& operator++();
+    CUDA_DEVICE_CALLABLE const_iterator operator++(int);
+    CUDA_DEVICE_CALLABLE const_iterator& operator+=(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator operator+(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator& operator--();
+    CUDA_DEVICE_CALLABLE const_iterator operator--(int);
+    CUDA_DEVICE_CALLABLE const_iterator& operator-=(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator operator-(difference_type);
+    CUDA_DEVICE_CALLABLE bool operator==(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator!=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator<(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator<=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator>(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator>=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE char_utf8 operator*() const;
+    CUDA_DEVICE_CALLABLE size_type position() const;
+    CUDA_DEVICE_CALLABLE size_type byte_offset() const;
+
+   private:
+    const char* p{};
+    size_type bytes{};
+    size_type char_pos{};
+    size_type byte_pos{};
+  };
+
+  /**
+   * @brief Return new iterator pointing to the beginning of this string
+   */
+  CUDA_DEVICE_CALLABLE const_iterator begin() const;
+  /**
+   * @brief Return new iterator pointing past the end of this string
+   */
+  CUDA_DEVICE_CALLABLE const_iterator end() const;
+
+  /**
+   * @brief Return single UTF-8 character at the given character position
+   *
+   * @param pos Character position
+   */
+  CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const;
+  /**
+   * @brief Return the byte offset from data() for a given character position
+   *
+   * @param pos Character position
+   */
+  CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const;
+
+  /**
+   * @brief Compares the target string with this string. Each character is compared
+   * as a UTF-8 code-point value.
+   *
+   * @param str Target string to compare with this string.
+   * @return 0  If they compare equal.
+   *         <0 Either the value of the first character of this string that does
+   *            not match is lower in the arg string, or all compared characters
+   *            match but the arg string is shorter.
+   *         >0 Either the value of the first character of this string that does
+   *            not match is greater in the arg string, or all compared characters
+   *            match but the arg string is longer.
+   */
+  CUDA_DEVICE_CALLABLE int compare(const string_view& str) const;
+  /**
+   * @brief Compares the target string with this string. Each character is compared
+   * as a UTF-8 code-point value.
+   *
+   * @param str Target string to compare with this string.
+   * @param bytes Number of bytes in str.
+   * @return 0  If they compare equal.
+   *         <0 Either the value of the first character of this string that does
+   *            not match is lower in the arg string, or all compared characters
+   *            match but the arg string is shorter.
+   *         >0 Either the value of the first character of this string that does
+   *            not match is greater in the arg string, or all compared characters
+   *            match but the arg string is longer.
+   */
+  CUDA_DEVICE_CALLABLE int compare(const char* str, size_type bytes) const;
+
+  /**
+   * @brief Returns true if rhs matches this string exactly.
+   */
+  CUDA_DEVICE_CALLABLE bool operator==(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs does not match this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator!=(const string_view& rhs) const;
+  /**
+   * @brief Returns true if this string is ordered before rhs.
+   */
+  CUDA_DEVICE_CALLABLE bool operator<(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs is ordered before this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator>(const string_view& rhs) const;
+  /**
+   * @brief Returns true if this string matches or is ordered before rhs.
+   */
+  CUDA_DEVICE_CALLABLE bool operator<=(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs matches or is ordered before this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator>=(const string_view& rhs) const;
+
+  /**
+   * @brief Returns the character position of the first occurrence where the
+   * argument str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if str is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(const string_view& str,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the first occurrence where the
+   * array str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target array to search within this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(const char* str,
+                                      size_type bytes,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the first occurrence where
+   * character is found in this string within the character range [pos,pos+n).
+   *
+   * @param character Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(char_utf8 character,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where the
+   * argument str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(const string_view& str,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where the
+   * array str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search with this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(const char* str,
+                                       size_type bytes,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where
+   * character is found in this string within the character range [pos,pos+n).
+   *
+   * @param character Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(char_utf8 character,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+
+  /**
+   * @brief Return a sub-string of this string. The original string and device
+   * memory must still be maintained for the lifetime of the returned instance.
+   *
+   * @param start Character position to start the sub-string.
+   * @param length Number of characters from start to include in the sub-string.
+   * @return New instance pointing to a subset of the characters within this instance.
+   */
+  CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const;
+
+  /**
+   * @brief Default constructor represents an empty string.
+   */
+  CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {}
+
+  /**
+   * @brief Create instance from existing device char array.
+   *
+   * @param data Device char array encoded in UTF8.
+   * @param bytes Number of bytes in data array.
+   */
+  CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes)
+    : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
+  {
+  }
+
+  string_view(const string_view&) = default;
+  string_view(string_view&&)      = default;
+  ~string_view()                  = default;
+  string_view& operator=(const string_view&) = default;
+  string_view& operator=(string_view&&) = default;
+
+ private:
+  const char* _data{};           ///< Pointer to device memory containing the char array for this string
+  size_type _bytes{};            ///< Number of bytes in _data for this string
+  mutable size_type _length{};   ///< Number of characters in this string (computed)
+  mutable int8_t _char_width{};  ///< Number of bytes per character if uniform width (computed)
+
+  /**
+   * @brief Return the character position of the given byte offset.
+   *
+   * @param bytepos Byte position from start of _data.
+   * @return The character position for the specified byte.
+   */
+  CUDA_DEVICE_CALLABLE size_type character_offset(size_type bytepos) const;
+};
+
+namespace strings {
+namespace detail {
+
+/**
+ * @brief This will return true if passed the first byte of a UTF-8 character.
+ *
+ * @param byte Any byte from a valid UTF-8 character
+ * @return true if this is the first byte of the character
+ */
+constexpr bool is_begin_utf8_char(uint8_t byte)
+{
+  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
+  return (byte & 0xC0) != 0x80;
+}
+
+/**
+ * @brief Returns the number of bytes in the specified character.
+ *
+ * @param character Single character
+ * @return Number of bytes
+ */
+constexpr size_type bytes_in_char_utf8(char_utf8 character)
+{
+  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
+         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
+         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
+}
+
+/**
+ * @brief Returns the number of bytes used to represent the provided byte.
+ *
+ * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
+ * single character. For example, for the two-byte 0xC3A8 single character,
+ * the first byte would return 2 and the second byte would return 0.
+ *
+ * @param byte Byte from an encoded character.
+ * @return Number of bytes.
+ */
+constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+{
+  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
+         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
+         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
+         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte
+}
+
+/**
+ * @brief Convert a char array into a char_utf8 value.
+ *
+ * @param str String containing encoded char bytes.
+ * @param[out] character Single char_utf8 value.
+ * @return The number of bytes in the character
+ */
+CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& character)
+{
+  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
+
+  character = static_cast<char_utf8>(*str++) & 0xFF;
+  if (chr_width > 1) {
+    character = character << 8;
+    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 8;
+    if (chr_width > 2) {
+      character = character << 8;
+      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 16;
+      if (chr_width > 3) {
+        character = character << 8;
+        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 24;
+      }
+    }
+  }
+  return chr_width;
+}
+
+/**
+ * @brief Place a char_utf8 value into a char array.
+ *
+ * @param character Single character
+ * @param[out] str Allocated char array with enough space to hold the encoded character.
+ * @return The number of bytes in the character
+ */
+CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str)
+{
+  size_type const chr_width = bytes_in_char_utf8(character);
+  for (size_type idx = 0; idx < chr_width; ++idx) {
+    str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
+    character                = character >> 8;
+  }
+  return chr_width;
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
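Editor's note: the new header packs the raw UTF-8 bytes of one character into a char_utf8 (uint32_t), most significant byte first. Below is a minimal standalone C++ sketch of that round-trip, mirroring the bytes_in_char_utf8/from_char_utf8 logic above; the names bytes_in_char and unpack are local to the example, not cudf API.

    #include <cstdint>
    #include <iostream>

    using char_utf8 = uint32_t;

    // Width of an encoded character: one byte per occupied byte of the packed value.
    constexpr int bytes_in_char(char_utf8 chr)
    {
      return 1 + ((chr & 0x0000FF00u) > 0) + ((chr & 0x00FF0000u) > 0) + ((chr & 0xFF000000u) > 0);
    }

    // Write the packed bytes back out, most significant byte first.
    int unpack(char_utf8 chr, char* out)
    {
      int const width = bytes_in_char(chr);
      for (int i = 0; i < width; ++i) {
        out[width - i - 1] = static_cast<char>(chr & 0xFF);
        chr >>= 8;
      }
      return width;
    }

    int main()
    {
      char buf[4];
      int n = unpack(0xC3A9u, buf);  // 'é' encoded as the two bytes 0xC3 0xA9
      std::cout << n << '\n';        // prints 2
    }
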
diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl
deleted file mode 100644
index eee59604171..00000000000
--- a/cpp/include/cudf/strings/string_view.inl
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-
-namespace {
-using BYTE = uint8_t;
-
-// number of characters in a string computed on-demand
-// the _length member is initialized to this value as a place-holder
-constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};
-// the byte-width of the characters in a string is computed on-demand
-// the _char_width member is initialized to this value as a place-holder
-constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};
-// this value is assigned to the _char_width member if the string
-// contains characters of different widths
-constexpr int8_t VARIABLE_CHAR_WIDTH{0};
-
-/**
- * @brief Returns the number of bytes used to represent the provided byte.
- * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
- * single character. For example, for the two-byte 0xC3A8 single character,
- * the first byte would return 2 and the second byte would return 0.
- *
- * @param byte Byte from an encoded character.
- * @return Number of bytes.
- */
-__host__ __device__ inline cudf::size_type bytes_in_utf8_byte(BYTE byte)
-{
-  cudf::size_type count = 1;
-  count += (int)((byte & 0xF0) == 0xF0);  // 4-byte character prefix
-  count += (int)((byte & 0xE0) == 0xE0);  // 3-byte character prefix
-  count += (int)((byte & 0xC0) == 0xC0);  // 2-byte character prefix
-  count -= (int)((byte & 0xC0) == 0x80);  // intermediate byte
-  return count;
-}
-
-/**
- * @brief Returns the number of bytes used in the provided char array by
- * searching for a null-terminator byte.
- *
- * @param str Null-terminated array of chars.
- * @return Number of bytes.
- */
-__device__ inline cudf::size_type string_bytes(const char* str)
-{
-  if (!str) return 0;
-  cudf::size_type bytes = 0;
-  while (*str++) ++bytes;
-  return bytes;
-}
-
-}  // namespace
-
-namespace cudf {
-
-__host__ __device__ inline string_view::string_view()
-  : _data(""), _bytes(0), _length(0), _char_width(0)
-{
-}
-
-__host__ __device__ inline string_view::string_view(const char* data, size_type bytes)
-  : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
-{
-}
-
-//
-__host__ __device__ inline size_type string_view::size_bytes() const { return _bytes; }
-
-__device__ inline size_type string_view::length() const
-{
-  if (_length == UNKNOWN_STRING_LENGTH)
-    _length = strings::detail::characters_in_string(_data, _bytes);
-  if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) {
-    const BYTE* bytes = reinterpret_cast<const BYTE*>(data());
-    auto chwidth      = bytes_in_utf8_byte(*bytes);  // see if they are all the same width
-    _char_width       = (thrust::find_if(thrust::seq,
-                                   bytes,
-                                   bytes + size_bytes(),
-                                   [chwidth](auto ch) {
-                                     auto width = bytes_in_utf8_byte(ch);
-                                     return (width != 0) && (width != chwidth);
-                                   })) == (bytes + size_bytes())
-                    ? chwidth
-                    : VARIABLE_CHAR_WIDTH;
-  }
-  return _length;
-}
-
-__host__ __device__ inline const char* string_view::data() const { return _data; }
-
-__host__ __device__ inline bool string_view::empty() const { return _bytes == 0; }
-
-// this custom iterator knows about UTF8 encoding
-__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
-  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
-{
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
-{
-  if (byte_pos < bytes) byte_pos += bytes_in_utf8_byte((BYTE)p[byte_pos]);
-  ++char_pos;
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
-{
-  string_view::const_iterator tmp(*this);
-  operator++();
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
-  string_view::const_iterator::difference_type offset)
-{
-  const_iterator tmp(*this);
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? ++tmp : --tmp;
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
-  string_view::const_iterator::difference_type offset)
-{
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? operator++() : operator--();
-  return *this;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
-{
-  if (byte_pos > 0)
-    while (bytes_in_utf8_byte((BYTE)p[--byte_pos]) == 0)
-      ;
-  --char_pos;
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
-{
-  string_view::const_iterator tmp(*this);
-  operator--();
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
-  string_view::const_iterator::difference_type offset)
-{
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? operator--() : operator++();
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
-  string_view::const_iterator::difference_type offset)
-{
-  const_iterator tmp(*this);
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? --tmp : ++tmp;
-  return tmp;
-}
-
-__device__ inline bool string_view::const_iterator::operator==(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos == rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator!=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p != rhs.p) || (char_pos != rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator<(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos < rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator<=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos <= rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator>(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos > rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator>=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos >= rhs.char_pos);
-}
-
-__device__ inline char_utf8 string_view::const_iterator::operator*() const
-{
-  char_utf8 chr = 0;
-  strings::detail::to_char_utf8(p + byte_offset(), chr);
-  return chr;
-}
-
-__device__ inline size_type string_view::const_iterator::position() const { return char_pos; }
-
-__device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
-
-__device__ inline string_view::const_iterator string_view::begin() const
-{
-  return const_iterator(*this, 0);
-}
-
-__device__ inline string_view::const_iterator string_view::end() const
-{
-  return const_iterator(*this, length());
-}
-
-__device__ inline char_utf8 string_view::operator[](size_type pos) const
-{
-  size_type offset = byte_offset(pos);
-  if (offset >= _bytes) return 0;
-  char_utf8 chr = 0;
-  strings::detail::to_char_utf8(data() + offset, chr);
-  return chr;
-}
-
-__device__ inline size_type string_view::byte_offset(size_type pos) const
-{
-  size_type offset = 0;
-  const char* sptr = _data;
-  const char* eptr = sptr + _bytes;
-  if (_char_width > 0) return pos * _char_width;
-  while ((pos > 0) && (sptr < eptr)) {
-    size_type charbytes = bytes_in_utf8_byte((BYTE)*sptr++);
-    if (charbytes) --pos;
-    offset += charbytes;
-  }
-  return offset;
-}
-
-__device__ inline int string_view::compare(const string_view& in) const
-{
-  return compare(in.data(), in.size_bytes());
-}
-
-__device__ inline int string_view::compare(const char* data, size_type bytes) const
-{
-  size_type const len1      = size_bytes();
-  const unsigned char* ptr1 = reinterpret_cast<const unsigned char*>(this->data());
-  const unsigned char* ptr2 = reinterpret_cast<const unsigned char*>(data);
-  size_type idx             = 0;
-  for (; (idx < len1) && (idx < bytes); ++idx) {
-    if (*ptr1 != *ptr2) return static_cast<int>(*ptr1) - static_cast<int>(*ptr2);
-    ++ptr1;
-    ++ptr2;
-  }
-  if (idx < len1) return 1;
-  if (idx < bytes) return -1;
-  return 0;
-}
-
-__device__ inline bool string_view::operator==(const string_view& rhs) const
-{
-  return compare(rhs) == 0;
-}
-
-__device__ inline bool string_view::operator!=(const string_view& rhs) const
-{
-  return compare(rhs) != 0;
-}
-
-__device__ inline bool string_view::operator<(const string_view& rhs) const
-{
-  return compare(rhs) < 0;
-}
-
-__device__ inline bool string_view::operator>(const string_view& rhs) const
-{
-  return compare(rhs) > 0;
-}
-
-__device__ inline bool string_view::operator<=(const string_view& rhs) const
-{
-  int rc = compare(rhs);
-  return (rc == 0) || (rc < 0);
-}
-
-__device__ inline bool string_view::operator>=(const string_view& rhs) const
-{
-  int rc = compare(rhs);
-  return (rc == 0) || (rc > 0);
-}
-
-__device__ inline size_type string_view::find(const string_view& str,
-                                              size_type pos,
-                                              size_type count) const
-{
-  return find(str.data(), str.size_bytes(), pos, count);
-}
-
-__device__ inline size_type string_view::find(const char* str,
-                                              size_type bytes,
-                                              size_type pos,
-                                              size_type count) const
-{
-  const char* sptr = data();
-  if (!str || !bytes) return -1;
-  size_type nchars = length();
-  if (count < 0) count = nchars;
-  size_type end = pos + count;
-  if (end < 0 || end > nchars) end = nchars;
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(end);
-
-  size_type len2 = bytes;
-  size_type len1 = (epos - spos) - len2 + 1;
-
-  const char* ptr1 = sptr + spos;
-  const char* ptr2 = str;
-  for (size_type idx = 0; idx < len1; ++idx) {
-    bool match = true;
-    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
-    if (match) return character_offset(idx + spos);
-    ptr1++;
-  }
-  return -1;
-}
-
-__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
-{
-  char str[sizeof(char_utf8)];
-  size_type chwidth = strings::detail::from_char_utf8(chr, str);
-  return find(str, chwidth, pos, count);
-}
-
-__device__ inline size_type string_view::rfind(const string_view& str,
-                                               size_type pos,
-                                               size_type count) const
-{
-  return rfind(str.data(), str.size_bytes(), pos, count);
-}
-
-__device__ inline size_type string_view::rfind(const char* str,
-                                               size_type bytes,
-                                               size_type pos,
-                                               size_type count) const
-{
-  const char* sptr = data();
-  if (!str || !bytes) return -1;
-  size_type nchars = length();
-  size_type end    = pos + count;
-  if (end < 0 || end > nchars) end = nchars;
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(end);
-
-  size_type len2 = bytes;
-  size_type len1 = (epos - spos) - len2 + 1;
-
-  const char* ptr1 = sptr + epos - len2;
-  const char* ptr2 = str;
-  for (int idx = 0; idx < len1; ++idx) {
-    bool match = true;
-    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
-    if (match) return character_offset(epos - len2 - idx);
-    ptr1--;  // go backwards
-  }
-  return -1;
-}
-
-__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
-{
-  char str[sizeof(char_utf8)];
-  size_type chwidth = strings::detail::from_char_utf8(chr, str);
-  return rfind(str, chwidth, pos, count);
-}
-
-// parameters are character position values
-__device__ inline string_view string_view::substr(size_type pos, size_type length) const
-{
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(pos + length);
-  if (epos > size_bytes()) epos = size_bytes();
-  if (spos >= epos) return string_view("", 0);
-  return string_view(data() + spos, epos - spos);
-}
-
-__device__ inline size_type string_view::character_offset(size_type bytepos) const
-{
-  if (_char_width > 0) return bytepos / _char_width;
-  return strings::detail::characters_in_string(data(), bytepos);
-}
-
-namespace strings {
-namespace detail {
-__host__ __device__ inline size_type bytes_in_char_utf8(char_utf8 chr)
-{
-  size_type count = 1;
-  count += (int)((chr & (unsigned)0x0000FF00) > 0);
-  count += (int)((chr & (unsigned)0x00FF0000) > 0);
-  count += (int)((chr & (unsigned)0xFF000000) > 0);
-  return count;
-}
-
-__host__ __device__ inline size_type to_char_utf8(const char* pSrc, char_utf8& chr)
-{
-  size_type chwidth = bytes_in_utf8_byte((BYTE)*pSrc);
-  chr               = (char_utf8)(*pSrc++) & 0xFF;
-  if (chwidth > 1) {
-    chr = chr << 8;
-    chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 8;
-    if (chwidth > 2) {
-      chr = chr << 8;
-      chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 16;
-      if (chwidth > 3) {
-        chr = chr << 8;
-        chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 24;
-      }
-    }
-  }
-  return chwidth;
-}
-
-__host__ __device__ inline size_type from_char_utf8(char_utf8 chr, char* dst)
-{
-  size_type chwidth = bytes_in_char_utf8(chr);
-  for (size_type idx = 0; idx < chwidth; ++idx) {
-    dst[chwidth - idx - 1] = (char)chr & 0xFF;
-    chr                    = chr >> 8;
-  }
-  return chwidth;
-}
-
-// counts the number of characters in the given char array
-__host__ __device__ inline size_type characters_in_string(const char* str, size_type bytes)
-{
-  if ((str == 0) || (bytes == 0)) return 0;
-  //
-  unsigned int nchars = 0;
-  for (size_type idx = 0; idx < bytes; ++idx)
-    nchars += (unsigned int)(((BYTE)str[idx] & 0xC0) != 0x80);
-  return (size_type)nchars;
-}
-
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
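Editor's note: the byte_offset fast path that the refactor keeps is worth calling out: when length() determines that every character has the same encoded width, the byte offset of a character position is a single multiply; otherwise the bytes are scanned. Below is a minimal standalone C++ sketch of that idea under the assumption of well-formed UTF-8 starting at a lead byte; names are local to the example, not the library API.

    #include <cstddef>
    #include <cstdint>

    // char_width > 0 means "fixed width": offset is pos * width.
    // char_width == 0 means variable width: walk the bytes character by character.
    size_t byte_offset(const char* data, size_t bytes, int char_width, size_t pos)
    {
      if (char_width > 0) return pos * static_cast<size_t>(char_width);
      size_t offset = 0;
      while (pos > 0 && offset < bytes) {
        uint8_t const b = static_cast<uint8_t>(data[offset]);
        // width of the character starting at this lead byte (1-4 bytes)
        int const w = ((b & 0xF0) == 0xF0) ? 4 : ((b & 0xE0) == 0xE0) ? 3 : ((b & 0xC0) == 0xC0) ? 2 : 1;
        offset += static_cast<size_t>(w);
        --pos;
      }
      return offset;
    }
    // e.g. byte_offset("caf\xC3\xA9", 5, 0, 4) == 5, while an all-ASCII string
    // with char_width 1 resolves in constant time.
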
diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp
index 9588214488c..e014f88c451 100644
--- a/cpp/include/cudf/strings/translate.hpp
+++ b/cpp/include/cudf/strings/translate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include
 #include
+#include <cudf/strings/string_view.hpp>
 #include
 #include
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 69035a36c58..0e89058050d 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #pragma once
 
 #include
-#include <cudf/strings/string_view.hpp>
 #include
 #include
 #include
@@ -505,6 +504,8 @@ constexpr inline bool is_fixed_width(data_type type)
   return cudf::type_dispatcher(type, is_fixed_width_impl{});
 }
 
+class string_view;
+
 /**
 * @brief Indicates whether the type `T` is a compound type.
 *
diff --git a/cpp/include/cudf_test/scalar_utilities.hpp b/cpp/include/cudf_test/scalar_utilities.hpp
deleted file mode 100644
index 7e34630365e..00000000000
--- a/cpp/include/cudf_test/scalar_utilities.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/scalar/scalar.hpp>
-
-namespace cudf {
-namespace test {
-/**
- * @brief Verifies the equality of two scalars.
- *
- * Treats invalid scalars as equivalent.
- *
- * @param lhs The first scalar
- * @param rhs The second scalar
- */
-void expect_scalars_equal(cudf::scalar const& lhs, cudf::scalar const& rhs);
-
-}  // namespace test
-}  // namespace cudf
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 1d7174e05d7..71c2b74b37b 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include
+#include <cudf/strings/string_view.hpp>
 #include
 #include
 #include
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 03e00b881d8..e732a13e67c 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -143,6 +143,9 @@
 * @defgroup lists_apis Lists
 * @{
 *   @defgroup lists_extract Extracting
+ *   @defgroup lists_contains Searching
+ *   @defgroup lists_gather Gathering
+ *   @defgroup lists_elements Counting
 * @}
 * @defgroup nvtext_apis NVText
 * @{
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 5f4fcb1c108..46d070e14af 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,6 +34,12 @@ struct get_column_data_impl {
   }
 };
 
+template <>
+void const* get_column_data_impl::operator()<string_view>(column_view const& col)
+{
+  return nullptr;
+}
+
 void const* get_column_data(column_view const& col)
 {
   return type_dispatcher(col.type(), get_column_data_impl{}, col);
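Editor's note: the dlpack change above opts strings out of DLPack export by fully specializing the dispatch functor's call operator, since a string column has no fixed-width device buffer DLPack could describe. Below is a minimal standalone C++ sketch of that specialization pattern; the types get_data and string_view here are stand-ins for illustration, not the cudf API.

    #include <iostream>

    struct string_view {};  // stand-in for cudf::string_view

    // Generic dispatch functor: returns the data pointer for ordinary types.
    struct get_data {
      template <typename T>
      const void* operator()(const void* base) { return base; }
    };

    // Full specialization opting strings out: no representable buffer, so nullptr.
    template <>
    const void* get_data::operator()<string_view>(const void*) { return nullptr; }

    int main()
    {
      int x = 42;
      get_data f;
      std::cout << (f.operator()<int>(&x) != nullptr) << '\n';          // prints 1
      std::cout << (f.operator()<string_view>(&x) != nullptr) << '\n';  // prints 0
    }
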
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 69d894f9b49..ef1c17aa817 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -52,37 +52,6 @@ namespace gpu {
 /// Block dimension for dtype detection and conversion kernels
 constexpr uint32_t csvparse_block_dim = 128;
 
-/*
- * @brief Checks whether the given character is a whitespace character.
- *
- * @param ch The character to check
- *
- * @return True if the input is whitespace, False otherwise
- */
-__device__ __inline__ bool is_whitespace(char c) { return c == '\t' || c == ' '; }
-
-// TODO: replace with `trim_whitespaces_quotes` once `end` semantics is fixed
-/*
- * @brief Scans a character stream within a range, and adjusts the start and end
- * indices of the range to ignore whitespace and quotation characters.
- *
- * @param data The character stream to scan
- * @param start The start index to adjust
- * @param end The end index to adjust
- * @param quotechar The character used to denote quotes
- *
- * @return Adjusted or unchanged start_idx and end_idx
- */
-__device__ __inline__ void trim_field_start_end(const char **start,
-                                                const char **end,
-                                                char quotechar = '\0')
-{
-  while ((*start < *end) && is_whitespace(**start)) { (*start)++; }
-  if ((*start < *end) && **start == quotechar) { (*start)++; }
-  while ((*start <= *end) && is_whitespace(**end)) { (*end)--; }
-  if ((*start <= *end) && **end == quotechar) { (*end)--; }
-}
-
 /*
 * @brief Returns true is the input character is a valid digit.
 * Supports both decimal and hexadecimal digits (uppercase and lowercase).
@@ -217,19 +186,16 @@ __global__ void __launch_bounds__(csvparse_block_dim)
   while (col < column_flags.size() && field_start <= row_end) {
     auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts);
 
-    // Checking if this is a column that the user wants --- user can filter
-    // columns
+    // Checking if this is a column that the user wants --- user can filter columns
     if (column_flags[col] & column_parse::enabled) {
-      // points to last character in the field
-      auto field_end = next_delimiter - 1;
-      long field_len = next_delimiter - field_start;
-
-      if (serialized_trie_contains(opts.trie_na, field_start, field_len)) {
+      auto const field_len = static_cast<size_t>(next_delimiter - field_start);
+      if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) {
         atomicAdd(&d_columnData[actual_col].null_count, 1);
-      } else if (serialized_trie_contains(opts.trie_true, field_start, field_len) ||
-                 serialized_trie_contains(opts.trie_false, field_start, field_len)) {
+      } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
+                 serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
         atomicAdd(&d_columnData[actual_col].bool_count, 1);
-      } else if (cudf::io::gpu::is_infinity(field_start, field_end)) {
+      } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) {
         atomicAdd(&d_columnData[actual_col].float_count, 1);
       } else {
         long countNumber = 0;
@@ -243,10 +209,10 @@
 
         // Modify field_start & end to ignore whitespace and quotechars
         // This could possibly result in additional empty fields
-        trim_field_start_end(&field_start, &field_end);
-        field_len = field_end - field_start + 1;
+        auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter);
+        auto const trimmed_field_len   = trimmed_field_range.second - trimmed_field_range.first;
 
-        for (auto cur = field_start; cur <= field_end; cur++) {
+        for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) {
           if (is_digit(*cur)) {
             countNumber++;
             continue;
@@ -260,16 +226,18 @@
             case ':': countColon++; break;
             case 'e':
            case 'E':
-              if (cur > field_start && cur < field_end) countExponent++;
+              if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1)
+                countExponent++;
              break;
            default: countString++; break;
          }
        }
        // Integers have to have the length of the string
-        long int_req_number_cnt = field_len;  // Off by one if they start with a minus sign
-        if ((*field_start == '-' || *field_start == '+') && field_len > 1) { --int_req_number_cnt; }
+        auto const int_req_number_cnt = trimmed_field_len - ((*trimmed_field_range.first == '-' ||
+                                                              *trimmed_field_range.first == '+') &&
+                                                             trimmed_field_len > 1);
 
        if (column_flags[col] & column_parse::as_datetime) {
          // PANDAS uses `object` dtype if the date is unparseable
@@ -279,13 +247,17 @@
            atomicAdd(&d_columnData[actual_col].string_count, 1);
          }
        } else if (countNumber == int_req_number_cnt) {
-          bool is_negative       = (*field_start == '-');
-          char const *data_begin = field_start + (is_negative || (*field_start == '+'));
-          cudf::size_type *ptr   = cudf::io::gpu::infer_integral_field_counter(
+          auto const is_negative = (*trimmed_field_range.first == '-');
+          auto const data_begin =
+            trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
+          cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter(
            data_begin, data_begin + countNumber, is_negative, d_columnData[actual_col]);
          atomicAdd(ptr, 1);
-        } else if (is_floatingpoint(
-                     field_len, countNumber, countDecimal, countDash + countPlus, countExponent)) {
+        } else if (is_floatingpoint(trimmed_field_len,
+                                    countNumber,
+                                    countDecimal,
+                                    countDash + countPlus,
+                                    countExponent)) {
          atomicAdd(&d_columnData[actual_col].float_count, 1);
        } else {
          atomicAdd(&d_columnData[actual_col].string_count, 1);
@@ -470,21 +442,13 @@ struct decode_op {
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&]() {
-      // Check for user-specified true/false values first, where the output is
-      // replaced with 1/0 respectively
-      const size_t field_len = end - begin + 1;
-      if (serialized_trie_contains(opts.trie_true, begin, field_len)) {
-        return static_cast<T>(1);
-      } else if (serialized_trie_contains(opts.trie_false, begin, field_len)) {
-        return static_cast<T>(0);
-      } else {
-        if (flags & column_parse::as_hexadecimal) {
-          return decode_value<T, 16>(begin, end, opts);
-        } else {
-          return decode_value<T>(begin, end, opts);
-        }
-      }
+    static_cast<T *>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
+      return flags & column_parse::as_hexadecimal ? decode_value<T, 16>(begin, end, opts)
+                                                  : decode_value<T>(begin, end, opts);
     }();
 
     return true;
@@ -501,18 +465,14 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
-
-    // Check for user-specified true/false values first, where the output is
-    // replaced with 1/0 respectively
-    const size_t field_len = end - begin + 1;
-    if (serialized_trie_contains(opts.trie_true, begin, field_len)) {
-      value = 1;
-    } else if (serialized_trie_contains(opts.trie_false, begin, field_len)) {
-      value = 0;
-    } else {
-      value = decode_value<T>(begin, end, opts);
-    }
+    static_cast<T *>(out_buffer)[row] = [&opts, begin, end]() {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      return decode_value<T>(begin, end, opts);
+    }();
+
     return true;
   }
 
@@ -528,9 +488,9 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
+    T const value                     = decode_value<T>(begin, end, opts);
+    static_cast<T *>(out_buffer)[row] = value;
 
-    value = decode_value<T>(begin, end, opts);
     return !std::isnan(value);
   }
 
@@ -547,9 +507,8 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
+    static_cast<T *>(out_buffer)[row] = decode_value<T>(begin, end, opts);
 
-    value = decode_value<T>(begin, end, opts);
     return true;
   }
 };
@@ -601,13 +560,16 @@ __global__ void __launch_bounds__(csvparse_block_dim)
 
     if (column_flags[col] & column_parse::enabled) {
       // check if the entire field is a NaN string - consistent with pandas
-      auto const is_valid =
-        !serialized_trie_contains(options.trie_na, field_start, next_delimiter - field_start);
+      auto const is_valid = !serialized_trie_contains(
+        options.trie_na, {field_start, static_cast<size_t>(next_delimiter - field_start)});
 
       // Modify field_start & end to ignore whitespace and quotechars
-      auto field_end = next_delimiter - 1;
+      auto field_end = next_delimiter;
       if (is_valid && dtypes[actual_col].id() != cudf::type_id::STRING) {
-        trim_field_start_end(&field_start, &field_end, options.quotechar);
+        auto const trimmed_field =
+          trim_whitespaces_quotes(field_start, field_end, options.quotechar);
+        field_start = trimmed_field.first;
+        field_end   = trimmed_field.second;
       }
      if (is_valid) {
        // Type dispatcher does not handle STRING
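Editor's note: a recurring theme in the csv_gpu.cu and datetime.cuh changes above is the move from inclusive [begin, end] field ranges to the conventional half-open [begin, end), where end points one past the last character. Below is a minimal standalone C++ sketch of a digit parser written in that half-open style, mirroring the to_non_negative_integer logic in the next file; it is an illustration, not the library function.

    #include <iostream>

    // Half-open range: `end` is one past the last character, so the loop
    // condition is `begin < end` rather than `begin <= end`.
    template <typename T>
    T to_non_negative_integer(const char* begin, const char* end)
    {
      T value = 0;
      for (; begin < end; ++begin) {
        if (*begin >= '0' && *begin <= '9') { value = value * 10 + (*begin - '0'); }
      }
      return value;
    }

    int main()
    {
      const char* s = "2021";
      std::cout << to_non_negative_integer<int>(s, s + 4) << '\n';  // prints 2021
    }

One benefit of the half-open convention is that empty ranges (begin == end) fall out naturally, removing the off-by-one adjustments like `end = end - 1` and `field_len = end - begin + 1` that the old code needed.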
* * @param begin Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value */ template @@ -35,7 +39,7 @@ __inline__ __device__ T to_non_negative_integer(char const* begin, char const* e { T value = 0; - for (; begin <= end; ++begin) { + for (; begin < end; ++begin) { if (*begin >= '0' && *begin <= '9') { value *= 10; value += *begin - '0'; @@ -130,7 +134,7 @@ __inline__ __device__ constexpr int64_t seconds_since_epoch( * @brief Extracts the Day, Month, and Year from a string. * * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[in] dayfirst Flag indicating that first field is the day * @param[out] year * @param[out] month @@ -153,7 +157,7 @@ __inline__ __device__ bool extract_date( //--- is year the first filed? if ((sep_pos - begin) == 4) { - *year = to_non_negative_integer(begin, (sep_pos - 1)); + *year = to_non_negative_integer(begin, sep_pos); // Month auto s2 = sep_pos + 1; @@ -165,23 +169,23 @@ __inline__ __device__ bool extract_date( *day = 1; } else { - *month = to_non_negative_integer(s2, (sep_pos - 1)); + *month = to_non_negative_integer(s2, sep_pos); *day = to_non_negative_integer((sep_pos + 1), end); } } else { //--- if the dayfirst flag is set, then restricts the format options if (dayfirst) { - *day = to_non_negative_integer(begin, (sep_pos - 1)); + *day = to_non_negative_integer(begin, sep_pos); auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); - *month = to_non_negative_integer(s2, (sep_pos - 1)); + *month = to_non_negative_integer(s2, sep_pos); *year = to_non_negative_integer((sep_pos + 1), end); } else { - *month = to_non_negative_integer(begin, (sep_pos - 1)); + *month = to_non_negative_integer(begin, sep_pos); auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); @@ -192,7 +196,7 @@ __inline__ __device__ bool extract_date( *day = 1; } else { - *day = to_non_negative_integer(s2, (sep_pos - 1)); + *day = to_non_negative_integer(s2, sep_pos); *year = to_non_negative_integer((sep_pos + 1), end); } } @@ -211,7 +215,7 @@ __inline__ __device__ bool extract_date( * at the end. 
* * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[out] hour The hour value * @param[out] minute The minute value * @param[out] second The second value (0 if not present) @@ -224,15 +228,17 @@ __inline__ __device__ void extract_time( // Adjust for AM/PM and any whitespace before int hour_adjust = 0; - if (*end == 'M' || *end == 'm') { - if (*(end - 1) == 'P' || *(end - 1) == 'p') { hour_adjust = 12; } - end = end - 2; - while (*end == ' ') { --end; } + auto last = end - 1; + if (*last == 'M' || *last == 'm') { + if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; } + last = last - 2; + while (*last == ' ') { --last; } } + end = last + 1; // Find hour-minute separator const auto hm_sep = thrust::find(thrust::seq, begin, end, sep); - *hour = to_non_negative_integer(begin, hm_sep - 1) + hour_adjust; + *hour = to_non_negative_integer(begin, hm_sep) + hour_adjust; // Find minute-second separator (if present) const auto ms_sep = thrust::find(thrust::seq, hm_sep + 1, end, sep); @@ -241,7 +247,7 @@ __inline__ __device__ void extract_time( *second = 0; *millisecond = 0; } else { - *minute = to_non_negative_integer(hm_sep + 1, ms_sep - 1); + *minute = to_non_negative_integer(hm_sep + 1, ms_sep); // Find second-millisecond separator (if present) const auto sms_sep = thrust::find(thrust::seq, ms_sep + 1, end, '.'); @@ -249,7 +255,7 @@ __inline__ __device__ void extract_time( *second = to_non_negative_integer(ms_sep + 1, end); *millisecond = 0; } else { - *second = to_non_negative_integer(ms_sep + 1, sms_sep - 1); + *second = to_non_negative_integer(ms_sep + 1, sms_sep); *millisecond = to_non_negative_integer(sms_sep + 1, end); } } @@ -262,20 +268,17 @@ __inline__ __device__ void extract_time( * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. * * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[in] dayfirst Flag to indicate that day is the first field - `DD/MM/YYYY` * @return Number of days since epoch */ __inline__ __device__ int32_t to_date(char const* begin, char const* end, bool dayfirst) { int day, month, year; - int32_t e = -1; - bool status = extract_date(begin, end, dayfirst, &year, &month, &day); - - if (status) e = days_since_epoch(year, month, day); - - return e; + return extract_date(begin, end, dayfirst, &year, &month, &day) + ? days_since_epoch(year, month, day) + : -1; } /** @@ -284,9 +287,9 @@ __inline__ __device__ int32_t to_date(char const* begin, char const* end, bool d * This function takes a string and produces a `date32` representation. * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. 
* - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string - * @param[in] dayfirst Flag to indicate day/month or month/day order + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param dayfirst Flag to indicate day/month or month/day order * @return Milliseconds since epoch */ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, bool dayfirst) @@ -303,7 +306,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b // Attempt to locate the position between date and time, ignore premature space separators // around the day/month/year portions int count = 0; - for (auto i = begin; i <= end; ++i) { + for (auto i = begin; i < end; ++i) { if (count == 3 && *i == ' ') { sep_pos = i; break; @@ -315,7 +318,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b // There is only date if there's no separator, otherwise it's malformed if (sep_pos != end) { - if (extract_date(begin, sep_pos - 1, dayfirst, &year, &month, &day)) { + if (extract_date(begin, sep_pos, dayfirst, &year, &month, &day)) { extract_time(sep_pos + 1, end, &hour, &minute, &second, &millisecond); answer = seconds_since_epoch(year, month, day, hour, minute, second) * 1000 + millisecond; } @@ -334,7 +337,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b * Moves the `begin` iterator past the parsed value. * * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value */ template @@ -364,7 +367,7 @@ __inline__ __device__ T parse_integer(char const** begin, char const* end) * Moves the `begin` iterator past the parsed value. * * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value, zero if delimiter is not present */ template @@ -376,49 +379,16 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e return parse_integer(begin, end); } -/** - * @brief Excludes the prefix from the input range if the string starts with the prefix. - * - * @tparam N length on the prefix, plus one - * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param prefix String we're searching for at the start of the input range - * @return true if the input range starts with the given prefix - */ -template -__inline__ __device__ bool skip_if_starts_with(char const** begin, - char const* end, - const char (&prefix)[N]) -{ - static constexpr size_t prefix_len = N - 1; - if (end - *begin < prefix_len) return false; - auto const found = thrust::equal(thrust::seq, *begin, *begin + prefix_len, prefix); - if (found) (*begin) += prefix_len; - return found; -} - -/** - * @brief Modifies the input range to exclude the leading space characters.
- * - * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the first element after the string - */ -__inline__ __device__ void skip_spaces(char const** begin, char const* end) -{ - *begin = thrust::find_if(thrust::seq, *begin, end, [](auto elem) { return elem != ' '; }); -} - /** * @brief Parses the input string into a duration of the given type. * * @param begin Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed duration */ template __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) { - ++end; // %d days [+]%H:%M:%S.n => %d days, %d days [+]%H:%M:%S, %H:%M:%S.n, %H:%M:%S, %value. constexpr char sep = ':'; @@ -427,13 +397,14 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) // single pass to parse days, hour, minute, seconds, nanosecond auto cur = begin; auto const value = parse_integer(&cur, end); - skip_spaces(&cur, end + 1); + cur = skip_spaces(cur, end); if (std::is_same::value || cur >= end) { // %value return value; } // " days [+]" - bool const has_days_seperator = skip_if_starts_with(&cur, end + 1, "days"); - skip_spaces(&cur, end + 1); + auto const after_days_sep = skip_if_starts_with(cur, end, "days"); + auto const has_days_seperator = (after_days_sep != cur); + cur = skip_spaces(after_days_sep, end); cur += (*cur == '+'); if (has_days_seperator) { days = value; @@ -462,3 +433,6 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) .count() + cuda::std::chrono::duration_cast(cudf::duration_ns{nanosecond}).count(); } + +} // namespace io +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index c761dadf198..1b7635f8d0d 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,7 +32,6 @@ #include #include #include -#include namespace cudf { namespace io { @@ -410,20 +409,6 @@ table_with_metadata read_parquet(parquet_reader_options const& options, return reader->read(options); } -// Freeform API wraps the detail writer class API -std::unique_ptr> write_parquet(parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - auto writer = make_writer(options.get_sink(), options, mr); - - return writer->write(options.get_table(), - options.get_metadata(), - options.is_enabled_return_filemetadata(), - options.get_column_chunks_file_path(), - options.get_decimal_precision()); -} - /** * @copydoc cudf::io::merge_rowgroup_metadata */ @@ -435,54 +420,52 @@ std::unique_ptr> merge_rowgroup_metadata( } /** - * @copydoc cudf::io::write_parquet_chunked_begin + * @copydoc cudf::io::write_parquet */ -std::shared_ptr write_parquet_chunked_begin( - chunked_parquet_writer_options const& op, rmm::mr::device_memory_resource* mr) +std::unique_ptr> write_parquet(parquet_writer_options const& options, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - parquet_writer_options options = parquet_writer_options::builder() - .compression(op.get_compression()) - .stats_level(op.get_stats_level()) - .int96_timestamps(op.is_enabled_int96_timestamps()); - - auto state = std::make_shared(); - state->wp = make_writer(op.get_sink(), options, mr); - - // have to make a copy of the metadata here since we can't really - // guarantee the lifetime of the incoming pointer - if (op.get_nullable_metadata() != nullptr) { - 
state->user_metadata_with_nullability = *op.get_nullable_metadata(); - state->user_metadata = &state->user_metadata_with_nullability; - } - state->int96_timestamps = op.is_enabled_int96_timestamps(); - state->_decimal_precision = op.get_decimal_precision(); - state->stream = 0; - state->wp->write_chunked_begin(*state); - return state; + namespace io_detail = cudf::io::detail; + + auto writer = make_writer( + options.get_sink(), options, io_detail::SingleWriteMode::YES, mr, rmm::cuda_stream_default); + + writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** - * @copydoc cudf::io::write_parquet_chunked + * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -void write_parquet_chunked(table_view const& table, std::shared_ptr state) +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, + rmm::mr::device_memory_resource* mr) +{ + namespace io_detail = cudf::io::detail; + writer = make_writer( + op.get_sink(), op, io_detail::SingleWriteMode::NO, mr, rmm::cuda_stream_default); +} + +/** + * @copydoc cudf::io::parquet_chunked_writer::write + */ +parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) { CUDF_FUNC_RANGE(); - state->wp->write_chunk(table, *state); + + writer->write(table); + + return *this; } /** - * @copydoc cudf::io::write_parquet_chunked_end + * @copydoc cudf::io::parquet_chunked_writer::close */ -std::unique_ptr> write_parquet_chunked_end( - std::shared_ptr& state, - bool return_filemetadata, - const std::string& column_chunks_file_path) +std::unique_ptr> parquet_chunked_writer::close( + std::string const& column_chunks_file_path) { CUDF_FUNC_RANGE(); - auto meta = state->wp->write_chunked_end(*state, return_filemetadata, column_chunks_file_path); - state.reset(); - return meta; + return writer->close(column_chunks_file_path); } } // namespace io diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 4deae310a53..7448d49e117 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -61,14 +61,14 @@ namespace { __device__ std::pair limit_range_to_brackets(char const *begin, char const *end) { - begin = thrust::find_if( - thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; }); - end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(++begin), - [](auto c) { return c == ']' || c == '}'; }) - .base(); - return {begin, --end}; + auto const data_begin = thrust::next(thrust::find_if( + thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; })); + auto const data_end = thrust::next(thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(data_begin), + [](auto c) { return c == ']' || c == '}'; })) + .base(); + return {data_begin, data_end}; } /** @@ -307,16 +307,12 @@ struct ConvertFunctor { { T &value{static_cast(output_column)[row]}; - // Check for user-specified true/false values first, where the output is - // replaced with 1/0 respectively value = [&opts, end, begin]() -> T { - if (serialized_trie_contains(opts.trie_true, begin, end - begin)) { - return 1; - } else if (serialized_trie_contains(opts.trie_false, begin, end - begin)) { - return 0; - } else { - return decode_value(begin, end - 1, opts); - } + // Check for user-specified true/false values + auto const len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } + if 
(serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } + return decode_value(begin, end, opts); }(); return true; @@ -333,8 +329,9 @@ struct ConvertFunctor { size_t row, parse_options_view const &opts) { - auto &value{static_cast(out_buffer)[row]}; - value = decode_value(begin, end - 1, opts); + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; + + return !std::isnan(value); } @@ -351,46 +348,12 @@ struct ConvertFunctor { cudf::size_type row, const parse_options_view &opts) { - T &value{static_cast(output_column)[row]}; - value = decode_value(begin, end - 1, opts); + static_cast(output_column)[row] = decode_value(begin, end, opts); return true; } }; -/** - * @brief Checks whether the given character is a whitespace character. - * - * @param[in] ch The character to check - * - * @return True if the input is whitespace, False otherwise - */ -__inline__ __device__ bool is_whitespace(char ch) { return ch == '\t' || ch == ' '; } - -/** - * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none - * - * @return Trimmed range - */ -__inline__ __device__ std::pair trim_whitespaces_quotes( - char const *begin, char const *end, char quotechar = '\0') -{ - auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; - - begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); - end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(begin), - not_whitespace) - .base(); - - return {(*begin == quotechar) ? ++begin : begin, (*(end - 1) == quotechar) ? end - 1 : end}; -} - /** * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). @@ -550,7 +513,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, current = desc.value_end + 1; // Empty fields are not legal values - if (!serialized_trie_contains(opts.trie_na, desc.value_begin, value_len)) { + if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { auto str_list = static_cast(output_columns[desc.column]); @@ -622,7 +585,7 @@ __global__ void detect_data_types_kernel( current = desc.value_end + 1; // Checking if the field is empty/valid - if (serialized_trie_contains(opts.trie_na, desc.value_begin, value_len)) { + if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero.
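The floating-point overload above now derives row validity from the decoded value itself instead of returning true unconditionally. A hedged host-side sketch of the pattern, with `strtod` standing in for `decode_value` (which, under the new half-open convention, receives `end` directly):

    #include <cmath>
    #include <cstddef>
    #include <cstdlib>

    // Store the parsed value unconditionally; a NaN result marks the row invalid.
    // Illustrative only: strtod ignores `end`, unlike the real decode_value.
    bool store_float(char const* begin, char const* /*end*/, double* out, std::size_t row)
    {
      double const value = std::strtod(begin, nullptr);
      out[row] = value;
      return !std::isnan(value);
    }

Returning `!std::isnan(value)` lets a failed parse surface as a null row without threading a separate status flag through the functor.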
if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } continue; @@ -678,8 +641,8 @@ __global__ void detect_data_types_kernel( } // Off by one if they are a hexadecimal number if (maybe_hex) { --int_req_number_cnt; } - if (serialized_trie_contains(opts.trie_true, desc.value_begin, value_len) || - serialized_trie_contains(opts.trie_false, desc.value_begin, value_len)) { + if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || + serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp deleted file mode 100644 index d6758efe417..00000000000 --- a/cpp/src/io/parquet/chunked_state.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file chunked_state.hpp - * @brief definition for chunked state structure used by Parquet writer - */ - -#pragma once - -#include - -#include - -#include - -namespace cudf { -namespace io { - -enum class SingleWriteMode : bool { YES, NO }; - -/** - * @brief Chunked writer state struct. Contains various pieces of information - * needed that span the begin() / write() / end() call process. - */ -struct pq_chunked_state { - /// The writer to be used - std::unique_ptr wp; - /// Cuda stream to be used - rmm::cuda_stream_view stream; - /// Overall file metadata. Filled in during the process and written during write_chunked_end() - cudf::io::parquet::FileMetaData md; - /// current write position for rowgroups/chunks - std::size_t current_chunk_offset; - /// optional user metadata - table_metadata_with_nullability user_metadata_with_nullability; - /// special parameter only used by detail::write() to indicate that we are guaranteeing - /// a single table write. this enables some internal optimizations. - table_metadata const* user_metadata = nullptr; - /// only used in the write_chunked() case. copied from the (optionally) user supplied - /// argument to write_parquet_chunked_begin() - bool single_write_mode; - /// timestamps should be written as int96 types - bool int96_timestamps; - /// vector of precision values for decimal writing. Exactly one entry - /// per decimal column. 
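The `pq_chunked_state` struct being deleted here carried this information between `write_parquet_chunked_begin()`, `write_parquet_chunked()`, and `write_parquet_chunked_end()`; its fields move into `writer::impl`, and the public surface becomes the `parquet_chunked_writer` class shown in functions.cpp above. A hedged usage sketch of the new API (the sink path and table views are hypothetical):

    #include <cudf/io/parquet.hpp>

    void write_in_chunks(cudf::table_view const& part_a, cudf::table_view const& part_b)
    {
      auto opts = cudf::io::chunked_parquet_writer_options::builder(
                    cudf::io::sink_info{"out.parquet"})
                    .build();
      cudf::io::parquet_chunked_writer writer(opts);
      writer.write(part_a).write(part_b);  // write() returns *this, so calls chain
      writer.close();  // returns a metadata blob only when a column-chunks file path is given
    }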
- std::vector _decimal_precision; - - pq_chunked_state() = default; - - pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - std::vector const& decimal_precision = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - : stream{stream}, - user_metadata{metadata}, - single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps), - _decimal_precision(decimal_precision) - { - } -}; - -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9a4eab260b0..5f572e7544f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -503,8 +503,7 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, - uint32_t fragment_size, - rmm::cuda_stream_view stream) + uint32_t fragment_size) { CUDA_TRY(cudaMemcpyAsync(col_desc.device_ptr(), col_desc.host_ptr(), @@ -531,8 +530,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk hostdevice_vector &col_desc, uint32_t num_columns, uint32_t num_fragments, - uint32_t fragment_size, - rmm::cuda_stream_view stream) + uint32_t fragment_size) { rmm::device_vector frag_stats_group(num_fragments * num_columns); @@ -552,8 +550,7 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector &col_desc, uint32_t num_rowgroups, uint32_t num_columns, - uint32_t num_dictionaries, - rmm::cuda_stream_view stream) + uint32_t num_dictionaries) { size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; rmm::device_vector dict_scratch(dict_scratch_size / sizeof(uint32_t)); @@ -591,8 +588,7 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_pages, - uint32_t num_stats_bfr, - rmm::cuda_stream_view stream) + uint32_t num_stats_bfr) { rmm::device_vector page_stats_mrg(num_stats_bfr); CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), @@ -631,8 +627,7 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, - const statistics_chunk *chunk_stats, - rmm::cuda_stream_view stream) + const statistics_chunk *chunk_stats) { gpu::EncodePages( pages, chunks.device_ptr(), pages_in_batch, first_page_in_batch, comp_in, comp_out, stream); @@ -672,42 +667,59 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, writer::impl::impl(std::unique_ptr sink, parquet_writer_options const &options, - rmm::mr::device_memory_resource *mr) + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) : _mr(mr), + stream(stream), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), - out_sink_(std::move(sink)) + out_sink_(std::move(sink)), + decimal_precision(options.get_decimal_precision()), + single_write_mode(mode == SingleWriteMode::YES), + user_metadata(options.get_metadata()) { + init_state(); } -std::unique_ptr> writer::impl::write( - table_view const &table, - const table_metadata *metadata, - 
bool return_filemetadata, - const std::string &column_chunks_file_path, - std::vector const &decimal_precisions, - rmm::cuda_stream_view stream) +writer::impl::impl(std::unique_ptr sink, + chunked_parquet_writer_options const &options, + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _mr(mr), + stream(stream), + compression_(to_parquet_compression(options.get_compression())), + stats_granularity_(options.get_stats_level()), + int96_timestamps(options.is_enabled_int96_timestamps()), + decimal_precision(options.get_decimal_precision()), + single_write_mode(mode == SingleWriteMode::YES), + out_sink_(std::move(sink)) { - pq_chunked_state state{ - metadata, SingleWriteMode::YES, int96_timestamps, decimal_precisions, stream}; + if (options.get_nullable_metadata() != nullptr) { + user_metadata_with_nullability = *options.get_nullable_metadata(); + user_metadata = &user_metadata_with_nullability; + } - write_chunked_begin(state); - write_chunk(table, state); - return write_chunked_end(state, return_filemetadata, column_chunks_file_path); + init_state(); } -void writer::impl::write_chunked_begin(pq_chunked_state &state) +writer::impl::~impl() { close(); } + +void writer::impl::init_state() { // Write file header file_header_s fhdr; fhdr.magic = parquet_magic; out_sink_->host_write(&fhdr, sizeof(fhdr)); - state.current_chunk_offset = sizeof(file_header_s); + current_chunk_offset = sizeof(file_header_s); } -void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) +void writer::impl::write(table_view const &table) { + CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); + size_type num_columns = table.num_columns(); size_type num_rows = 0; @@ -724,9 +736,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // The user can pass in information about the nullability of a column to be enforced across // write_chunk() calls, in a flattened bool vector. Figure out that per column. auto per_column_nullability = - (state.single_write_mode) + (single_write_mode) ? std::vector>{} - : get_per_column_nullability(table, state.user_metadata_with_nullability.column_nullable); + : get_per_column_nullability(table, user_metadata_with_nullability.column_nullable); uint decimal_precision_idx = 0; @@ -740,19 +752,19 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // one table tell us everything we need to know about their nullability. // Empty nullability means the writer figures out the nullability from the cudf columns. auto const &this_column_nullability = - (state.single_write_mode) ? std::vector{} : per_column_nullability[current_id]; + (single_write_mode) ? std::vector{} : per_column_nullability[current_id]; parquet_columns.emplace_back(current_id, col, this_column_nullability, - state.user_metadata, - state.int96_timestamps, - state._decimal_precision, + user_metadata, + int96_timestamps, + decimal_precision, decimal_precision_idx, - state.stream); + stream); } - CUDF_EXPECTS(decimal_precision_idx == state._decimal_precision.size(), + CUDF_EXPECTS(decimal_precision_idx == decimal_precision.size(), "Too many decimal precision values!"); // first call. setup metadata. num_rows will get incremented as write_chunk is @@ -825,8 +837,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) physical_type == parquet::Type::INT96 ? 
ConvertedType::UNKNOWN : col.converted_type(); col_schema.repetition_type = - (col.max_def_level() == 1 || - (state.single_write_mode && col.row_count() < (size_t)num_rows)) + (col.max_def_level() == 1 || (single_write_mode && col.row_count() < (size_t)num_rows)) ? OPTIONAL : REQUIRED; @@ -840,27 +851,27 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) } } - if (state.md.version == 0) { - state.md.version = 1; - state.md.num_rows = num_rows; - state.md.column_order_listsize = + if (md.version == 0) { + md.version = 1; + md.num_rows = num_rows; + md.column_order_listsize = (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0; - if (state.user_metadata != nullptr) { - std::transform(state.user_metadata->user_data.begin(), - state.user_metadata->user_data.end(), - std::back_inserter(state.md.key_value_metadata), + if (user_metadata != nullptr) { + std::transform(user_metadata->user_data.begin(), + user_metadata->user_data.end(), + std::back_inserter(md.key_value_metadata), [](auto const &kv) { return KeyValue{kv.first, kv.second}; }); } - state.md.schema = this_table_schema; + md.schema = this_table_schema; } else { // verify the user isn't passing mismatched tables - CUDF_EXPECTS(state.md.schema == this_table_schema, + CUDF_EXPECTS(md.schema == this_table_schema, "Mismatch in schema between multiple calls to write_chunk"); // increment num rows - state.md.num_rows += num_rows; + md.num_rows += num_rows; } // Initialize column description @@ -920,11 +931,10 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) uint32_t num_fragments = (uint32_t)((num_rows + fragment_size - 1) / fragment_size); hostdevice_vector fragments(num_columns * num_fragments); if (fragments.size() != 0) { - init_page_fragments( - fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size, state.stream); + init_page_fragments(fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size); } - size_t global_rowgroup_base = state.md.row_groups.size(); + size_t global_rowgroup_base = md.row_groups.size(); // Decide row group boundaries based on uncompressed data size size_t rowgroup_size = 0; @@ -939,8 +949,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) if (f > rowgroup_start && (rowgroup_size + fragment_data_size > max_rowgroup_size_ || (f + 1 - rowgroup_start) * fragment_size > max_rowgroup_rows_)) { // update schema - state.md.row_groups.resize(state.md.row_groups.size() + 1); - state.md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; + md.row_groups.resize(md.row_groups.size() + 1); + md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; num_rowgroups++; rowgroup_start = f; rowgroup_size = 0; @@ -948,8 +958,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) rowgroup_size += fragment_data_size; if (f + 1 == num_fragments) { // update schema - state.md.row_groups.resize(state.md.row_groups.size() + 1); - state.md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; + md.row_groups.resize(md.row_groups.size() + 1); + md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; num_rowgroups++; } } @@ -959,13 +969,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) if (stats_granularity_ != statistics_freq::STATISTICS_NONE) { frag_stats.resize(num_fragments * num_columns); if (frag_stats.size() != 0) { - 
gather_fragment_statistics(frag_stats.data().get(), - fragments, - col_desc, - num_columns, - num_fragments, - fragment_size, - state.stream); + gather_fragment_statistics( + frag_stats.data().get(), fragments, col_desc, num_columns, num_fragments, fragment_size); } } // Initialize row groups and column chunks @@ -975,9 +980,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { uint32_t fragments_in_chunk = - (uint32_t)((state.md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); - state.md.row_groups[global_r].total_byte_size = 0; - state.md.row_groups[global_r].columns.resize(num_columns); + (uint32_t)((md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); + md.row_groups[global_r].total_byte_size = 0; + md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { gpu::EncColumnChunk *ck = &chunks[r * num_columns + i]; bool dict_enable = false; @@ -991,7 +996,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ck->stats = (frag_stats.size() != 0) ? frag_stats.data().get() + i * num_fragments + f : nullptr; ck->start_row = start_row; - ck->num_rows = (uint32_t)state.md.row_groups[global_r].num_rows; + ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; ck->first_fragment = i * num_fragments + f; ck->num_values = std::accumulate(fragments.host_ptr(i * num_fragments + f), @@ -1020,21 +1025,20 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) num_dictionaries++; } } - ck->has_dictionary = dict_enable; - state.md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); - state.md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, - Encoding::RLE}; + ck->has_dictionary = dict_enable; + md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); + md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; if (dict_enable) { - state.md.row_groups[global_r].columns[i].meta_data.encodings.push_back( + md.row_groups[global_r].columns[i].meta_data.encodings.push_back( Encoding::PLAIN_DICTIONARY); } - state.md.row_groups[global_r].columns[i].meta_data.path_in_schema = + md.row_groups[global_r].columns[i].meta_data.path_in_schema = parquet_columns[i].get_path_in_schema(); - state.md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; - state.md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; + md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; + md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; } f += fragments_in_chunk; - start_row += (uint32_t)state.md.row_groups[global_r].num_rows; + start_row += (uint32_t)md.row_groups[global_r].num_rows; } // Free unused dictionaries @@ -1042,8 +1046,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // Build chunk dictionaries and count pages if (num_chunks != 0) { - build_chunk_dictionaries( - chunks, col_desc, num_rowgroups, num_columns, num_dictionaries, state.stream); + build_chunk_dictionaries(chunks, col_desc, num_rowgroups, num_columns, num_dictionaries); } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) @@ -1092,8 +1095,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) (compression_ != 
parquet::Compression::UNCOMPRESSED) ? max_pages_in_batch : 0; uint32_t num_stats_bfr = (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0; - rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, state.stream); - rmm::device_buffer comp_bfr(max_comp_bfr_size, state.stream); + rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, stream); + rmm::device_buffer comp_bfr(max_comp_bfr_size, stream); rmm::device_vector comp_in(max_comp_pages); rmm::device_vector comp_out(max_comp_pages); rmm::device_vector pages(num_pages); @@ -1121,8 +1124,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) num_rowgroups, num_columns, num_pages, - num_stats_bfr, - state.stream); + num_stats_bfr); } auto host_bfr = [&]() { @@ -1160,33 +1162,31 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) comp_out.data().get(), (stats_granularity_ == statistics_freq::STATISTICS_PAGE) ? page_stats.data().get() : nullptr, (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? page_stats.data().get() + num_pages - : nullptr, - state.stream); + : nullptr); for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk *ck = &chunks[r * num_columns + i]; uint8_t *dev_bfr; if (ck->is_compressed) { - state.md.row_groups[global_r].columns[i].meta_data.codec = compression_; - dev_bfr = ck->compressed_bfr; + md.row_groups[global_r].columns[i].meta_data.codec = compression_; + dev_bfr = ck->compressed_bfr; } else { dev_bfr = ck->uncompressed_bfr; } if (out_sink_->supports_device_write()) { // let the writer do what it wants to retrieve the data from the gpu. - out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, state.stream); + out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. if (ck->ck_stat_size != 0) { - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( - ck->ck_stat_size); - CUDA_TRY(cudaMemcpyAsync( - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - dev_bfr, - ck->ck_stat_size, - cudaMemcpyDeviceToHost, - state.stream.value())); - state.stream.synchronize(); + md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); + CUDA_TRY( + cudaMemcpyAsync(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), + dev_bfr, + ck->ck_stat_size, + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } } else { // copy the full data @@ -1194,54 +1194,54 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size + ck->compressed_size, cudaMemcpyDeviceToHost, - state.stream.value())); - state.stream.synchronize(); + stream.value())); + stream.synchronize(); out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); if (ck->ck_stat_size != 0) { - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( - ck->ck_stat_size); - memcpy(state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), + md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); + memcpy(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), host_bfr.get(), ck->ck_stat_size); } } - state.md.row_groups[global_r].total_byte_size += ck->compressed_size; - state.md.row_groups[global_r].columns[i].meta_data.data_page_offset = - state.current_chunk_offset + ((ck->has_dictionary) ? 
ck->dictionary_size : 0); - state.md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->has_dictionary) ? state.current_chunk_offset : 0; - state.md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; - state.md.row_groups[global_r].columns[i].meta_data.total_compressed_size = - ck->compressed_size; - state.current_chunk_offset += ck->compressed_size; + md.row_groups[global_r].total_byte_size += ck->compressed_size; + md.row_groups[global_r].columns[i].meta_data.data_page_offset = + current_chunk_offset + ((ck->has_dictionary) ? ck->dictionary_size : 0); + md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = + (ck->has_dictionary) ? current_chunk_offset : 0; + md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; + md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; + current_chunk_offset += ck->compressed_size; } } } } -std::unique_ptr> writer::impl::write_chunked_end( - pq_chunked_state &state, bool return_filemetadata, const std::string &column_chunks_file_path) +std::unique_ptr> writer::impl::close( + std::string const &column_chunks_file_path) { + if (closed) { return nullptr; } + closed = true; CompactProtocolWriter cpw(&buffer_); file_ender_s fendr; buffer_.resize(0); - fendr.footer_len = static_cast(cpw.write(state.md)); + fendr.footer_len = static_cast(cpw.write(md)); fendr.magic = parquet_magic; out_sink_->host_write(buffer_.data(), buffer_.size()); out_sink_->host_write(&fendr, sizeof(fendr)); out_sink_->flush(); // Optionally output raw file metadata with the specified column chunk file path - if (return_filemetadata) { + if (column_chunks_file_path.length() > 0) { file_header_s fhdr = {parquet_magic}; buffer_.resize(0); buffer_.insert(buffer_.end(), reinterpret_cast(&fhdr), reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto &rowgroup : state.md.row_groups) { + for (auto &rowgroup : md.row_groups) { for (auto &col : rowgroup.columns) { col.file_path = column_chunks_file_path; } } - fendr.footer_len = static_cast(cpw.write(state.md)); + fendr.footer_len = static_cast(cpw.write(md)); buffer_.insert(buffer_.end(), reinterpret_cast(&fendr), reinterpret_cast(&fendr) + sizeof(fendr)); @@ -1254,43 +1254,32 @@ std::unique_ptr> writer::impl::write_chunked_end( // Forward to implementation writer::writer(std::unique_ptr sink, parquet_writer_options const &options, - rmm::mr::device_memory_resource *mr) - : _impl(std::make_unique(std::move(sink), options, mr)) + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _impl(std::make_unique(std::move(sink), options, mode, mr, stream)) { } -// Destructor within this translation unit -writer::~writer() = default; - -// Forward to implementation -std::unique_ptr> writer::write(table_view const &table, - const table_metadata *metadata, - bool return_filemetadata, - const std::string column_chunks_file_path, - std::vector const &decimal_precisions, - rmm::cuda_stream_view stream) +writer::writer(std::unique_ptr sink, + chunked_parquet_writer_options const &options, + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _impl(std::make_unique(std::move(sink), options, mode, mr, stream)) { - return _impl->write( - table, metadata, return_filemetadata, column_chunks_file_path, decimal_precisions, stream); } -// Forward to implementation -void writer::write_chunked_begin(pq_chunked_state &state) -{ - return 
_impl->write_chunked_begin(state); -} +// Destructor within this translation unit +writer::~writer() = default; // Forward to implementation -void writer::write_chunk(table_view const &table, pq_chunked_state &state) -{ - _impl->write_chunk(table, state); -} +void writer::write(table_view const &table) { _impl->write(table); } // Forward to implementation -std::unique_ptr> writer::write_chunked_end( - pq_chunked_state &state, bool return_filemetadata, const std::string &column_chunks_file_path) +std::unique_ptr> writer::close(std::string const &column_chunks_file_path) { - return _impl->write_chunked_end(state, return_filemetadata, column_chunks_file_path); + return _impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_rowgroup_metadata( diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index b664926f970..df76fb093fa 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,6 @@ #pragma once -#include "chunked_state.hpp" - #include #include @@ -68,57 +66,57 @@ class writer::impl { * * @param filepath Filepath if storing dataset to a file * @param options Settings for controlling behavior + * @param mode Option to write at once or in chunks * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches. */ explicit impl(std::unique_ptr sink, parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr); + SingleWriteMode mode, + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_view stream); /** - * @brief Write an entire dataset to parquet format. + * @brief Constructor with chunked writer options. * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param return_filemetadata If true, return the raw parquet file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return unique_ptr to FileMetadata thrift message if requested + * @param filepath Filepath if storing dataset to a file + * @param options Settings for controlling behavior + * @param mode Option to write at once or in chunks + * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ - std::unique_ptr> write(table_view const& table, - const table_metadata* metadata, - bool return_filemetadata, - const std::string& column_chunks_file_path, - std::vector const& decimal_precisions, - rmm::cuda_stream_view stream); + explicit impl(std::unique_ptr sink, + chunked_parquet_writer_options const& options, + SingleWriteMode mode, + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_view stream); /** - * @brief Begins the chunked/streamed write process. - * - * @param[in] pq_chunked_state Internal state maintained between chunks. + * @brief Destructor to complete any incomplete write and release resources. */ - void write_chunked_begin(pq_chunked_state& state); + ~impl(); /** - * @brief Writes a single subtable as part of a larger parquet file/table write. 
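Because `writer::impl::~impl()` now calls `close()` (see writer_impl.cu above) and `close()` returns early once the `closed` flag is set, the footer is flushed exactly once whether the caller closes explicitly or lets the writer go out of scope. A minimal sketch of that guard pattern, under the names this patch uses:

    // Illustrative only: an idempotent close() makes close-on-destruct safe.
    struct writer_like {
      bool closed = false;
      ~writer_like() { close(); }
      void close()
      {
        if (closed) { return; }  // already flushed by an explicit close()
        closed = true;
        // ... write file footer and flush the sink ...
      }
    };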
+ * @brief Initializes the states before writing. + */ + void init_state(); + + /** + * @brief Writes a single subtable as part of a larger parquet file/table write, + * normally used for chunked writing. * * @param[in] table The table information to be written - * @param[in] pq_chunked_state Internal state maintained between chunks. - * boundaries. */ - void write_chunk(table_view const& table, pq_chunked_state& state); + void write(table_view const& table); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] pq_chunked_state Internal state maintained between chunks. - * @param return_filemetadata If true, return the raw parquet file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @return unique_ptr to FileMetadata thrift message if requested + * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided, else null. */ - std::unique_ptr> write_chunked_end( - pq_chunked_state& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + std::unique_ptr> close(std::string const& column_chunks_file_path = ""); private: /** @@ -130,15 +128,13 @@ class writer::impl { * @param num_fragments Total number of fragments per column * @param num_rows Total number of rows * @param fragment_size Number of rows per fragment - * @param stream CUDA stream used for device memory operations and kernel launches. */ void init_page_fragments(hostdevice_vector& frag, hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, - uint32_t fragment_size, - rmm::cuda_stream_view stream); + uint32_t fragment_size); /** * @brief Gather per-fragment statistics * @@ -148,15 +144,13 @@ class writer::impl { * @param num_columns Total number of columns * @param num_fragments Total number of fragments per column * @param fragment_size Number of rows per fragment - * @param stream CUDA stream used for device memory operations and kernel launches. */ void gather_fragment_statistics(statistics_chunk* dst_stats, hostdevice_vector& frag, hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, - uint32_t fragment_size, - rmm::cuda_stream_view stream); + uint32_t fragment_size); /** * @brief Build per-chunk dictionaries and count data pages * @@ -165,14 +159,12 @@ class writer::impl { * @param num_rowgroups Total number of rowgroups * @param num_columns Total number of columns * @param num_dictionaries Total number of dictionaries - * @param stream CUDA stream used for device memory operations and kernel launches. */ void build_chunk_dictionaries(hostdevice_vector& chunks, hostdevice_vector& col_desc, uint32_t num_rowgroups, uint32_t num_columns, - uint32_t num_dictionaries, - rmm::cuda_stream_view stream); + uint32_t num_dictionaries); /** * @brief Initialize encoder pages * @@ -183,7 +175,6 @@ class writer::impl { * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ void init_encoder_pages(hostdevice_vector& chunks, hostdevice_vector& col_desc, @@ -193,8 +184,7 @@ class writer::impl { uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_pages, - uint32_t num_stats_bfr, - rmm::cuda_stream_view stream); + uint32_t num_stats_bfr); /** * @brief Encode a batch pages * @@ -209,7 +199,6 @@ class writer::impl { * @param comp_out compressor status array * @param page_stats optional page-level statistics (nullptr if none) * @param chunk_stats optional chunk-level statistics (nullptr if none) - * @param stream CUDA stream used for device memory operations and kernel launches. */ void encode_pages(hostdevice_vector& chunks, gpu::EncPage* pages, @@ -221,12 +210,13 @@ class writer::impl { gpu_inflate_input_s* comp_in, gpu_inflate_status_s* comp_out, const statistics_chunk* page_stats, - const statistics_chunk* chunk_stats, - rmm::cuda_stream_view stream); + const statistics_chunk* chunk_stats); private: // TODO : figure out if we want to keep this. It is currently unused. rmm::mr::device_memory_resource* _mr = nullptr; + // Cuda stream to be used + rmm::cuda_stream_view stream = rmm::cuda_stream_default; size_t max_rowgroup_size_ = DEFAULT_ROWGROUP_MAXSIZE; size_t max_rowgroup_rows_ = DEFAULT_ROWGROUP_MAXROWS; @@ -234,6 +224,23 @@ class writer::impl { Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; + // Overall file metadata. Filled in during the process and written during write_chunked_end() + cudf::io::parquet::FileMetaData md; + // optional user metadata + table_metadata_with_nullability user_metadata_with_nullability; + // only used in the write_chunked() case. copied from the (optionally) user supplied + // argument to write() + table_metadata const* user_metadata = nullptr; + // to track if the output has been written to sink + bool closed = false; + // vector of precision values for decimal writing. Exactly one entry + // per decimal column. + std::vector decimal_precision; + // current write position for rowgroups/chunks + std::size_t current_chunk_offset; + // special parameter only used by detail::write() to indicate that we are guaranteeing + // a single table write. this enables some internal optimizations. + bool const single_write_mode = true; std::vector buffer_; std::unique_ptr out_sink_; diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 49f5d285647..c7f405e1cc0 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -89,10 +89,10 @@ namespace gpu { * Also iterates over (one or more) delimiter characters after the field. * Function applies to formats with field delimiters and line terminators. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param[in] opts A set of parsing options - * @param[in] escape_char A boolean value to signify whether to consider `\` as escape character or + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or * just a character. 
* * @return Pointer to the last character in the field, including the @@ -191,33 +191,33 @@ __inline__ __device__ char to_lower(char const c) } /** - * @brief Check if string is infinity, case insensitive with/without sign + * @brief Checks if string is infinity, case insensitive with/without sign * Valid infinity strings are inf, +inf, -inf, infinity, +infinity, -infinity * String comparison is case insensitive. * - * @param start The pointer to character array to start parsing from - * @param end The pointer to character array to end parsing + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. */ -__inline__ __device__ bool is_infinity(char const* start, char const* end) +__inline__ __device__ bool is_infinity(char const* begin, char const* end) { - if (*start == '-' || *start == '+') start++; + if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; - auto index = start; - while (index <= end) { + auto index = begin; + while (index < end) { if (*cinf != to_lower(*index)) break; index++; cinf++; } - return ((index == start + 3 || index == start + 8) && index > end); + return ((index == begin + 3 || index == begin + 8) && index >= end); } /** * @brief Parses a character string and returns its numeric value. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param[in] opts The global parsing behavior options + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts The global parsing behavior options * @tparam base Base (radix) to use for conversion * * @return The parsed and converted value @@ -240,11 +240,11 @@ __inline__ __device__ T parse_numeric(const char* begin, if (*begin == '-' || *begin == '+') begin++; // Skip over the "0x" prefix for hex notation - if (base == 16 && begin + 2 <= end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } + if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } // Handle the whole part of the number // auto index = begin; - while (begin <= end) { + while (begin < end) { if (*begin == opts.decimal) { ++begin; break; @@ -259,7 +259,7 @@ __inline__ __device__ T parse_numeric(const char* begin, if (std::is_floating_point::value) { // Handle fractional part of the number if necessary double divisor = 1; - while (begin <= end) { + while (begin < end) { if (*begin == 'e' || *begin == 'E') { ++begin; break; @@ -271,11 +271,11 @@ __inline__ __device__ T parse_numeric(const char* begin, } // Handle exponential part of the number if necessary - if (begin <= end) { + if (begin < end) { const int32_t exponent_sign = *begin == '-' ? -1 : 1; if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; - while (begin <= end) { + while (begin < end) { exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } @@ -459,5 +459,75 @@ std::string infer_compression_type( const std::string& filename, const std::vector>& ext_to_comp_map); +/** + * @brief Checks whether the given character is a whitespace character. 
+ * + * @param[in] ch The character to check + * + * @return True if the input is whitespace, False otherwise + */ +__inline__ __device__ bool is_whitespace(char ch) { return ch == '\t' || ch == ' '; } + +/** + * @brief Skips past the current character if it matches the given value. + */ +template +__inline__ __device__ It skip_character(It const& it, char ch) +{ + return it + (*it == ch); +} + +/** + * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. + * + * @param[in] begin Pointer to the first character in the parsing range + * @param[in] end Pointer to the first character after the parsing range + * @param[in] quotechar The character used to denote quotes; '\0' if none + * + * @return Trimmed range + */ +__inline__ __device__ std::pair trim_whitespaces_quotes( + char const* begin, char const* end, char quotechar = '\0') +{ + auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; + + auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); + auto const trim_end = thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(trim_begin), + not_whitespace); + + return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; +} + +/** + * @brief Excludes the prefix from the input range if the string starts with the prefix. + * + * @tparam N length of the prefix, plus one + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param prefix String we're searching for at the start of the input range + * @return Pointer to the first element past the prefix if the input starts with it, `begin` otherwise + */ +template +__inline__ __device__ auto skip_if_starts_with(char const* begin, + char const* end, + const char (&prefix)[N]) +{ + static constexpr size_t prefix_len = N - 1; + if (end - begin < prefix_len) return begin; + return thrust::equal(thrust::seq, begin, begin + prefix_len, prefix) ? begin + prefix_len : begin; +} + +/** + * @brief Finds the first element after the leading space characters. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + */ +__inline__ __device__ auto skip_spaces(char const* begin, char const* end) +{ + return thrust::find_if(thrust::seq, begin, end, [](auto elem) { return elem != ' '; }); +} + } // namespace io } // namespace cudf diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 10647dd934d..c634aa8d06b 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -74,6 +74,19 @@ boost::filesystem::path getCacheDir() // empty, to disallow use of file cache at runtime. if (not kernel_cache_path.empty()) { kernel_cache_path /= std::string{CUDF_STRINGIFY(CUDF_VERSION)}; + + // Make a per-device cache based on compute capability, so that multiple devices of + // different compute capabilities do not share the same kernel cache.
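The lines that follow derive that per-device suffix, giving each compute capability its own subdirectory under the versioned cache root. A self-contained sketch of the same computation (illustrative; the example values assume a V100 and an A100):

    #include <cuda_runtime.h>
    #include <string>

    // Returns e.g. "70" on a V100 (CC 7.0) or "80" on an A100 (CC 8.0).
    std::string compute_cc_suffix()
    {
      int device, cc_major, cc_minor;
      cudaGetDevice(&device);
      cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device);
      cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device);
      return std::to_string(cc_major * 10 + cc_minor);
    }

The cache path then becomes <cache root>/<CUDF_VERSION>/<cc>/, so JIT artifacts compiled for one architecture are never loaded on another.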
+ int device; + int cc_major; + int cc_minor; + CUDA_TRY(cudaGetDevice(&device)); + CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device)); + int cc = cc_major * 10 + cc_minor; + + kernel_cache_path /= std::to_string(cc); + try { // `mkdir -p` the kernel cache path if it doesn't exist boost::filesystem::create_directories(kernel_cache_path); diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu new file mode 100644 index 00000000000..49f06d5acfd --- /dev/null +++ b/cpp/src/lists/contains.cu @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace lists { + +namespace { + +auto get_search_keys_device_iterable_view(cudf::column_view const& search_keys, + rmm::cuda_stream_view stream) +{ + return column_device_view::create(search_keys, stream); +} + +auto get_search_keys_device_iterable_view(cudf::scalar const& search_key, rmm::cuda_stream_view) +{ + return &search_key; +} + +template +auto get_pair_iterator(cudf::column_device_view const& d_search_keys) +{ + return d_search_keys.pair_begin(); +} + +template +auto get_pair_iterator(cudf::scalar const& search_key) +{ + return cudf::detail::make_pair_iterator(search_key); +} + +/** + * @brief Functor to search each list row for the specified search keys. + */ +template +struct lookup_functor { + template + struct is_supported { + static constexpr bool value = cudf::is_numeric() || + cudf::is_chrono() || + std::is_same::value; + }; + + template + std::enable_if_t::value, std::unique_ptr> operator()( + Args&&...) 
const
+  {
+    CUDF_FAIL("lists::contains() is only supported on numeric types, chrono types, and strings.");
+  }
+
+  std::pair<rmm::device_buffer, size_type> construct_null_mask(
+    lists_column_view const& input_lists,
+    column_view const& result_validity,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) {
+      return {rmm::device_buffer{0, stream, mr}, size_type{0}};
+    } else {
+      return cudf::detail::valid_if(result_validity.begin<bool>(),
+                                    result_validity.end<bool>(),
+                                    thrust::identity<bool>{},
+                                    stream,
+                                    mr);
+    }
+  }
+
+  template <typename T, typename SearchKeyPairIter>
+  void search_each_list_row(cudf::detail::lists_column_device_view const& d_lists,
+                            SearchKeyPairIter search_key_pair_iter,
+                            cudf::mutable_column_device_view mutable_ret_bools,
+                            cudf::mutable_column_device_view mutable_ret_validity,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr)
+  {
+    thrust::for_each(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<size_type>(0),
+      thrust::make_counting_iterator<size_type>(d_lists.size()),
+      [d_lists,
+       search_key_pair_iter,
+       d_bools    = mutable_ret_bools.data<bool>(),
+       d_validity = mutable_ret_validity.data<bool>()] __device__(auto row_index) {
+        auto search_key_and_validity    = search_key_pair_iter[row_index];
+        auto const& search_key_is_valid = search_key_and_validity.second;
+
+        if (search_keys_have_nulls && !search_key_is_valid) {
+          d_bools[row_index]    = false;
+          d_validity[row_index] = false;
+          return;
+        }
+
+        auto list = cudf::list_device_view(d_lists, row_index);
+        if (list.is_null()) {
+          d_bools[row_index]    = false;
+          d_validity[row_index] = false;
+          return;
+        }
+
+        auto search_key    = search_key_and_validity.first;
+        d_bools[row_index] = thrust::find_if(thrust::seq,
+                                             list.pair_begin<T>(),
+                                             list.pair_end<T>(),
+                                             [search_key] __device__(auto element_and_validity) {
+                                               return element_and_validity.second &&
+                                                      (element_and_validity.first == search_key);
+                                             }) != list.pair_end<T>();
+        d_validity[row_index] =
+          d_bools[row_index] ||
+          thrust::none_of(thrust::seq,
+                          thrust::make_counting_iterator(size_type{0}),
+                          thrust::make_counting_iterator(list.size()),
+                          [&list] __device__(auto const& i) { return list.is_null(i); });
+      });
+  }
+
+  template <typename T, typename SearchKeyType>
+  std::enable_if_t<is_supported<T>::value, std::unique_ptr<column>> operator()(
+    cudf::lists_column_view const& lists,
+    SearchKeyType const& search_key,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    using namespace cudf;
+    using namespace cudf::detail;
+
+    CUDF_EXPECTS(!cudf::is_nested(lists.child().type()),
+                 "Nested types not supported in lists::contains()");
+    CUDF_EXPECTS(lists.child().type().id() == search_key.type().id(),
+                 "Type of search key does not match list column element type.");
+    CUDF_EXPECTS(search_key.type().id() != type_id::EMPTY, "Type cannot be empty.");
+
+    auto constexpr search_key_is_scalar = std::is_same<SearchKeyType, cudf::scalar>::value;
+
+    if (search_keys_have_nulls && search_key_is_scalar) {
+      return make_fixed_width_column(data_type(type_id::BOOL8),
+                                     lists.size(),
+                                     cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr),
+                                     lists.size(),
+                                     stream,
+                                     mr);
+    }
+
+    auto const device_view = column_device_view::create(lists.parent(), stream);
+    auto const d_lists     = lists_column_device_view(*device_view);
+    auto const d_skeys     = get_search_keys_device_iterable_view(search_key, stream);
+
+    auto const lists_column_has_nulls = lists.has_nulls() || lists.child().has_nulls();
+
+    auto result_validity = make_fixed_width_column(
+      data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr);
+    auto result_bools = make_fixed_width_column(
+      data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr);
+    auto mutable_result_bools =
+      mutable_column_device_view::create(result_bools->mutable_view(), stream);
+    auto mutable_result_validity =
+      mutable_column_device_view::create(result_validity->mutable_view(), stream);
+    auto search_key_iter = get_pair_iterator<T>(*d_skeys);
+
+    search_each_list_row<T>(
+      d_lists, search_key_iter, *mutable_result_bools, *mutable_result_validity, stream, mr);
+
+    rmm::device_buffer null_mask;
+    size_type num_nulls;
+
+    std::tie(null_mask, num_nulls) =
+      construct_null_mask(lists, result_validity->view(), stream, mr);
+    result_bools->set_null_mask(std::move(null_mask), num_nulls);
+
+    return result_bools;
+  }
+};
+
+}  // namespace
+
+namespace detail {
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::scalar const& search_key,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return search_key.is_valid(stream)
+           ? cudf::type_dispatcher(
+               search_key.type(), lookup_functor<false>{}, lists, search_key, stream, mr)
+           : cudf::type_dispatcher(
+               search_key.type(), lookup_functor<true>{}, lists, search_key, stream, mr);
+}
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::column_view const& search_keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(search_keys.size() == lists.size(),
+               "Number of search keys must match list column size.");
+
+  return search_keys.has_nulls()
+           ? cudf::type_dispatcher(
+               search_keys.type(), lookup_functor<true>{}, lists, search_keys, stream, mr)
+           : cudf::type_dispatcher(
+               search_keys.type(), lookup_functor<false>{}, lists, search_keys, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::scalar const& search_key,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::contains(lists, search_key, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::column_view const& search_keys,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::contains(lists, search_keys, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
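A minimal usage sketch of the new API added above (illustrative only; the column contents and helper name are hypothetical, the `cudf::lists::contains` signature is the one introduced in this diff):

#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// Ask, per list row, whether the row contains the key 1. NULL list rows stay
// NULL; rows that lack the key but contain NULL elements also become NULL,
// matching construct_null_mask() above.
std::unique_ptr<cudf::column> rows_containing_one(cudf::lists_column_view const& lists)
{
  cudf::numeric_scalar<int32_t> key{1};
  return cudf::lists::contains(lists, key);  // BOOL8 column, one row per list
}
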
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
new file mode 100644
index 00000000000..78549152770
--- /dev/null
+++ b/cpp/src/lists/count_elements.cu
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace lists {
+namespace detail {
+/**
+ * @brief Returns a numeric column containing the length of each list element.
+ *
+ * @param input Input lists column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New INT32 column with lengths.
+ */
+std::unique_ptr<column> count_elements(lists_column_view const& input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  auto device_column = cudf::column_device_view::create(input.parent(), stream);
+  auto d_column      = *device_column;
+  // create output column
+  auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
+                                        input.size(),
+                                        copy_bitmask(input.parent()),
+                                        input.null_count(),
+                                        stream,
+                                        mr);
+
+  // fill in the sizes
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(input.size()),
+                    output->mutable_view().begin<size_type>(),
+                    [d_column] __device__(size_type idx) {
+                      if (d_column.is_null(idx)) return size_type{0};
+                      auto d_offsets =
+                        d_column.child(lists_column_view::offsets_column_index).data<size_type>() +
+                        d_column.offset();
+                      return d_offsets[idx + 1] - d_offsets[idx];
+                    });
+
+  output->set_null_count(input.null_count());  // reset null count
+  return output;
+}
+
+}  // namespace detail
+
+// external APIs
+
+std::unique_ptr<column> count_elements(lists_column_view const& input,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::count_elements(input, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
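A quick sketch of the behavior (example values are hypothetical): NULL rows stay NULL in the output — the `size_type{0}` written for them is only a placeholder under the copied null mask.

#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>

// For a lists column [[1,2,3], NULL, [], [4,5]] the result is [3, NULL, 0, 2]:
// each length is d_offsets[i+1] - d_offsets[i], and the parent's bitmask is
// copied onto the output.
std::unique_ptr<cudf::column> list_sizes(cudf::lists_column_view const& lists)
{
  return cudf::lists::count_elements(lists);
}
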
diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu
new file mode 100644
index 00000000000..bc532893fb0
--- /dev/null
+++ b/cpp/src/reshape/explode.cu
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+/**
+ * @brief Function object for exploding a column.
+ */
+struct explode_functor {
+  template <typename T>
+  std::unique_ptr<table> operator()(table_view const& input_table,
+                                    size_type explode_column_idx,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr) const
+  {
+    CUDF_FAIL("Unsupported non-list column");
+
+    return std::make_unique<table>();
+  }
+};
+
+template <>
+std::unique_ptr<table> explode_functor::operator()<list_view>(
+  table_view const& input_table,
+  size_type explode_column_idx,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr) const
+{
+  /* we explode by building a gather map that includes the number of entries in each list inside
+     the column for each index. Interestingly, this can be done with lower_bound across the offsets
+     as values between the offsets will all map down to the index below. We have some off-by-one
+     manipulations we need to do with the output, but it's almost our gather map by itself. Once we
+     build the gather map we need to remove the explode column from the table and run gather on it.
+     Next we build the explode column, which turns out to be simply lifting the child column out of
+     the explode column. This unrolls the top level of lists. Then we need to insert the explode
+     column back into the table and return it. */
+  lists_column_view lc{input_table.column(explode_column_idx)};
+  auto sliced_child = lc.get_sliced_child(stream);
+  rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream, mr);
+
+  // sliced columns can make this a little tricky. We have to start iterating at the start of the
+  // offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets
+  // as we go.
+  auto offsets           = lc.offsets().begin<size_type>() + lc.offset();
+  auto offsets_minus_one = thrust::make_transform_iterator(
+    offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
+  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
+
+  // This looks like an off-by-one bug, but what is going on here is that we need to reduce each
+  // result from `lower_bound` by 1 to build the correct gather map. It was pointed out that
+  // this can be accomplished by simply skipping the first entry and using the result of
+  // `lower_bound` directly.
+  thrust::lower_bound(rmm::exec_policy(stream),
+                      offsets_minus_one + 1,
+                      offsets_minus_one + lc.size() + 1,
+                      counting_iter,
+                      counting_iter + gather_map_indices.size(),
+                      gather_map_indices.begin());
+
+  auto select_iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; });
+  std::vector<size_type> selected_columns(select_iter,
+                                          select_iter + input_table.num_columns() - 1);
+
+  auto gathered_table = cudf::detail::gather(
+    input_table.select(selected_columns),
+    column_view(data_type(type_to_id<size_type>()), sliced_child.size(), gather_map_indices.data()),
+    cudf::out_of_bounds_policy::DONT_CHECK,
+    cudf::detail::negative_index_policy::ALLOWED,
+    stream,
+    mr);
+
+  std::vector<std::unique_ptr<column>> columns = gathered_table.release()->release();
+
+  columns.insert(columns.begin() + explode_column_idx,
+                 std::make_unique<column>(column(sliced_child, stream, mr)));
+
+  return std::make_unique<table>(std::move(columns));
+}
+}  // namespace
+
+/**
+ * @copydoc
+ * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table>
explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(input_table.column(explode_column_idx).type(),
+                         explode_functor{},
+                         input_table,
+                         explode_column_idx,
+                         stream,
+                         mr);
+}
+
+}  // namespace detail
+
+/**
+ * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table> explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace cudf
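A worked example of the gather-map trick described in the comment above (values are illustrative): for offsets [0, 2, 3], `offsets_minus_one` is [-1, 1, 2]; running `lower_bound` for the values 0, 1, 2 over its tail [1, 2] yields the gather map [0, 0, 1], so row 0 (two list entries) repeats twice. A usage sketch, assuming the public declaration ships in the reshape header alongside the other reshape APIs:

#include <cudf/reshape.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Explode the list column at index 1:
//   { {"a", [1, 2]}, {"b", [3]} }  -->  { {"a", 1}, {"a", 2}, {"b", 3} }
std::unique_ptr<cudf::table> unroll(cudf::table_view const& input)
{
  return cudf::explode(input, 1);
}
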
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 052c2aaedc7..fe051b1ffc5 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 */
 
 #include
+#include
 
 #include
 #include
@@ -22,6 +23,29 @@
 #include
 
 namespace cudf {
+
+string_scalar::string_scalar(rmm::device_scalar<value_type>& data,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : string_scalar(data.value(stream), is_valid, stream, mr)
+{
+}
+
+string_scalar::string_scalar(value_type const& source,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : scalar(data_type(type_id::STRING), is_valid),
+    _data(source.data(), source.size_bytes(), stream, mr)
+{
+}
+
+string_scalar::value_type string_scalar::value(rmm::cuda_stream_view stream) const
+{
+  return value_type{data(), size()};
+}
+
 std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
   std::string result;
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index b6c603c231f..2d36a573a49 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -55,6 +55,53 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                               mr);
 }
 
+struct inplace_column_sort_fn {
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view& col, bool ascending, rmm::cuda_stream_view stream) const
+  {
+    CUDF_EXPECTS(!col.has_nulls(), "Nulls not supported for in-place sort");
+    using DeviceT = device_storage_type_t<T>;
+    if (ascending) {
+      thrust::sort(rmm::exec_policy(stream),
+                   col.begin<DeviceT>(),
+                   col.end<DeviceT>(),
+                   thrust::less<DeviceT>());
+    } else {
+      thrust::sort(rmm::exec_policy(stream),
+                   col.begin<DeviceT>(),
+                   col.end<DeviceT>(),
+                   thrust::greater<DeviceT>());
+    }
+  }
+
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view&, bool, rmm::cuda_stream_view) const
+  {
+    CUDF_FAIL("Column type must be relationally comparable and fixed-width");
+  }
+};
+
+std::unique_ptr<table> sort(table_view input,
+                            std::vector<order> const& column_order,
+                            std::vector<null_order> const& null_precedence,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  if (input.num_columns() == 1 && !input.column(0).has_nulls() &&
+      cudf::is_fixed_width(input.column(0).type())) {
+    auto output    = std::make_unique<column>(input.column(0), stream, mr);
+    auto view      = output->mutable_view();
+    bool ascending = (column_order.empty() ? true : column_order.front() == order::ASCENDING);
+    cudf::type_dispatcher(output->type(), inplace_column_sort_fn{}, view, ascending, stream);
+    std::vector<std::unique_ptr<column>> columns;
+    columns.emplace_back(std::move(output));
+    return std::make_unique<table>(std::move(columns));
+  }
+  return detail::sort_by_key(input, input, column_order, null_precedence, stream, mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<column> sorted_order(table_view input,
@@ -72,8 +119,7 @@ std::unique_ptr<table> sort(table_view input,
                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_by_key(
-    input, input, column_order, null_precedence, rmm::cuda_stream_default, mr);
+  return detail::sort(input, column_order, null_precedence, rmm::cuda_stream_default, mr);
 }
 
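Note that the new fast path only fires for a single fixed-width, non-nullable column; everything else still routes through `detail::sort_by_key`. A sketch of a call that would take it (the column is hypothetical, and the defaulted order/precedence arguments are assumed):

#include <cudf/sorting.hpp>
#include <cudf/table/table_view.hpp>

// One INT32 column with no nulls: cudf::sort() copies it and sorts the copy
// in place with thrust::sort rather than materializing a gather map.
std::unique_ptr<cudf::table> sort_one(cudf::column_view const& int32_col)
{
  return cudf::sort(cudf::table_view{{int32_col}});
}
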
 std::unique_ptr<table> sort_by_key(table_view const& values,
diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu
new file mode 100644
index 00000000000..070aa6eae03
--- /dev/null
+++ b/cpp/src/sort/sort_column.cu
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+
+/**
+ * @brief Type-dispatched functor for sorting a single column.
+ */
+struct column_sorted_order_fn {
+  /**
+   * @brief Compile time check for allowing radix sort for column type.
+   *
+   * Floating point is removed here for special handling of NaNs.
+   */
+  template <typename T>
+  static constexpr bool is_radix_sort_supported()
+  {
+    return cudf::is_fixed_width<T>() && !cudf::is_floating_point<T>();
+  }
+
+  /**
+   * @brief Sorts fixed-width columns using faster thrust sort.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void radix_sort(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  rmm::cuda_stream_view stream)
+  {
+    // A non-stable sort on a column of arithmetic type with no nulls will use a radix sort
+    // if specifying only the `thrust::less` or `thrust::greater` comparators.
+    // But this also requires making a copy of the input data.
+    auto temp_col = column(input, stream);
+    auto d_col    = temp_col.mutable_view();
+    using DeviceT = device_storage_type_t<T>;
+    if (ascending) {
+      thrust::sort_by_key(rmm::exec_policy(stream),
+                          d_col.begin<DeviceT>(),
+                          d_col.end<DeviceT>(),
+                          indices.begin<size_type>(),
+                          thrust::less<DeviceT>());
+    } else {
+      thrust::sort_by_key(rmm::exec_policy(stream),
+                          d_col.begin<DeviceT>(),
+                          d_col.end<DeviceT>(),
+                          indices.begin<size_type>(),
+                          thrust::greater<DeviceT>());
+    }
+  }
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void radix_sort(column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Only fixed-width types are suitable for faster sorting");
+  }
+
+  /**
+   * @brief Sorts a single column with a relationally comparable type.
+   *
+   * This includes numeric, timestamp, duration, and string types.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param null_precedence How null rows are to be ordered
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  null_order null_precedence,
+                  rmm::cuda_stream_view stream)
+  {
+    // column with nulls or non-supported types will also use a comparator
+    if (input.has_nulls() || !is_radix_sort_supported<T>()) {
+      auto keys = column_device_view::create(input, stream);
+      thrust::sort(rmm::exec_policy(stream),
+                   indices.begin<size_type>(),
+                   indices.end<size_type>(),
+                   simple_comparator<T>{*keys, input.has_nulls(), ascending, null_precedence});
+    } else {
+      radix_sort<T>(input, indices, ascending, stream);
+    }
+  }
+
+  template <typename T, std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Column type must be relationally comparable");
+  }
+};
+
+}  // namespace
+
+/**
+ * @copydoc
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ */
+template <>
+std::unique_ptr<column> sorted_order<false>(column_view const& input,
+                                            order column_order,
+                                            null_order null_precedence,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  auto sorted_indices = cudf::make_numeric_column(
+    data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
+  mutable_column_view indices_view = sorted_indices->mutable_view();
+  thrust::sequence(
+    rmm::exec_policy(stream), indices_view.begin<size_type>(), indices_view.end<size_type>(), 0);
+  cudf::type_dispatcher(input.type(),
+                        column_sorted_order_fn{},
+                        input,
+                        indices_view,
+                        column_order == order::ASCENDING,
+                        null_precedence,
+                        stream);
+  return sorted_indices;
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh
index cfa3a726138..4fc83d343d5 100644
--- a/cpp/src/sort/sort_impl.cuh
+++ b/cpp/src/sort/sort_impl.cuh
@@ -18,20 +18,70 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
+#include
 
 namespace cudf {
 namespace detail {
-// Create permuted row indices that would materialize sorted order
+
+/**
+ * @brief Comparator functor needed for single column sort.
+ *
+ * @tparam T Column element type.
+ */
+template <typename T>
+struct simple_comparator {
+  __device__ bool operator()(size_type lhs, size_type rhs)
+  {
+    if (has_nulls) {
+      bool lhs_null{d_column.is_null(lhs)};
+      bool rhs_null{d_column.is_null(rhs)};
+      if (lhs_null || rhs_null) {
+        if (!ascending) thrust::swap(lhs_null, rhs_null);
+        return (null_precedence == cudf::null_order::BEFORE ? !rhs_null : !lhs_null);
+      }
+    }
+    return relational_compare(d_column.element<T>(lhs), d_column.element<T>(rhs)) ==
+           (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
+  }
+  column_device_view const d_column;
+  bool has_nulls;
+  bool ascending;
+  null_order null_precedence{};
+};
+
+/**
+ * @brief Sort indices of a single column.
+ *
+ * @param input Column to sort. The column data is not modified.
+ * @param column_order Ascending or descending sort order
+ * @param null_precedence How null rows are to be ordered
+ * @param stable True if sort should be stable
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Sorted indices for the input column.
+ */
+template <bool stable = false>
+std::unique_ptr<column> sorted_order(column_view const& input,
+                                     order column_order,
+                                     null_order null_precedence,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr);
+
+/**
+ * @copydoc
+ * sorted_order(table_view&,std::vector<order>,std::vector<null_order>,rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
 template <bool stable = false>
 std::unique_ptr<column> sorted_order(table_view input,
                                      std::vector<order> const& column_order,
@@ -53,28 +103,24 @@ std::unique_ptr<column> sorted_order(table_view input,
                "Mismatch between number of columns and null_precedence size.");
   }
 
-  // fast-path for single strings column sort
-  if (input.num_columns() == 1 && input.column(0).type().id() == type_id::STRING) {
-    return cudf::strings::detail::sorted_order<stable>(
-      strings_column_view(input.column(0)),
-      column_order.empty() ? order::ASCENDING : column_order.front(),
-      null_precedence.empty() ? null_order::BEFORE : null_precedence.front(),
-      stream,
-      mr);
-  }
-
+  // fast-path for single column sort
+  if (input.num_columns() == 1) {
+    auto const single_col = input.column(0);
+    auto const col_order  = column_order.empty() ? order::ASCENDING : column_order.front();
+    auto const null_prec  = null_precedence.empty() ? null_order::BEFORE : null_precedence.front();
+    return stable ? sorted_order<true>(single_col, col_order, null_prec, stream, mr)
+                  : sorted_order<false>(single_col, col_order, null_prec, stream, mr);
+  }
+
   std::unique_ptr<column> sorted_indices = cudf::make_numeric_column(
     data_type(type_to_id<size_type>()), input.num_rows(), mask_state::UNALLOCATED, stream, mr);
   mutable_column_view mutable_indices_view = sorted_indices->mutable_view();
-
-  auto device_table = table_device_view::create(input, stream);
   thrust::sequence(rmm::exec_policy(stream),
                    mutable_indices_view.begin<size_type>(),
                    mutable_indices_view.end<size_type>(),
                    0);
 
+  auto device_table = table_device_view::create(input, stream);
   rmm::device_vector<order> d_column_order(column_order);
 
   if (has_nulls(input)) {
diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu
new file mode 100644
index 00000000000..abeaa7bef76
--- /dev/null
+++ b/cpp/src/sort/stable_sort_column.cu
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+
+struct column_stable_sorted_order_fn {
+  /**
+   * @brief Stable sort of fixed-width columns using a thrust sort with no comparator.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void faster_stable_sort(column_view const& input,
+                          mutable_column_view& indices,
+                          rmm::cuda_stream_view stream)
+  {
+    auto temp_col = column(input, stream);
+    auto d_col    = temp_col.mutable_view();
+    using DeviceT = device_storage_type_t<T>;
+    thrust::stable_sort_by_key(rmm::exec_policy(stream),
+                               d_col.begin<DeviceT>(),
+                               d_col.end<DeviceT>(),
+                               indices.begin<size_type>());
+  }
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void faster_stable_sort(column_view const&, mutable_column_view&, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Only fixed-width types are suitable for faster stable sorting");
+  }
+
+  /**
+   * @brief Stable sorts a single column with a relationally comparable type.
+   *
+   * This includes numeric, timestamp, duration, and string types.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param null_precedence How null rows are to be ordered
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  null_order null_precedence,
+                  rmm::cuda_stream_view stream)
+  {
+    if (!ascending || input.has_nulls() || !cudf::is_fixed_width<T>()) {
+      auto keys = column_device_view::create(input, stream);
+      thrust::stable_sort(
+        rmm::exec_policy(stream),
+        indices.begin<size_type>(),
+        indices.end<size_type>(),
+        simple_comparator<T>{*keys, input.has_nulls(), ascending, null_precedence});
+    } else {
+      faster_stable_sort<T>(input, indices, stream);
+    }
+  }
+  template <typename T, std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Column type must be relationally comparable");
+  }
+};
+
+}  // namespace
+
+/**
+ * @copydoc
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ */
+template <>
+std::unique_ptr<column> sorted_order<true>(column_view const& input,
+                                           order column_order,
+                                           null_order null_precedence,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+{
+  auto sorted_indices = cudf::make_numeric_column(
+    data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
+  mutable_column_view indices_view = sorted_indices->mutable_view();
+  thrust::sequence(
+    rmm::exec_policy(stream), indices_view.begin<size_type>(), indices_view.end<size_type>(), 0);
+  cudf::type_dispatcher(input.type(),
+                        column_stable_sorted_order_fn{},
+                        input,
+                        indices_view,
+                        column_order == order::ASCENDING,
+                        null_precedence,
+                        stream);
+  return sorted_indices;
+}
+
+}  // namespace detail
+}  // namespace cudf
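Stability here is a compile-time choice: the `template <bool stable>` specializations above are separate instantiations that share `simple_comparator`. A sketch of the dispatch, mirroring the single-column fast path in sort_impl.cuh (internal detail API, shown only to illustrate the template parameter):

// Unstable (<false>) may reorder equal keys and can take the radix path for
// fixed-width, non-null input; stable (<true>) keeps equal keys in order.
std::unique_ptr<cudf::column> indices(cudf::column_view const& col,
                                      bool stable,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
{
  return stable ? cudf::detail::sorted_order<true>(
                    col, cudf::order::ASCENDING, cudf::null_order::BEFORE, stream, mr)
                : cudf::detail::sorted_order<false>(
                    col, cudf::order::ASCENDING, cudf::null_order::BEFORE, stream, mr);
}
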
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 0f344fb7111..b76e1932196 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@
 #include
 #include
 
+#include
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -40,7 +42,9 @@ namespace {
 std::vector<char32_t> string_to_char32_vector(std::string const& pattern)
 {
   size_type size  = static_cast<size_type>(pattern.size());
-  size_type count = characters_in_string(pattern.c_str(), size);
+  size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) {
+    return is_begin_utf8_char(static_cast<uint8_t>(ch));
+  });
   std::vector<char32_t> result(count + 1);
   char32_t* output_ptr  = result.data();
   const char* input_ptr = pattern.data();
diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp
index d5537957013..dba31ecc21e 100644
--- a/cpp/src/structs/structs_column_view.cpp
+++ b/cpp/src/structs/structs_column_view.cpp
@@ -36,7 +36,9 @@ column_view structs_column_view::get_sliced_child(int index) const
           size(),
           child(index).head(),
           child(index).null_mask(),
-          child(index).null_count(),
+          // TODO: could potentially compute the actual count here, but at
+          // the moment this interface doesn't take a stream.
+          UNKNOWN_NULL_COUNT,
           offset(),
           children};
 }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ad05c871012..8395a3cc1f2 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -39,7 +39,6 @@ endif("$ENV{CONDA_BUILD}" STREQUAL "1")
 add_library(cudftestutil STATIC
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/base_fixture.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/column_utilities.cu"
-            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/scalar_utilities.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/table_utilities.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/strings/utilities.cu")
 
@@ -524,6 +523,7 @@ ConfigureTest(SEARCH_TEST "${SEARCH_TEST_SRC}")
 
 set(RESHAPE_TEST_SRC
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/byte_cast_tests.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/reshape/explode_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/interleave_columns_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/tile_tests.cpp")
 
@@ -661,6 +661,8 @@ ConfigureTest(AST_TEST "${AST_TEST_SRC}")
 # - lists tests ----------------------------------------------------------------------------------
 
 set(LISTS_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/lists/contains_tests.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/lists/count_elements_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp")
 
 ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}")
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index 3994ab60a18..8f4a46e2a54 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@
 #include
 #include
 #include
-#include
 
 #include
 #include
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index f2f68c7601b..7c02b4957b5 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Copyright 2018-2019 BlazingDB, Inc.
 * Copyright 2018 Christian Noboa Mardini
@@ -2200,6 +2200,21 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{150, 200}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{2250, 1005}, scale_type{-3}};
+  auto const expected = fp_wrapper<RepType>{{3750, 3005}, scale_type{-3}};
+
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
 {
   using namespace numeric;
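For reference, the arithmetic the SimplePlus test above expects (a worked example, not part of the diff): ADD first rescales both operands to the smaller scale.

// lhs rep 150  at scale -2  ==>  150 * 10^-2 = 1.50
// rhs rep 2250 at scale -3  ==> 2250 * 10^-3 = 2.250
// common scale is -3, so the lhs rep 150 becomes 1500;
// 1500 + 2250 = 3750 ==> rep 3750 at scale -3 = 3.750, the first `expected` value
// (the second row works the same way: 2000 + 1005 = 3005 at scale -3).
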
diff --git a/cpp/tests/copying/gather_str_tests.cu b/cpp/tests/copying/gather_str_tests.cu
index 75cea81c950..6655f819190 100644
--- a/cpp/tests/copying/gather_str_tests.cu
+++ b/cpp/tests/copying/gather_str_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 
 class GatherTestStr : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp
index f010b504436..62415693363 100644
--- a/cpp/tests/copying/sample_tests.cpp
+++ b/cpp/tests/copying/sample_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 
 #include
 #include
diff --git a/cpp/tests/copying/scatter_list_tests.cu b/cpp/tests/copying/scatter_list_tests.cu
index e8e11629628..786f1c57b26 100644
--- a/cpp/tests/copying/scatter_list_tests.cu
+++ b/cpp/tests/copying/scatter_list_tests.cu
@@ -733,7 +733,9 @@ TYPED_TEST(TypedScatterListsTest, ListsOfNullStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists = cudf::make_lists_column(
     6, offsets_column{0, 3, 5, 9, 11, 13, 15}.release(), expected_structs.release(), 0, {});
@@ -828,7 +830,9 @@ TYPED_TEST(TypedScatterListsTest, EmptyListsOfStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists = cudf::make_lists_column(
     6, offsets_column{0, 3, 5, 9, 11, 11, 13}.release(), expected_structs.release(), 0, {});
@@ -929,7 +933,9 @@ TYPED_TEST(TypedScatterListsTest, NullListsOfStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists_null_mask_begin =
     make_counting_transform_iterator(0, [](auto i) { return i != 4; });
diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp
index 1f8e7ebe0bf..e9759aa0259 100644
--- a/cpp/tests/copying/slice_tests.cpp
+++ b/cpp/tests/copying/slice_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 535cb32defc..5f969098b48 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -580,4 +581,69 @@ TYPED_TEST(FixedPointTestBothReps, SimpleFixedPointColumnWrapper)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(a, b);
 }
 
+TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
+{
+  // This is testing fixed_point values outside the range of its underlying type.
+  // For example, 100,000,000 with a scale of 6 is 100,000,000,000,000 (100 trillion), which is
+  // outside the range of an int32_t.
+
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int32_t>;
+
+  auto const a = fp_wrapper{{100000000}, scale_type{6}};
+  auto const b = fp_wrapper{{5000000}, scale_type{7}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
+  auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
+TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
+{
+  // This is testing fixed_point values outside the range of its underlying type.
+  // For example, 100,000,000 with a scale of 100 is 10 ^ 108, which is far outside the
+  // range of an int64_t.
+
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+
+  auto const a = fp_wrapper{{100000000}, scale_type{100}};
+  auto const b = fp_wrapper{{5000000}, scale_type{101}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
+  auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
+{
+  // This is testing fixed_point values with an extremely large negative scale. The fixed_point
+  // implementation should be able to handle any scale representable by an int32_t.
+
+  using decimalXX  = TypeParam;
+  using RepType    = device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const a = fp_wrapper{{10}, scale_type{-201}};
+  auto const b = fp_wrapper{{50}, scale_type{-202}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
+  auto const expected2 = fp_wrapper{{5}, scale_type{-201}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
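A worked version of the overflow scenario these tests guard against (arithmetic only, derived from the test values above):

// rep 100000000 (10^8) at scale 6 represents 10^8 * 10^6 = 10^14, which cannot
// be materialized in an int32_t (max ~2.1 * 10^9). The tests check that the
// binary ops keep working on the stored reps, rescaling by the *difference*
// of scales instead: for a + b, b is rescaled from scale 7 to 6
// (5000000 * 10 = 50000000), and 100000000 + 50000000 = 150000000 at scale 6,
// which fits comfortably in the rep type.
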
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 88b7a4f4bb2..4dae480d39e 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@
 #include
 
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 6a50aed3f7e..886af048aac 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include
 
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index d1f799f0d84..a93c3170445 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 949c1bd2597..743634fd6d3 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -881,9 +880,7 @@ TEST_F(ParquetChunkedWriterTest, SingleTable)
   auto filepath = temp_env->get_temp_filepath("ChunkedSingle.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -903,10 +900,7 @@ TEST_F(ParquetChunkedWriterTest, SimpleTable)
   auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -926,10 +920,7 @@ TEST_F(ParquetChunkedWriterTest, LargeTables)
   auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  auto md = cudf_io::write_parquet_chunked_end(state);
+  auto md = cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2).close();
   CUDF_EXPECTS(!md, "The return value should be null.");
 
   cudf_io::parquet_reader_options read_opts =
@@ -956,11 +947,11 @@ TEST_F(ParquetChunkedWriterTest, ManyTables)
   auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  std::for_each(table_views.begin(), table_views.end(), [&state](table_view const& tbl) {
-    cudf_io::write_parquet_chunked(tbl, state);
+  cudf_io::parquet_chunked_writer writer(args);
+  std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) {
+    writer.write(tbl);
   });
-  auto md = cudf_io::write_parquet_chunked_end(state, true, "dummy/path");
+  auto md = writer.close("dummy/path");
   CUDF_EXPECTS(md, "The returned metadata should not be null.");
 
   cudf_io::parquet_reader_options read_opts =
@@ -991,10 +982,7 @@ TEST_F(ParquetChunkedWriterTest, Strings)
   auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1053,10 +1041,7 @@ TEST_F(ParquetChunkedWriterTest, ListColumn)
   auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl0, state);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl0).write(tbl1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1074,10 +1059,39 @@ TEST_F(ParquetChunkedWriterTest, MismatchedTypes)
   auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table2, state), cudf::logic_error);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table1);
+  EXPECT_THROW(writer.write(*table2), cudf::logic_error);
+  writer.close();
+}
+
+TEST_F(ParquetChunkedWriterTest, ChunkedWriteAfterClosing)
+{
+  srand(31337);
+  auto table = create_random_fixed_table<int>(4, 4, true);
+
+  auto filepath = temp_env->get_temp_filepath("ChunkedWriteAfterClosing.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table).close();
+  EXPECT_THROW(writer.write(*table), cudf::logic_error);
+}
+
+TEST_F(ParquetChunkedWriterTest, ReadingUnclosedFile)
+{
+  srand(31337);
+  auto table = create_random_fixed_table<int>(4, 4, true);
+
+  auto filepath = temp_env->get_temp_filepath("ReadingUnclosedFile.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error);
 }
 
 TEST_F(ParquetChunkedWriterTest, MismatchedStructure)
@@ -1089,10 +1103,10 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructure)
   auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table2, state), cudf::logic_error);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table1);
+  EXPECT_THROW(writer.write(*table2), cudf::logic_error);
+  writer.close();
 }
 
 TEST_F(ParquetChunkedWriterTest, MismatchedStructureList)
@@ -1129,9 +1143,9 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructureList)
   auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl0, state);
-  CUDF_EXPECT_THROW_MESSAGE(cudf_io::write_parquet_chunked(tbl1, state),
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(tbl0);
+  CUDF_EXPECT_THROW_MESSAGE(writer.write(tbl1),
                             "Mismatch in schema between multiple calls to write_chunk");
 }
 
@@ -1146,10 +1160,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability)
   auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1179,10 +1190,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability)
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1238,10 +1246,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table1, state);
-  cudf_io::write_parquet_chunked(table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table1).write(table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1258,20 +1263,22 @@ TEST_F(ParquetChunkedWriterTest, WrongNullability)
   auto filepath = temp_env->get_temp_filepath("ChunkedWrongNullable.parquet");
 
   cudf::io::table_metadata_with_nullability nullable_metadata;
+  // The number of columns with masks in the table (i.e. 5) and the size of the
+  // column nullability vector (i.e. 6) are mismatched.
   nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 6, true);
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table1, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(*table1), cudf::logic_error);
 
   nullable_metadata.column_nullable.clear();
+  // The number of columns with masks in the table (i.e. 5) and the size of the
+  // column nullability vector (i.e. 4) are mismatched.
   nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 4, true);
   cudf_io::chunked_parquet_writer_options args2 =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  state = cudf_io::write_parquet_chunked_begin(args2);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table1, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args2).write(*table1), cudf::logic_error);
 }
 
@@ -1285,10 +1292,9 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroups)
   auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  {
+    cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
+  }
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath})
@@ -1306,9 +1312,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError)
   auto filepath = temp_env->get_temp_filepath("ChunkedRowGroupsError.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}).row_groups({{0, 1}});
@@ -1339,40 +1343,32 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite)
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
 
   // verify failure if no decimal precision given
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify failure if too small a precision is given
   std::vector<uint8_t> precisions{7, 1};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify failure if too few precisions given
   precisions.pop_back();
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify success if equal precision is given
   precisions = {7, 9};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table);
 
   // verify failure if too many precisions given
   precisions = {7, 14, 11};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // write correctly
   precisions.pop_back();
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1419,10 +1415,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize)
   auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1469,10 +1462,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2)
   auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
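The tests above migrate from the begin/write/end free functions to the new chainable writer object. A usage sketch mirroring the test style (function name and tables are hypothetical):

#include <cudf/io/parquet.hpp>

namespace cudf_io = cudf::io;

// Open once, chain any number of writes, then close explicitly; close()
// replaces write_parquet_chunked_end() and can return file metadata.
void write_both(cudf::table_view const& t1, cudf::table_view const& t2, std::string const& path)
{
  cudf_io::chunked_parquet_writer_options args =
    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{path});
  cudf_io::parquet_chunked_writer writer(args);
  writer.write(t1).write(t2);
  writer.close();
}
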
diff --git a/cpp/tests/jit/jit-cache-test.hpp b/cpp/tests/jit/jit-cache-test.hpp
index 44736b821ae..261cc0fd3b4 100644
--- a/cpp/tests/jit/jit-cache-test.hpp
+++ b/cpp/tests/jit/jit-cache-test.hpp
@@ -123,10 +123,10 @@ struct JitCacheTest : public ::testing::Test, public cudf::jit::cudfJitCache {
 
 /**
 * @brief Similar to JitCacheTest but it doesn't run warmUp() test in SetUp and
- * purgeFileCache() in TearDown
+ * purgeFileCache() in SetUp and TearDown
 */
 struct JitCacheMultiProcessTest : public JitCacheTest {
-  virtual void SetUp() { purgeFileCache(); }
+  virtual void SetUp() {}
   virtual void TearDown() {}
 };
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
new file mode 100644
index 00000000000..1885f626490
--- /dev/null
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace test {
+
+struct ContainsTest : public BaseFixture {
+};
+
+using ContainsTestTypes = Concat<IntegralTypesNotBool, FloatingPointTypes, ChronoTypes>;
+
+template <typename T>
+struct TypedContainsTest : public ContainsTest {
+};
+
+TYPED_TEST_CASE(TypedContainsTest, ContainsTestTypes);
+
+namespace {
+template <typename T, typename std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
+auto create_scalar_search_key(T const& value)
+{
+  auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T,
+          typename std::enable_if_t<std::is_same<T, std::string>::value, void>* = nullptr>
+auto create_scalar_search_key(std::string const& value)
+{
+  return make_string_scalar(value);
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr>
+auto create_scalar_search_key(typename T::rep const& value)
+{
+  auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
+auto create_scalar_search_key(typename T::rep const& value)
+{
+  auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+}  // namespace
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNoNulls)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {0, 1, 2},
+    {3, 4, 5},
+    {6, 7, 8},
+    {9, 0, 1},
+    {2, 3, 4},
+    {5, 6, 7},
+    {8, 9, 0},
+    {},
+    {1, 2, 3},
+    {}}.release();
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{1, 0, 0, 1, 0, 0, 0, 0, 1, 0};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullLists)
+{
+  // Test List columns that have NULL list rows.
+
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {{0, 1, 2},
+     {3, 4, 5},
+     {6, 7, 8},
+     {},
+     {9, 0, 1},
+     {2, 3, 4},
+     {5, 6, 7},
+     {8, 9, 0},
+     {},
+     {1, 2, 3},
+     {}},
+    make_counting_transform_iterator(0, [](auto i) {
+      return (i != 3) && (i != 10);
+    })}.release();
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{
+    {1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0},
+    make_counting_transform_iterator(0, [](auto i) { return (i != 3) && (i != 10); })};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarNonNullListsWithNullValues)
+{
+  // Test List columns that have no NULL list rows, but NULL elements in some list rows.
+  using T = TypeParam;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto search_space =
+    make_lists_column(8,
+                      fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+                      numerals.release(),
+                      0,
+                      {});
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 1, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullsInLists)
+{
+  using T = TypeParam;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    numerals.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, BoolListContainsScalarWithNullsInLists)
+{
+  using T = bool;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    numerals.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 1, 0, 0, 1, 0, 1}, {0, 1, 1, 1, 0, 1, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, StringListContainsScalarWithNullsInLists)
+{
+  using T = std::string;
+
+  auto strings = strings_column_wrapper{
+    {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    strings.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>("1");
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ContainsScalarNullSearchKey)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {{0, 1, 2},
+     {3, 4, 5},
+     {6, 7, 8},
+     {},
+     {9, 0, 1},
+     {2, 3, 4},
+     {5, 6, 7},
+     {8, 9, 0},
+     {},
+     {1, 2, 3},
+     {}},
+    make_counting_transform_iterator(0, [](auto i) {
+      return (i != 3) && (i != 10);
+    })}.release();
+
+  auto search_key_null = create_null_search_key<T>();
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_null);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    make_counting_transform_iterator(0, [](auto i) { return false; })};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, ScalarTypeRelatedExceptions)
+{
+  {
+    // Nested types unsupported.
+    auto list_of_lists = lists_column_wrapper<int32_t>{
+      {{1, 2, 3}, {4, 5, 6}},
+      {{1, 2, 3}, {4, 5, 6}},
+      {{1, 2, 3}, {4, 5, 6}}}.release();
+    auto skey = create_scalar_search_key<int32_t>(10);
+    CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), *skey),
+                              "Nested types not supported in lists::contains()");
+  }
+
+  {
+    // Search key must match list elements in type.
+    auto list_of_ints =
+      lists_column_wrapper<int32_t>{
+        {0, 1, 2},
+        {3, 4, 5},
+      }
+        .release();
+    auto skey = create_scalar_search_key<std::string>("Hello, World!");
+    CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), *skey),
+                              "Type of search key does not match list column element type.");
+  }
+}
+
+template <typename T>
+struct TypedVectorContainsTest : public ContainsTest {
+};
+
+using VectorContainsTestTypes =
+  cudf::test::Concat<IntegralTypesNotBool, FloatingPointTypes, ChronoTypes>;
+
+TYPED_TEST_CASE(TypedVectorContainsTest, VectorContainsTestTypes);
+
+TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNoNulls)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {0, 1, 2},
+    {3, 4, 5},
+    {6, 7, 8},
+    {9, 0, 1},
+    {2, 3, 4},
+    {5, 6, 7},
+    {8, 9, 0},
+    {},
+    {1, 2, 3},
+    {}}.release();
+
+  auto search_key = fixed_width_column_wrapper<T, int32_t>{1, 2, 3, 1, 2, 3, 1, 2, 3, 1};
+
+  auto actual_result = lists::contains(search_space->view(), search_key);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{1, 0, 0, 1, 1, 0, 0, 0, 1, 0};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullLists)
+{
+  // Test List columns that have NULL list rows.
+ + using T = TypeParam; + + auto search_space = lists_column_wrapper{ + {{0, 1, 2}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 3}, + {}}, + make_counting_transform_iterator(0, [](auto i) { + return (i != 3) && (i != 10); + })}.release(); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = fixed_width_column_wrapper{ + {1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}, + make_counting_transform_iterator(0, [](auto i) { return (i != 3) && (i != 10); })}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorNonNullListsWithNullValues) +{ + // Test List columns that have no NULL list rows, but NULL elements in some list rows. + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto search_space = + make_lists_column(8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 0, + {}); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 1, 0, 1, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInLists) +{ + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 0, 0, 1, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInListsAndInSearchKeys) +{ + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{ + {1, 2, 3, 1, 2, 3, 1, 3}, make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} 
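
[Editor's note: the scalar- and vector-key tests above all drive the same entry point, `cudf::lists::contains`. For orientation, a minimal sketch of calling the C++ API outside the test harness follows; the header paths and the pre-built `lists` column are assumptions, and the comments restate the null semantics the tests verify.]

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// `lists` is assumed to be an existing LIST<INT32> column view built elsewhere.
std::unique_ptr<cudf::column> contains_one(cudf::column_view const& lists)
{
  cudf::numeric_scalar<int32_t> key{1};  // the scalar search key
  // Result is a BOOL8 column with one row per list row. Per the tests above, a
  // result row is null when the list row is null, or when the key is absent
  // and the list row contains at least one null element.
  return cudf::lists::contains(cudf::lists_column_view{lists}, key);
}
```

The vector form takes a `column_view` of keys, one per list row, in place of the scalar.
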
+ +TEST_F(ContainsTest, BoolListContainsVectorWithNullsInListsAndInSearchKeys) +{ + using T = bool; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{ + {0, 1, 0, 1, 0, 0, 1, 1}, make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TEST_F(ContainsTest, StringListContainsVectorWithNullsInListsAndInSearchKeys) +{ + auto numerals = strings_column_wrapper{ + {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = + strings_column_wrapper{{"1", "2", "3", "1", "2", "3", "1", "3"}, + make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TEST_F(ContainsTest, VectorTypeRelatedExceptions) +{ + { + // Nested types unsupported. + auto list_of_lists = lists_column_wrapper{ + {{1, 2, 3}, {4, 5, 6}}, + {{1, 2, 3}, {4, 5, 6}}, + {{1, 2, 3}, + {4, 5, 6}}}.release(); + auto skey = fixed_width_column_wrapper{0, 1, 2}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), skey), + "Nested types not supported in lists::contains()"); + } + + { + // Search key must match list elements in type. + auto list_of_ints = + lists_column_wrapper{ + {0, 1, 2}, + {3, 4, 5}, + } + .release(); + auto skey = strings_column_wrapper{"Hello", "World"}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), + "Type of search key does not match list column element type."); + } + + { + // Search key column size must match lists column size. + auto list_of_ints = lists_column_wrapper{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}.release(); + + auto skey = fixed_width_column_wrapper{0, 1, 2, 3}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), + "Number of search keys must match list column size."); + } +} + +} // namespace test + +} // namespace cudf diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp new file mode 100644 index 00000000000..c5cb9d230c3 --- /dev/null +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +struct ListsElementsTest : public cudf::test::BaseFixture { +}; + +using NumericTypesNotBool = + cudf::test::Concat; + +template +class ListsElementsNumericsTest : public ListsElementsTest { +}; + +TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool); + +TYPED_TEST(ListsElementsNumericsTest, CountElements) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsStrings) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsSliced) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); + + auto sliced = cudf::slice(input, {1, 4}).front(); + auto result = cudf::lists::count_elements(cudf::lists_column_view(sliced)); + cudf::test::fixed_width_column_wrapper expected({0, 4, 2}, {0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists) +{ + std::vector validity{1, 0, 1, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW list({LCW{LCW{2, 3}, LCW{4, 5}}, + LCW{LCW{}}, + LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW({12, 13, 14}, validity.begin())}, + LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}}, + validity.begin()); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(list)); + cudf::test::fixed_width_column_wrapper expected({2, 1, 3, 5}, {1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsEmpty) +{ + using LCW = cudf::test::lists_column_wrapper; + + LCW empty{}; + auto result = cudf::lists::count_elements(cudf::lists_column_view(empty)); + EXPECT_EQ(0, result->size()); +} diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index 2df3ce9e021..48ee77d565f 
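
[Editor's note: the count_elements tests above likewise reduce to a single call. A minimal sketch under the same assumptions (header path per cudf conventions; `input` built elsewhere):]

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>

// `input` is assumed to be an existing LIST column view. The result is an
// INT32 column holding each row's element count; null list rows produce null
// output rows, as the tests above verify.
std::unique_ptr<cudf::column> list_lengths(cudf::column_view const& input)
{
  return cudf::lists::count_elements(cudf::lists_column_view{input});
}
```
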
100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -114,7 +114,7 @@ TEST_F(ByteCastTest, int32ValuesWithNulls) 5, std::move(fixed_width_column_wrapper{0, 4, 8, 12, 16, 20}.release()), std::move(int32_data.release()), - 3, + 2, detail::make_null_mask(even_validity, even_validity + 5)); auto const output_int32 = cudf::byte_cast(int32_col, cudf::flip_endianness::YES); diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp new file mode 100644 index 00000000000..6f98332243e --- /dev/null +++ b/cpp/tests/reshape/explode_tests.cpp @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +using namespace cudf::test; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + +TEST_F(ExplodeTest, Empty) +{ + lists_column_wrapper a{}; + fixed_width_column_wrapper b{}; + + cudf::table_view t({a, b}); + + auto ret = cudf::explode(t, 0); + + fixed_width_column_wrapper expected_a{}; + fixed_width_column_wrapper expected_b{}; + cudf::table_view expected({expected_a, expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NonList) +{ + fixed_width_column_wrapper a{100, 200, 300}; + fixed_width_column_wrapper b{100, 200, 300}; + + cudf::table_view t({a, b}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + fixed_width_column_wrapper a{100, 200, 300}; + lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; + fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [] 300 + [0, 3] 400 + */ + + auto first_invalid = + cudf::test::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? 
false : true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{}, + lists_column_wrapper{0, 3}}, + first_invalid); + fixed_width_column_wrapper b({100, 200, 300, 400}); + + fixed_width_column_wrapper expected_a{5, 6, 0, 3}; + fixed_width_column_wrapper expected_b{200, 200, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); + fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + /* + a b + [1, 2, 7] 100 + [5, 6, 0, 9] 200 + [] 300 + [0, 3, 8] 400 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), + lists_column_wrapper({5, 6, 0, 9}, valids), + lists_column_wrapper{}, + lists_column_wrapper({0, 3, 8}, valids)}; + fixed_width_column_wrapper b{100, 200, 300, 400}; + + fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, + {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[],[5],[2, 1]] 300 + */ + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}; + fixed_width_column_wrapper b{100, 200, 300}; + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NestedNulls) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b({100, 200, 300}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + /* + a b + [[1, 2], [], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}; + fixed_width_column_wrapper b{100, 200, 300}; + + fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper expected_b{ + 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + ret = cudf::explode(ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NestedStructs) +{ + /* + a b + [[1, 2], [7, 6, 5]] {100, "100"} + [[5, 6]] {200, "200"} + [[0, 3],[5],[2, 1]] {300, "300"} + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + /* + a b + [{70, "70"}, {75, "75"}] 100 + [{50, "50"}, {55, "55"}] 200 + [{35, "35"}, {45, "45"}] 300 + [{25, "25"}, {30, "30"}] 400 + [{15, "15"}, {20, "20"}] 500 +*/ + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(struct_col), + cudf::UNKNOWN_NULL_COUNT, + {}); + + fixed_width_column_wrapper b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, SlicedList) +{ + /* + a b + [[1, 2],[7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + [[8, 3],[],[4, 3, 1, 2]] 400 + [[2, 3, 4],[9, 8]] 500 + + slicing the top 2 rows and the bottom row off + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}, + lists_column_wrapper{lists_column_wrapper{8, 3}, + lists_column_wrapper{}, + lists_column_wrapper({4, 3, 1, 2}, valids)}, + lists_column_wrapper{lists_column_wrapper{2, 3, 4}, + lists_column_wrapper{9, 8}}}); + fixed_width_column_wrapper b({100, 200, 300, 400, 500}); + + lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids), + lists_column_wrapper{8, 3}, + lists_column_wrapper{}, + lists_column_wrapper({4, 3, 1, 2}, valids)}; + fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index c6565ac72dc..c501071ccbe 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 066a6624fb7..ad693f96c4d 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -249,7 +249,7 @@ TEST_F(ApplyBooleanMask, NoNullInput) TEST_F(ApplyBooleanMask, CorrectNullCount) { - cudf::size_type inputRows = 75000; + cudf::size_type inputRows = 471234; auto seq1 = cudf::test::make_counting_transform_iterator(0, [](auto i) { return i; }); auto valid_seq1 = cudf::test::make_counting_transform_iterator(0, [](auto row) { return true; }); diff --git a/cpp/tests/strings/attrs_tests.cpp b/cpp/tests/strings/attrs_tests.cpp index 396895dc055..117a215374a 100644 --- a/cpp/tests/strings/attrs_tests.cpp +++ b/cpp/tests/strings/attrs_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index c01b220d9da..97b1dd716d7 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
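
[Editor's note: every explode test above exercises one API call. A minimal sketch of that call, with the declaration's header location an assumption; the behavior in the comments restates what the tests check.]

```cpp
#include <cudf/lists/explode.hpp>  // assumed location of the cudf::explode declaration
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Explodes list column `idx` of `input`: each list element becomes its own
// row, and the values of the other columns are repeated. Per the tests above,
// null list rows are dropped entirely, while nulls inside a list survive as
// null rows in the output.
std::unique_ptr<cudf::table> explode_column(cudf::table_view const& input, cudf::size_type idx)
{
  return cudf::explode(input, idx);
}
```
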
@@ -23,7 +23,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 3a3613cc35d..40775382e16 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index c604e83f05d..3c970a5d1f1 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -65,3 +65,25 @@ TEST_F(RowOperatorTestForNAN, NANSorting) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got2->view()); } + +TEST_F(RowOperatorTestForNAN, NANSortingNonNull) +{ + cudf::test::fixed_width_column_wrapper input{ + {0., + double(NAN), + -1., + 7., + std::numeric_limits::infinity(), + 1., + -1 * std::numeric_limits::infinity()}}; + + cudf::table_view input_table{{input}}; + + auto result = cudf::sorted_order(input_table, {cudf::order::ASCENDING}); + cudf::test::fixed_width_column_wrapper expected_asc{{6, 2, 0, 5, 3, 4, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_asc, result->view()); + + result = cudf::sorted_order(input_table, {cudf::order::DESCENDING}); + cudf::test::fixed_width_column_wrapper expected_desc{{1, 4, 3, 5, 0, 2, 6}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_desc, result->view()); +} diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 5e49e2e854a..20d1c5df5ea 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include struct MaskToNullTest : public cudf::test::BaseFixture { void run_test(std::vector input, std::vector val) diff --git a/cpp/tests/transform/mask_to_bools_test.cpp b/cpp/tests/transform/mask_to_bools_test.cpp index 2b0325336e1..2a759ffcfe5 100644 --- a/cpp/tests/transform/mask_to_bools_test.cpp +++ b/cpp/tests/transform/mask_to_bools_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include #include #include -#include struct MaskToBools : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 62f31233c80..4f7ac41a00f 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -67,6 +67,8 @@ struct column_property_comparator { if (lhs.size() > 0 && check_exact_equality) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); } + EXPECT_EQ(lhs.null_count(), rhs.null_count()); + // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. Specifically, empty string columns may or may not have children. 
if (check_exact_equality || lhs.size() > 0) { diff --git a/cpp/tests/utilities/scalar_utilities.cu b/cpp/tests/utilities/scalar_utilities.cu deleted file mode 100644 index 6149356e2f3..00000000000 --- a/cpp/tests/utilities/scalar_utilities.cu +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include "gtest/gtest.h" - -using cudf::scalar_type_t; - -namespace cudf { -namespace test { -namespace { -struct compare_scalar_functor { - template - void operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) - { - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_EQ(lhs_t.value(), rhs_t.value()); - } -}; - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_FLOAT_EQ(lhs_t.value(), rhs_t.value()); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_DOUBLE_EQ(lhs_t.value(), rhs_t.value()); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: dictionary"); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: list_view"); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: struct_view"); -} - -} // anonymous namespace - -void expect_scalars_equal(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - EXPECT_EQ(lhs.type(), rhs.type()); - EXPECT_EQ(lhs.is_valid(), rhs.is_valid()); - - if (lhs.is_valid() && rhs.is_valid() && lhs.type() == rhs.type()) { - type_dispatcher(lhs.type(), compare_scalar_functor{}, lhs, rhs); - } -} - -} // namespace test -} // namespace cudf diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 09cfcabdcdb..8daf3a0850e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -21,6 +21,7 @@ # import os import sys +from recommonmark.transform import AutoStructify sys.path.insert(0, os.path.abspath("../..")) @@ -200,8 +201,15 @@ autoclass_content = "init" +# Config AutoStructify +github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' def setup(app): app.add_js_file("copybutton_pydocs.js") app.add_css_file("params.css") app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + 'auto_toc_tree_section': 'Contents', + }, True) + app.add_transform(AutoStructify) diff --git 
a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 4d775e3d51a..7e96d4fe38c 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -33,6 +33,37 @@ import cudf >>> gb3 = df.groupby(cudf.Series(['a', 'a', 'b', 'b', 'b'])) # grouping by an external column ``` +``` warning:: + cuDF uses `sort=False` by default to achieve better performance, which provides no guarantee of the group order in the output. This deviates from Pandas' default behavior. + + For example: + + .. code-block:: python + + >>> df = cudf.DataFrame({'a' : [2, 2, 1], 'b' : [42, 21, 11]}) + >>> df.groupby('a').sum() + b + a + 2 63 + 1 11 + >>> df.to_pandas().groupby('a').sum() + b + a + 1 11 + 2 63 + + Setting `sort=True` will produce Pandas-like output, but with some performance penalty: + + .. code-block:: python + + >>> df.groupby('a', sort=True).sum() + b + a + 1 11 + 2 63 + +``` + ### Grouping by index levels You can also group by one or more levels of a MultiIndex: @@ -66,7 +97,7 @@ b Aggregations on groups are supported via the `agg` method: -``` +```python >>> df a b c 0 1 1 1 diff --git a/java/pom.xml b/java/pom.xml index ddd0d06a74f..387ef1cb65b 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -132,6 +132,12 @@ 2.25.0 test + + org.apache.arrow + arrow-vector + ${arrow.version} + test + @@ -151,6 +157,7 @@ ALL ${project.build.directory}/cmake-build 1.7.30 + 0.15.1 diff --git a/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java b/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java new file mode 100644 index 00000000000..b3c97930d2a --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java @@ -0,0 +1,113 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.nio.ByteBuffer; +import java.util.ArrayList; + +/** + * Column builder from Arrow data. This builder takes in byte buffers referencing + * Arrow data and allows efficient building of CUDF ColumnVectors from that Arrow data. + * The caller can add multiple batches where each batch corresponds to Arrow data + * and those batches get concatenated together after being converted to CUDF + * ColumnVectors. + * This currently only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported. + */ +public final class ArrowColumnBuilder implements AutoCloseable { + private DType type; + private final ArrayList data = new ArrayList<>(); + private final ArrayList validity = new ArrayList<>(); + private final ArrayList offsets = new ArrayList<>(); + private final ArrayList nullCount = new ArrayList<>(); + private final ArrayList rows = new ArrayList<>(); + + public ArrowColumnBuilder(HostColumnVector.DataType type) { + this.type = type.getType(); + } + + /** + * Add an Arrow buffer. This API allows you to add multiple batches if you want them + * combined into a single ColumnVector.
+ * Note, this takes all data, validity, and offsets buffers, but they may not all + * be needed based on the data type. A buffer should be null if it's not used + * for that type. + * This API only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported. + * @param rows - number of rows in this Arrow buffer + * @param nullCount - number of null values in this Arrow buffer + * @param data - ByteBuffer of the Arrow data buffer + * @param validity - ByteBuffer of the Arrow validity buffer + * @param offsets - ByteBuffer of the Arrow offsets buffer + */ + public void addBatch(long rows, long nullCount, ByteBuffer data, ByteBuffer validity, + ByteBuffer offsets) { + this.rows.add(rows); + this.nullCount.add(nullCount); + this.data.add(data); + this.validity.add(validity); + this.offsets.add(offsets); + } + + /** + * Create the immutable ColumnVector, copied to the device based on the Arrow data. + * @return - new ColumnVector + */ + public final ColumnVector buildAndPutOnDevice() { + int numBatches = rows.size(); + ArrayList allVecs = new ArrayList<>(numBatches); + ColumnVector vecRet; + try { + for (int i = 0; i < numBatches; i++) { + allVecs.add(ColumnVector.fromArrow(type, rows.get(i), nullCount.get(i), + data.get(i), validity.get(i), offsets.get(i))); + } + if (numBatches == 1) { + vecRet = allVecs.get(0); + } else if (numBatches > 1) { + vecRet = ColumnVector.concatenate(allVecs.toArray(new ColumnVector[0])); + } else { + throw new IllegalStateException("Can't build a ColumnVector when no Arrow batches specified"); + } + } finally { + // close the vectors that were concatenated + if (numBatches > 1) { + allVecs.forEach(cv -> cv.close()); + } + } + return vecRet; + } + + @Override + public void close() { + // memory buffers owned outside of this + } + + @Override + public String toString() { + return "ArrowColumnBuilder{" + + "type=" + type + + ", data=" + data + + ", validity=" + validity + + ", offsets=" + offsets + + ", nullCount=" + nullCount + + ", rows=" + rows + + '}'; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 88c024a437b..252f869a049 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -25,6 +25,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -310,6 +311,50 @@ public BaseDeviceMemoryBuffer getDeviceBufferFor(BufferType type) { return srcBuffer; } + /** + * Ensures the ByteBuffer passed in is a direct byte buffer. + * If it is not, then it creates one, copies the data from the + * byte buffer passed in into the direct byte buffer it created, + * and returns it. + */ + private static ByteBuffer bufferAsDirect(ByteBuffer buf) { + ByteBuffer bufferOut = buf; + if (bufferOut != null && !bufferOut.isDirect()) { + bufferOut = ByteBuffer.allocateDirect(buf.remaining()); + bufferOut.put(buf); + bufferOut.flip(); + } + return bufferOut; + } + + /** + * Create a ColumnVector from the Apache Arrow byte buffers passed in. + * Any of the buffers not used for that datatype should be set to null. + * The buffers are expected to be off heap buffers, but if they are not, + * it will handle copying them to direct byte buffers. + * This only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported.
+ * @param type - type of the column + * @param numRows - Number of rows in the arrow column + * @param nullCount - Null count + * @param data - ByteBuffer of the Arrow data buffer + * @param validity - ByteBuffer of the Arrow validity buffer + * @param offsets - ByteBuffer of the Arrow offsets buffer + * @return - new ColumnVector + */ + public static ColumnVector fromArrow( + DType type, + long numRows, + long nullCount, + ByteBuffer data, + ByteBuffer validity, + ByteBuffer offsets) { + long columnHandle = fromArrow(type.typeId.getNativeId(), numRows, nullCount, + bufferAsDirect(data), bufferAsDirect(validity), bufferAsDirect(offsets)); + ColumnVector vec = new ColumnVector(columnHandle); + return vec; + } + /** * Create a new vector of length rows, where each row is filled with the Scalar's * value @@ -615,6 +660,10 @@ public ColumnVector castTo(DType type) { private static native long sequence(long initialValue, long step, int rows); + private static native long fromArrow(int type, long col_length, + long null_count, ByteBuffer data, ByteBuffer validity, + ByteBuffer offsets) throws CudfException; + private static native long fromScalar(long scalarHandle, int rowCount) throws CudfException; private static native long makeList(long[] handles, long typeHandle, int scale, long rows) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index c2110a5f8ff..1dce52f7105 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2323,6 +2323,37 @@ public static ColumnView makeStructView(ColumnView... columns) { return makeStructView(columns[0].rows, columns); } + /** + * Create a column of bool values indicating whether the specified scalar + * is an element of each row of a list column. + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The key is null + * 2. The column vector list value is null + * 3. The list row does not contain the key, and contains at least + * one null. + * @param key the scalar to look up + * @return a Boolean ColumnVector with the result of the lookup + */ + public final ColumnVector listContains(Scalar key) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + return new ColumnVector(listContains(getNativeView(), key.getScalarHandle())); + } + + /** + * Create a column of bool values indicating whether the list rows of the first + * column contain the corresponding values in the second column. + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The key value is null + * 2. The column vector list value is null + * 3. The list row does not contain the key, and contains at least + * one null.
+ * @param key the ColumnVector with look up values + * @return a Boolean ColumnVector with the result of the lookup + */ + public final ColumnVector listContainsColumn(ColumnView key) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + return new ColumnVector(listContainsColumn(getNativeView(), key.getNativeView())); + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -2558,6 +2589,22 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long extractListElement(long nativeView, int index); + /** + * Native method for list lookup + * @param nativeView the column view handle of the list + * @param key the scalar key handle + * @return column handle of the resultant + */ + private static native long listContains(long nativeView, long key); + + /** + * Native method for list lookup + * @param nativeView the column view handle of the list + * @param keyColumn the column handle of look up keys + * @return column handle of the resultant + */ + private static native long listContainsColumn(long nativeView, long keyColumn); + private static native long castTo(long nativeHandle, int type, int scale); private static native long logicalCastTo(long nativeHandle, int type, int scale); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 14748db872d..da4c446d9f7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -501,6 +501,8 @@ private static native long[] repeatColumnCount(long tableHandle, long columnHandle, boolean checkCount); + private static native long[] explode(long tableHandle, int index); + private native long createCudfTableView(long[] nativeColumnViewHandles); ///////////////////////////////////////////////////////////////////////////// @@ -1615,6 +1617,47 @@ public ContiguousTable[] contiguousSplit(int... indices) { return contiguousSplit(nativeHandle, indices); } + /** + * Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded + * into new rows in the output. The corresponding rows for other columns in the input + * are duplicated. + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300] + * + * + * Nulls propagate in different ways depending on what is null. + * + * [[5,null,15], 100], + * [null, 200] + * returns: + * [5, 100], + * [null, 100], + * [15, 100] + * + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * + * @param index Column index to explode inside the table. + * @return A new table with explode_col exploded. 
+ */ + public Table explode(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explode(nativeHandle, index)); + } /** * Gathers the rows of this table according to `gatherMap` such that row "i" @@ -2658,11 +2701,15 @@ private static ColumnVector from(DType type, Object dataArray) { } @SuppressWarnings("unchecked") - private static ColumnVector fromLists(DataType dataType, Object[][] dataArray) { + private static ColumnVector fromLists(DataType dataType, Object[] dataArray) { List[] dataLists = new List[dataArray.length]; for (int i = 0; i < dataLists.length; ++i) { - Object[] dataList = dataArray[i]; - dataLists[i] = dataList != null ? Arrays.asList(dataList) : null; + // The element in dataArray can be an array or list, because the below overloaded + // version accepts a List of Array as rows. + // `public TestBuilder column(ListType dataType, List... values)` + Object dataList = dataArray[i]; + dataLists[i] = dataList == null ? null : + (dataList instanceof List ? (List)dataList : Arrays.asList((Object[])dataList)); } return ColumnVector.fromLists(dataType, dataLists); } @@ -2680,7 +2727,7 @@ public Table build() { Object dataArray = typeErasedData.get(i); if (dtype.isNestedType()) { if (dtype.equals(DType.LIST)) { - columns.add(fromLists(dataType, (Object[][]) dataArray)); + columns.add(fromLists(dataType, (Object[]) dataArray)); } else if (dtype.equals(DType.STRUCT)) { columns.add(fromStructs(dataType, (StructData[]) dataArray)); } else { diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 3bce4912fa4..a1e8517c646 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -50,6 +53,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env, jclass, + jint j_type, + jlong j_col_length, + jlong j_null_count, + jobject j_data_obj, + jobject j_validity_obj, + jobject j_offsets_obj) { + try { + cudf::jni::auto_set_device(env); + cudf::type_id n_type = static_cast(j_type); + // not all the buffers are used for all types + void const *data_address = 0; + int data_length = 0; + if (j_data_obj != 0) { + data_address = env->GetDirectBufferAddress(j_data_obj); + data_length = env->GetDirectBufferCapacity(j_data_obj); + } + void const *validity_address = 0; + int validity_length = 0; + if (j_validity_obj != 0) { + validity_address = env->GetDirectBufferAddress(j_validity_obj); + validity_length = env->GetDirectBufferCapacity(j_validity_obj); + } + void const *offsets_address = 0; + int offsets_length = 0; + if (j_offsets_obj != 0) { + offsets_address = env->GetDirectBufferAddress(j_offsets_obj); + offsets_length = env->GetDirectBufferCapacity(j_offsets_obj); + } + auto data_buffer = arrow::Buffer::Wrap(static_cast(data_address), static_cast(data_length)); + auto null_buffer = arrow::Buffer::Wrap(static_cast(validity_address), static_cast(validity_length)); + auto offsets_buffer = arrow::Buffer::Wrap(static_cast(offsets_address), static_cast(offsets_length)); + + cudf::jni::native_jlongArray outcol_handles(env, 1); + std::shared_ptr arrow_array; + switch (n_type) { + case cudf::type_id::DECIMAL32: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet", 0); + break; + case cudf::type_id::DECIMAL64: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet", 0); + break; + case cudf::type_id::STRUCT: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting STRUCT yet", 0); + break; + case cudf::type_id::LIST: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting LIST yet", 0); + break; + case cudf::type_id::DICTIONARY32: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DICTIONARY32 yet", 0); + break; + case cudf::type_id::STRING: + arrow_array = std::make_shared(j_col_length, offsets_buffer, data_buffer, null_buffer, j_null_count); + break; + default: + // this handles the primitive types + arrow_array = cudf::detail::to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count); + } + auto name_and_type = arrow::field("col", arrow_array->type()); + std::vector> fields = {name_and_type}; + std::shared_ptr schema = std::make_shared(fields); + auto arrow_table = arrow::Table::Make(schema, std::vector>{arrow_array}); + std::unique_ptr table_result = cudf::from_arrow(*(arrow_table)); + std::vector> retCols = table_result->release(); + if (retCols.size() != 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0); + } + return reinterpret_cast(retCols[0].release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, jobject j_object, jlongArray handles, jlong j_type, diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 621344ac38f..82e71b04a2f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ 
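
[Editor's note: the JNI path above funnels into `cudf::from_arrow`. A minimal host-side sketch of that interop call in isolation follows; it assumes a libcudf build with Arrow enabled and the `cudf/interop.hpp` header location, and omits Arrow builder status checks.]

```cpp
#include <arrow/api.h>
#include <cudf/interop.hpp>  // assumed location of cudf::from_arrow
#include <cudf/table/table.hpp>

std::unique_ptr<cudf::table> arrow_ints_to_cudf()
{
  // Build a small host-side Arrow array (builder status checks omitted).
  arrow::Int32Builder builder;
  (void)builder.AppendValues({1, 2, 3, 4});
  std::shared_ptr<arrow::Array> array;
  (void)builder.Finish(&array);

  auto schema      = arrow::schema({arrow::field("col", arrow::int32())});
  auto arrow_table = arrow::Table::Make(schema, {array});

  // Copy the host Arrow data into a device-backed cudf::table, just as the
  // JNI fromArrow implementation does after wrapping the direct ByteBuffers.
  return cudf::from_arrow(*arrow_table);
}
```
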
b/java/src/main/native/src/ColumnViewJni.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -329,6 +330,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElement(JNIEnv CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_key) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_key, "lookup scalar is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::lists_column_view lcv(*cv); + cudf::scalar *lookup_scalar = reinterpret_cast(lookup_key); + + std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_scalar); + return reinterpret_cast(ret.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_key_cv) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_key_cv, "lookup column is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::lists_column_view lcv(*cv); + cudf::column_view *lookup_cv = reinterpret_cast(lookup_key_cv); + + std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_cv); + return reinterpret_cast(ret.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong column_view, jlong delimiter) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 32f602ffe85..20afe12baf9 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -186,18 +186,19 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; }; -template class jni_table_writer_handle final { +template class jni_table_writer_handle final { public: - explicit jni_table_writer_handle(std::shared_ptr &state) : state(state), sink() {} - jni_table_writer_handle(std::shared_ptr &state, - std::unique_ptr &sink) - : state(state), sink(std::move(sink)) {} + explicit jni_table_writer_handle(std::unique_ptr writer) + : writer(std::move(writer)), sink() {} + jni_table_writer_handle(std::unique_ptr writer, + std::unique_ptr sink) + : writer(std::move(writer)), sink(std::move(sink)) {} - std::shared_ptr state; + std::unique_ptr writer; std::unique_ptr sink; }; -typedef jni_table_writer_handle native_parquet_writer_handle; +typedef jni_table_writer_handle native_parquet_writer_handle; typedef jni_table_writer_handle native_orc_writer_handle; class native_arrow_ipc_writer_handle final { @@ -871,9 +872,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( .decimal_precision(v_precisions) .build(); - std::shared_ptr state = write_parquet_chunked_begin(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = - new cudf::jni::native_parquet_writer_handle(state, data_sink); + new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr), std::move(data_sink)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -919,9 +920,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( .decimal_precision(v_precisions) .build(); - std::shared_ptr state = write_parquet_chunked_begin(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = 
- new cudf::jni::native_parquet_writer_handle(state); + new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -944,7 +945,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, } try { cudf::jni::auto_set_device(env); - write_parquet_chunked(*tview, state->state); + state->writer->write(*tview); } CATCH_STD(env, ) } @@ -959,7 +960,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc std::unique_ptr make_sure_we_delete(state); try { cudf::jni::auto_set_device(env); - write_parquet_chunked_end(state->state); + state->writer->close(); } CATCH_STD(env, ) } @@ -1043,9 +1044,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( .compression(static_cast(j_compression)) .enable_statistics(true) .build(); - auto writer_ptr = std::make_shared(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = - new cudf::jni::native_orc_writer_handle(writer_ptr, data_sink); + new cudf::jni::native_orc_writer_handle(std::move(writer_ptr), std::move(data_sink)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1084,8 +1085,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( .compression(static_cast(j_compression)) .enable_statistics(true) .build(); - auto writer_ptr = std::make_shared(opts); - cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(writer_ptr); + auto writer_ptr = std::make_unique(opts); + cudf::jni::native_orc_writer_handle *ret = + new cudf::jni::native_orc_writer_handle(std::move(writer_ptr)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1107,7 +1109,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla } try { cudf::jni::auto_set_device(env); - state->state->write(*tview); + state->writer->write(*tview); } CATCH_STD(env, ) } @@ -1121,7 +1123,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass std::unique_ptr make_sure_we_delete(state); try { cudf::jni::auto_set_device(env); - state->state->close(); + state->writer->close(); } CATCH_STD(env, ) } @@ -1950,4 +1952,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_timeRangeRollingWindowAgg CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java new file mode 100644 index 00000000000..d8ba4548b6d --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java @@ -0,0 +1,330 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package ai.rapids.cudf;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+
+import ai.rapids.cudf.HostColumnVector.BasicType;
+import ai.rapids.cudf.HostColumnVector.ListType;
+import ai.rapids.cudf.HostColumnVector.StructType;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.DateDayVector;
+import org.apache.arrow.vector.DecimalVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.util.Text;
+
+import org.junit.jupiter.api.Test;
+
+import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class ArrowColumnVectorTest extends CudfTestBase {
+
+  @Test
+  void testArrowIntMultiBatches() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT32));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    int numVecs = 4;
+    IntVector[] vectors = new IntVector[numVecs];
+    try {
+      ArrayList<Integer> expectedArr = new ArrayList<Integer>();
+      for (int j = 0; j < numVecs; j++) {
+        int pos = 0;
+        int count = 10000;
+        IntVector vector = new IntVector("intVec", allocator);
+        int start = count * j;
+        int end = count * (j + 1);
+        for (int i = start; i < end; i++) {
+          expectedArr.add(i);
+          ((IntVector) vector).setSafe(pos, i);
+          pos++;
+        }
+        vector.setValueCount(count);
+        vectors[j] = vector;
+        ByteBuffer data = vector.getDataBuffer().nioBuffer();
+        ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+        builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      }
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      ColumnVector expected = ColumnVector.fromBoxedInts(expectedArr.toArray(new Integer[0]));
+      assertEquals(cv.getType(), DType.INT32);
+      assertColumnsAreEqual(expected, cv, "ints");
+    } finally {
+      for (int i = 0; i < numVecs; i++) {
+        vectors[i].close();
+      }
+    }
+  }
+
+  @Test
+  void testArrowLong() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (BigIntVector vector = new BigIntVector("vec", allocator)) {
+      ArrayList<Long> expectedArr = new ArrayList<Long>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Long(i));
+        ((BigIntVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.INT64);
+      ColumnVector expected = ColumnVector.fromBoxedLongs(expectedArr.toArray(new Long[0]));
+      assertColumnsAreEqual(expected, cv, "Longs");
+    }
+  }
+
+  @Test
+  void testArrowLongOnHeap() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (BigIntVector vector = new BigIntVector("vec", allocator)) {
+      ArrayList<Long> expectedArr = new ArrayList<Long>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Long(i));
+        ((BigIntVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      // test that we convert the buffers to direct byte buffers when they are on the heap
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer dataOnHeap = ByteBuffer.allocate(data.remaining());
+      dataOnHeap.put(data);
+      dataOnHeap.flip();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer validOnHeap = ByteBuffer.allocate(valid.remaining());
+      validOnHeap.put(valid);
+      validOnHeap.flip();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), dataOnHeap, validOnHeap, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.INT64);
+      ColumnVector expected = ColumnVector.fromBoxedLongs(expectedArr.toArray(new Long[0]));
+      assertColumnsAreEqual(expected, cv, "Longs");
+    }
+  }
+
+  @Test
+  void testArrowDouble() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.FLOAT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (Float8Vector vector = new Float8Vector("vec", allocator)) {
+      ArrayList<Double> expectedArr = new ArrayList<Double>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Double(i));
+        ((Float8Vector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.FLOAT64);
+      double[] array = expectedArr.stream().mapToDouble(i->i).toArray();
+      ColumnVector expected = ColumnVector.fromDoubles(array);
+      assertColumnsAreEqual(expected, cv, "doubles");
+    }
+  }
+
+  @Test
+  void testArrowFloat() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.FLOAT32));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (Float4Vector vector = new Float4Vector("vec", allocator)) {
+      ArrayList<Float> expectedArr = new ArrayList<Float>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Float(i));
+        ((Float4Vector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.FLOAT32);
+      float[] floatArray = new float[expectedArr.size()];
+      int i = 0;
+      for (Float f : expectedArr) {
+        floatArray[i++] = (f != null ? f : Float.NaN); // no nulls are expected here; NaN is a placeholder
+      }
+      ColumnVector expected = ColumnVector.fromFloats(floatArray);
+      assertColumnsAreEqual(expected, cv, "floats");
+    }
+  }
+
+  @Test
+  void testArrowString() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.STRING));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (VarCharVector vector = new VarCharVector("vec", allocator)) {
+      ArrayList<String> expectedArr = new ArrayList<String>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        String toAdd = i + "testString";
+        expectedArr.add(toAdd);
+        ((VarCharVector) vector).setSafe(i, new Text(toAdd));
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer offsets = vector.getOffsetBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, offsets);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.STRING);
+      ColumnVector expected = ColumnVector.fromStrings(expectedArr.toArray(new String[0]));
+      assertColumnsAreEqual(expected, cv, "Strings");
+    }
+  }
+
+  @Test
+  void testArrowStringOnHeap() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.STRING));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (VarCharVector vector = new VarCharVector("vec", allocator)) {
+      ArrayList<String> expectedArr = new ArrayList<String>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        String toAdd = i + "testString";
+        expectedArr.add(toAdd);
+        ((VarCharVector) vector).setSafe(i, new Text(toAdd));
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer offsets = vector.getOffsetBuffer().nioBuffer();
+      ByteBuffer dataOnHeap = ByteBuffer.allocate(data.remaining());
+      dataOnHeap.put(data);
+      dataOnHeap.flip();
+      ByteBuffer validOnHeap = ByteBuffer.allocate(valid.remaining());
+      validOnHeap.put(valid);
+      validOnHeap.flip();
+      ByteBuffer offsetsOnHeap = ByteBuffer.allocate(offsets.remaining());
+      offsetsOnHeap.put(offsets);
+      offsetsOnHeap.flip();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), dataOnHeap, validOnHeap, offsetsOnHeap);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.STRING);
+      ColumnVector expected = ColumnVector.fromStrings(expectedArr.toArray(new String[0]));
+      assertColumnsAreEqual(expected, cv, "Strings");
+    }
+  }
+
+  @Test
+  void testArrowDays() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.TIMESTAMP_DAYS));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DateDayVector vector = new DateDayVector("vec", allocator)) {
+      ArrayList<Integer> expectedArr = new ArrayList<Integer>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(i);
+        ((DateDayVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.TIMESTAMP_DAYS);
+      int[] array = expectedArr.stream().mapToInt(i->i).toArray();
+      ColumnVector expected = ColumnVector.daysFromInts(array);
+      assertColumnsAreEqual(expected, cv, "timestamp days");
+    }
+  }
+
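For reference, a minimal sketch of driving ArrowColumnBuilder end to end, distilled from the tests above; it is not part of the patch and assumes only the API those tests exercise (the typed-builder constructor, addBatch(), and buildAndPutOnDevice()); the vector/class names are illustrative:

// Build a cudf INT32 column from a host-side Arrow IntVector.
import java.nio.ByteBuffer;
import ai.rapids.cudf.ArrowColumnBuilder;
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class ArrowToCudfSketch {
  public static void main(String[] args) {
    ArrowColumnBuilder builder =
        new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT32));
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         IntVector vector = new IntVector("ints", allocator)) {
      for (int i = 0; i < 4; i++) {
        vector.setSafe(i, i * 10); // 0, 10, 20, 30
      }
      vector.setValueCount(4);
      // Hand the raw Arrow buffers to the builder; an offsets buffer is only
      // needed for variable-width types such as strings.
      ByteBuffer data = vector.getDataBuffer().nioBuffer();
      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
      try (ColumnVector cv = builder.buildAndPutOnDevice()) {
        assert cv.getType().equals(DType.INT32);
        assert cv.getRowCount() == 4;
      }
    }
  }
}

As in testArrowIntMultiBatches, addBatch() may be called repeatedly; the batches are concatenated into a single device column.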
+  @Test
+  void testArrowDecimalThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DecimalVector vector = new DecimalVector("vec", allocator, 7, 3)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.create(DType.DTypeEnum.DECIMAL32, 3)));
+      ((DecimalVector) vector).setSafe(0, -3);
+      ((DecimalVector) vector).setSafe(1, 1);
+      ((DecimalVector) vector).setSafe(2, 2);
+      ((DecimalVector) vector).setSafe(3, 3);
+      ((DecimalVector) vector).setSafe(4, 4);
+      ((DecimalVector) vector).setSafe(5, 5);
+      vector.setValueCount(6);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowDecimal64Throws() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DecimalVector vector = new DecimalVector("vec", allocator, 18, 0)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.create(DType.DTypeEnum.DECIMAL64, -11)));
+      ((DecimalVector) vector).setSafe(0, -3);
+      ((DecimalVector) vector).setSafe(1, 1);
+      ((DecimalVector) vector).setSafe(2, 2);
+      vector.setValueCount(3);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowListThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (ListVector vector = ListVector.empty("list", allocator)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new ListType(true, new HostColumnVector.BasicType(true, DType.STRING)));
+      // buffers don't matter since we expect it to throw anyway
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), null, null, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowStructThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (StructVector vector = StructVector.empty("struct", allocator)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new StructType(true, new HostColumnVector.BasicType(true, DType.STRING)));
+      // buffers don't matter since we expect it to throw anyway
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), null, null, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+}
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 88ff50959f7..582b67b8287 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -2899,6 +2899,67 @@ void testExtractListElements() {
     }
   }
 
+  @Test
+  void testListContainsString() {
+    List<String> list1 = Arrays.asList("Héllo there", "thésé");
+    List<String> list2 = Arrays.asList("", "ARé some", "test strings");
+    List<String> list3 = Arrays.asList(null, "", "ARé some", "test strings", "thésé");
+    List<String> list4 = Arrays.asList(null, "", "ARé some", "test strings");
+    List<String> list5 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null);
+         ColumnVector result = v.listContains(Scalar.fromString("thésé"))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsInt() {
+    List<Integer> list1 = Arrays.asList(1, 2, 3);
+    List<Integer> list2 = Arrays.asList(4, 5, 6);
+    List<Integer> list3 = Arrays.asList(7, 8, 9);
+    List<Integer> list4 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, null);
+         ColumnVector result = v.listContains(Scalar.fromInt(7))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsStringCol() {
+    List<String> list1 = Arrays.asList("Héllo there", "thésé");
+    List<String> list2 = Arrays.asList("", "ARé some", "test strings");
+    List<String> list3 = Arrays.asList("FOO", "", "ARé some", "test");
+    List<String> list4 = Arrays.asList(null, "FOO", "", "ARé some", "test");
+    List<String> list5 = Arrays.asList(null, "FOO", "", "ARé some", "test");
+    List<String> list6 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5, list6);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, true, true, true, null, null);
+         ColumnVector result = v.listContainsColumn(
+             ColumnVector.fromStrings("thésé", "", "test", "test", "iotA", null))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsIntCol() {
+    List<Integer> list1 = Arrays.asList(1, 2, 3);
+    List<Integer> list2 = Arrays.asList(4, 5, 6);
+    List<Integer> list3 = Arrays.asList(null, 8, 9);
+    List<Integer> list4 = Arrays.asList(null, 8, 9);
+    List<Integer> list5 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null);
+         ColumnVector result = v.listContainsColumn(ColumnVector.fromBoxedInts(3, 3, 8, 3, null))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
   @Test
   void testStringSplitRecord() {
     try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index ebd8dadc514..35be427d0c8 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -4426,4 +4426,55 @@ void testBuilderWithColumn() {
       }
     }
   }
+
+  @Test
+  void testExplode() {
+    // Child is primitive type
+    try (Table t1 = new Table.TestBuilder()
+        .column(new ListType(true, new BasicType(true, DType.INT32)),
+            Arrays.asList(1, 2, 3),
+            Arrays.asList(4, 5),
+            Arrays.asList(6),
+            null)
+        .column("s1", "s2", "s3", "s4")
+        .column(   1,    3,    5,    7)
+        .column(12.0, 14.0, 13.0, 11.0)
+        .build();
+         Table expected = new Table.TestBuilder()
+             .column(   1,    2,    3,    4,    5,    6)
+             .column("s1", "s1", "s1", "s2", "s2", "s3")
+             .column(   1,    1,    1,    3,    3,    5)
+             .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0)
+             .build()) {
+      try (Table exploded = t1.explode(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    StructType nestedType = new StructType(false,
+        new BasicType(false,
DType.STRING)); + try (Table t1 = new Table.TestBuilder() + .column(new ListType(false, nestedType), + Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), + Arrays.asList(struct(4, "k4"), struct(5, "k5")), + Arrays.asList(struct(6, "k6"))) + .column("s1", "s2", "s3") + .column( 1, 3, 5) + .column(12.0, 14.0, 13.0) + .build(); + Table expected = new Table.TestBuilder() + .column(nestedType, + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6")) + .column("s1", "s1", "s1", "s2", "s2", "s3") + .column( 1, 1, 1, 3, 3, 5) + .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0) + .build()) { + try (Table exploded = t1.explode(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } + } diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 77d69ebc150..2d9438b515f 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from cudf.utils.gpu_utils import validate_setup # isort:skip validate_setup() @@ -40,7 +40,7 @@ merge, ) from cudf.core.algorithms import factorize -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype from cudf.core.groupby import Grouper from cudf.core.ops import ( add, @@ -64,7 +64,7 @@ ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted from cudf.core.series import isclose -from cudf.core.tools.datetimes import to_datetime, DateOffset +from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( from_dlpack, diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 975cfebcd59..b816f18b5aa 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import logging @@ -96,6 +96,10 @@ def set_rand_params(self, params): params_dict[param] = list( np.unique(np.random.choice(self._df.columns, col_size)) ) + elif param in ("skiprows", "num_rows"): + params_dict[param] = np.random.choice( + [None, self._rand(len(self._df))] + ) else: params_dict[param] = np.random.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index c392cefcabf..db2bcf74112 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
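To make the new Java surface concrete, here is a small usage sketch distilled from the explode and listContains tests above. It is an editor's sketch rather than part of the patch; it assumes Table's varargs ColumnVector constructor and the listContains/explode bindings added in this change, and the comments about null-list behavior follow what testExplode's expected table implies (null lists contribute no output rows):

import java.util.Arrays;
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import ai.rapids.cudf.Scalar;
import ai.rapids.cudf.Table;

public class ListApisSketch {
  public static void main(String[] args) {
    // A lists-of-int column: [[1, 2], [3], null]
    try (ColumnVector lists = ColumnVector.fromLists(
             new HostColumnVector.ListType(true,
                 new HostColumnVector.BasicType(true, DType.INT32)),
             Arrays.asList(1, 2), Arrays.asList(3), null);
         // listContains: one BOOL8 per row; a null list row yields a null result
         Scalar needle = Scalar.fromInt(3);
         ColumnVector contains = lists.listContains(needle);
         // explode: each list element becomes its own row; any other columns
         // in the table would be repeated alongside their elements
         Table t = new Table(lists);
         Table exploded = t.explode(0)) {
      assert contains.getRowCount() == 3; // false, true, null
      assert exploded.getRowCount() == 3; // 1 and 2 from row 0, 3 from row 1; the null list adds none
    }
  }
}

listContainsColumn works the same way but takes a per-row lookup column instead of a single scalar, as exercised by testListContainsStringCol and testListContainsIntCol above.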
import sys @@ -28,18 +28,29 @@ def parquet_reader_test(parquet_buffer): params={ "columns": ALL_POSSIBLE_VALUES, "use_pandas_metadata": [True, False], + "skiprows": ALL_POSSIBLE_VALUES, + "num_rows": ALL_POSSIBLE_VALUES, }, ) -def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata): +def parquet_reader_columns( + parquet_buffer, columns, use_pandas_metadata, skiprows, num_rows +): pdf = pd.read_parquet( parquet_buffer, columns=columns, use_pandas_metadata=use_pandas_metadata, ) + + pdf = pdf.iloc[skiprows:] + if num_rows is not None: + pdf = pdf.head(num_rows) + gdf = cudf.read_parquet( parquet_buffer, columns=columns, use_pandas_metadata=use_pandas_metadata, + skiprows=skiprows, + num_rows=num_rows, ) compare_dataframe(gdf, pdf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index be2d4ef5f51..0293518a5d9 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,13 +10,16 @@ datetime, filling, gpuarrow, + groupby, hash, interop, join, + json, merge, null_mask, nvtext, orc, + parquet, partitioning, quantiles, reduce, @@ -27,6 +30,7 @@ search, sort, stream_compaction, + string_casting, strings, table, transpose, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi new file mode 100644 index 00000000000..0f8c044410d --- /dev/null +++ b/python/cudf/cudf/_lib/column.pyi @@ -0,0 +1,124 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations +from typing import Tuple, Union, TypeVar, Optional + +from cudf._typing import DtypeObj, Dtype, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase + + +T = TypeVar("T") + +class Column: + _data: Optional[Buffer] + _mask: Optional[Buffer] + _base_data: Optional[Buffer] + _base_mask: Optional[Buffer] + _dtype: DtypeObj + _offset: int + _null_count: int + _children: Tuple[ColumnBase, ...] + _base_children: Tuple[ColumnBase, ...] + + def __init__( + self, + data: Optional[Buffer], + dtype: Dtype, + size: int = None, + mask: Optional[Buffer] = None, + offset: int = None, + null_count: int = None, + children: Tuple[ColumnBase, ...] = (), + ) -> None: + ... + + @property + def base_size(self) -> int: + ... + + @property + def dtype(self) -> DtypeObj: + ... + + @property + def size(self) -> int: + ... + + @property + def base_data(self) -> Optional[Buffer]: + ... + + @property + def base_data_ptr(self) -> int: + ... + + @property + def data(self) -> Optional[Buffer]: + ... + + @property + def data_ptr(self) -> int: + ... + + def set_base_data(self, value: Buffer) -> None: + ... + + @property + def nullable(self) -> bool: + ... + + @property + def has_nulls(self) -> bool: + ... + + @property + def base_mask(self) -> Optional[Buffer]: + ... + + @property + def base_mask_ptr(self) -> int: + ... + + @property + def mask(self) -> Optional[Buffer]: + ... + + @property + def mask_ptr(self) -> int: + ... + + def set_base_mask(self, value: Optional[Buffer]) -> None: + ... + + def set_mask(self: T, value: Optional[Buffer]) -> T: + ... + + @property + def null_count(self) -> int: + ... + + @property + def offset(self) -> int: + ... + + @property + def base_children(self) -> Tuple[ColumnBase, ...]: + ... + + @property + def children(self) -> Tuple[ColumnBase, ...]: + ... + + def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: + ... + + def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]: + ... 
+ + @staticmethod + def from_scalar( + val: ScalarLike, + size: int + ) -> ColumnBase: # TODO: This should be Scalar, not ScalarLike + ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index c2f047fd0d5..28dacb5e944 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,6 +10,7 @@ import cudf from cudf.core.buffer import Buffer from cudf.utils.dtypes import ( is_categorical_dtype, + is_decimal_dtype, is_list_dtype, is_struct_dtype ) @@ -59,14 +60,14 @@ cdef class Column: The *dtype* indicates the Column's element type. """ def __init__( - self, - object data, - int size, - object dtype, - object mask=None, - int offset=0, - object null_count=None, - object children=() + self, + object data, + int size, + object dtype, + object mask=None, + int offset=0, + object null_count=None, + object children=() ): self._size = size @@ -246,10 +247,10 @@ cdef class Column: ) return cudf.core.column.build_column( - self.data, - self.dtype, - mask, - self.size, + data=self.data, + dtype=self.dtype, + mask=mask, + size=self.size, offset=0, children=self.children ) @@ -386,14 +387,19 @@ cdef class Column: tid = libcudf_types.type_id.LIST elif is_struct_dtype(self.dtype): tid = libcudf_types.type_id.STRUCT + elif is_decimal_dtype(self.dtype): + tid = libcudf_types.type_id.DECIMAL64 else: tid = ( ( np_to_cudf_types[np.dtype(data_dtype)] ) ) - - cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid) + cdef libcudf_types.data_type dtype = ( + libcudf_types.data_type(tid, -self.dtype.scale) + if tid == libcudf_types.type_id.DECIMAL64 + else libcudf_types.data_type(tid) + ) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data @@ -555,25 +561,22 @@ cdef class Column: children = tuple(children) result = cudf.core.column.build_column( - data, - dtype, - mask, - size, - offset, - null_count, - tuple(children) + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=tuple(children) ) return result - -def make_column_from_scalar(object py_val, size_type size): - - cdef DeviceScalar val = py_val.device_value - - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - - return Column.from_unique_ptr(move(c_result)) + @staticmethod + def from_scalar(py_val, size_type size): + cdef DeviceScalar val = py_val.device_value + cdef const scalar* c_val = val.get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_make_column_from_scalar(c_val[0], size)) + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 412f8c25658..f7f094834e6 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.string cimport string @@ -71,7 +71,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + const cudf_io_types.table_metadata get_metadata() except + - bool is_enabled_return_filemetadata() except + string get_column_chunks_file_path() except+ void set_metadata( @@ -83,9 +82,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + - void enable_return_filemetadata( - bool req - ) except + void set_column_chunks_file_path( string column_chunks_file_path ) except + @@ -112,9 +108,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& return_filemetadata( - bool req - ) except + parquet_writer_options_builder& column_chunks_file_path( string column_chunks_file_path ) except + @@ -168,21 +161,15 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options build() except + - cdef shared_ptr[pq_chunked_state] write_parquet_chunked_begin( - chunked_parquet_writer_options args - ) except + - - cdef void write_parquet_chunked(cudf_table_view.table_view table_, - shared_ptr[pq_chunked_state]) except + - - cdef unique_ptr[vector[uint8_t]] write_parquet_chunked_end( - shared_ptr[pq_chunked_state], - bool return_meta, - string column_chunks_file_path, - ) except + - - cdef cppclass pq_chunked_state: - pass + cdef cppclass parquet_chunked_writer: + parquet_chunked_writer() except+ + parquet_chunked_writer(chunked_parquet_writer_options args) except+ + parquet_chunked_writer& write( + cudf_table_view.table_view table_, + ) except+ + unique_ptr[vector[uint8_t]] close( + string column_chunks_file_path, + ) except+ cdef unique_ptr[vector[uint8_t]] merge_rowgroup_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index cf86076f8d6..bd1108b2cdf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -47,32 +47,34 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: UNEQUAL "cudf::null_equality::UNEQUAL" ctypedef enum type_id "cudf::type_id": - EMPTY "cudf::type_id::EMPTY" - INT8 "cudf::type_id::INT8" - INT16 "cudf::type_id::INT16" - INT32 "cudf::type_id::INT32" - INT64 "cudf::type_id::INT64" - UINT8 "cudf::type_id::UINT8" - UINT16 "cudf::type_id::UINT16" - UINT32 "cudf::type_id::UINT32" - UINT64 "cudf::type_id::UINT64" - FLOAT32 "cudf::type_id::FLOAT32" - FLOAT64 "cudf::type_id::FLOAT64" - BOOL8 "cudf::type_id::BOOL8" - TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" - TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" + EMPTY "cudf::type_id::EMPTY" + INT8 "cudf::type_id::INT8" + INT16 "cudf::type_id::INT16" + INT32 "cudf::type_id::INT32" + INT64 "cudf::type_id::INT64" + UINT8 "cudf::type_id::UINT8" + UINT16 "cudf::type_id::UINT16" + UINT32 "cudf::type_id::UINT32" + UINT64 "cudf::type_id::UINT64" + FLOAT32 "cudf::type_id::FLOAT32" + FLOAT64 "cudf::type_id::FLOAT64" + BOOL8 "cudf::type_id::BOOL8" + TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" + TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" TIMESTAMP_MILLISECONDS "cudf::type_id::TIMESTAMP_MILLISECONDS" TIMESTAMP_MICROSECONDS "cudf::type_id::TIMESTAMP_MICROSECONDS" - 
TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" - DICTIONARY32 "cudf::type_id::DICTIONARY32" - STRING "cudf::type_id::STRING" - LIST "cudf::type_id::LIST" - STRUCT "cudf::type_id::STRUCT" - NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" - DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" - DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" - DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" - DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" + TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" + DICTIONARY32 "cudf::type_id::DICTIONARY32" + STRING "cudf::type_id::STRING" + LIST "cudf::type_id::LIST" + STRUCT "cudf::type_id::STRUCT" + NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" + DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" + DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" + DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" + DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" + DECIMAL32 "cudf::type_id::DECIMAL32" + DECIMAL64 "cudf::type_id::DECIMAL64" ctypedef enum hash_id "cudf::hash_id": HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" @@ -85,7 +87,9 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: data_type() except + data_type(const data_type&) except + data_type(type_id id) except + + data_type(type_id id, int32_t scale) except + type_id id() except + + int32_t scale() except + cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ctypedef enum interpolation: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c7780d17b27..a9739a02283 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -31,7 +31,7 @@ from cudf._lib.utils import ( from libc.stdlib cimport free from libc.stdint cimport uint8_t -from libcpp.memory cimport shared_ptr, unique_ptr, make_unique +from libcpp.memory cimport unique_ptr, make_unique from libcpp.string cimport string from libcpp.map cimport map from libcpp.vector cimport vector @@ -50,13 +50,10 @@ from cudf._lib.cpp.io.parquet cimport ( parquet_reader_options, parquet_writer_options, write_parquet as parquet_writer, + parquet_chunked_writer as cpp_parquet_chunked_writer, chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - write_parquet_chunked_begin, - write_parquet_chunked, - write_parquet_chunked_end, merge_rowgroup_metadata as parquet_merge_metadata, - pq_chunked_state ) from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( @@ -323,11 +320,9 @@ cpdef write_parquet( cdef parquet_writer_options args cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef string c_column_chunks_file_path - cdef bool return_filemetadata = False cdef bool _int96_timestamps = int96_timestamps if metadata_file_path is not None: c_column_chunks_file_path = str.encode(metadata_file_path) - return_filemetadata = True # Perform write with nogil: @@ -337,7 +332,6 @@ cpdef write_parquet( .compression(comp_type) .stats_level(stat_freq) .column_chunks_file_path(c_column_chunks_file_path) - .return_filemetadata(return_filemetadata) .int96_timestamps(_int96_timestamps) .build() ) @@ -361,7 +355,8 @@ cdef class ParquetWriter: -------- cudf.io.parquet.write_parquet """ - cdef shared_ptr[pq_chunked_state] state + cdef bool initialized + cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef cudf_io_types.sink_info sink cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.statistics_freq stat_freq @@ -374,43 +369,39 @@ cdef class ParquetWriter: self.stat_freq = 
_get_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index + self.initialized = False def write_table(self, Table table): """ Writes a single table to the file """ - if not self.state: + if not self.initialized: self._initialize_chunked_state(table) - cdef table_view tv = table.data_view() - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex) \ - or table._index.name is not None: - tv = table.view() + cdef table_view tv + if self.index is not False and ( + table._index.name is not None or + isinstance(table._index, cudf.core.multiindex.MultiIndex)): + tv = table.view() + else: + tv = table.data_view() with nogil: - write_parquet_chunked(tv, self.state) + self.writer.get()[0].write(tv) def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef bool return_meta cdef string column_chunks_file_path - if not self.state: + if not self.initialized: return None # Update metadata-collection options if metadata_file_path is not None: column_chunks_file_path = str.encode(metadata_file_path) - return_meta = True - else: - return_meta = False with nogil: out_metadata_c = move( - write_parquet_chunked_end( - self.state, return_meta, column_chunks_file_path - ) + self.writer.get()[0].close(column_chunks_file_path) ) - self.state.reset() if metadata_file_path is not None: out_metadata_py = BufferArrayFromVector.from_unique_ptr( @@ -423,8 +414,8 @@ cdef class ParquetWriter: self.close() def _initialize_chunked_state(self, Table table): - """ Wraps write_parquet_chunked_begin. This is called lazily on the first - call to write, so that we can get metadata from the first table """ + """ Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer""" cdef unique_ptr[cudf_io_types.table_metadata_with_nullability] tbl_meta tbl_meta = make_unique[cudf_io_types.table_metadata_with_nullability]() @@ -434,7 +425,6 @@ cdef class ParquetWriter: tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) - # call write_parquet_chunked_begin cdef chunked_parquet_writer_options args with nogil: args = move( @@ -444,7 +434,8 @@ cdef class ParquetWriter: .stats_level(self.stat_freq) .build() ) - self.state = write_parquet_chunked_begin(args) + self.writer.reset(new cpp_parquet_chunked_writer(args)) + self.initialized = True cpdef merge_filemetadata(object filemetadata_list): diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi new file mode 100644 index 00000000000..772e940f812 --- /dev/null +++ b/python/cudf/cudf/_lib/table.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import List, Any, Optional, TYPE_CHECKING + +import cudf + +class Table(object): + _data: cudf.core.column_accessor.ColumnAccessor + _index: Optional[cudf.core.index.Index] + + def __init__(self, data: object = None, index: object = None) -> None: ... + + @property + def _num_columns(self) -> int: ... + + @property + def _num_indices(self) -> int: ... + + @property + def _num_rows(self) -> int: ... + + @property + def _column_names(self) -> List[Any]: ... + + @property + def _index_names(self) -> List[Any]: ... + + @property + def _columns(self) -> List[Any]: ... 
# TODO: actually, a list of columns diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index c6e19840d6a..9b35ca2e80c 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp cimport bool diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 5998f9ec2f9..370d083d7ac 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from enum import IntEnum @@ -14,7 +14,7 @@ from cudf._lib.types cimport ( ) from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype cimport cudf._lib.cpp.types as libcudf_types @@ -64,6 +64,8 @@ class TypeId(IntEnum): DURATION_NANOSECONDS = ( libcudf_types.type_id.DURATION_NANOSECONDS ) + DECIMAL32 = libcudf_types.type_id.DECIMAL32 + DECIMAL64 = libcudf_types.type_id.DECIMAL64 np_to_cudf_types = { @@ -188,12 +190,21 @@ cdef dtype_from_structs_column_view(column_view cv): } return StructDtype(fields) +cdef dtype_from_decimal_column_view(column_view cv): + scale = -cv.type().scale() + precision = 18 # max of 64 bit integer + return Decimal64Dtype(precision=precision, scale=scale) + cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() if tid == libcudf_types.type_id.LIST: - dtype = dtype_from_lists_column_view(cv) + return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: - dtype = dtype_from_structs_column_view(cv) + return dtype_from_structs_column_view(cv) + elif tid == libcudf_types.type_id.DECIMAL64: + return dtype_from_decimal_column_view(cv) + elif tid == libcudf_types.type_id.DECIMAL32: + raise NotImplementedError("decimal32 types are not supported yet. " + "Use decimal64 instead") else: - dtype = cudf_to_np_types[(tid)] - return dtype + return cudf_to_np_types[(tid)] diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py new file mode 100644 index 00000000000..0087daa1676 --- /dev/null +++ b/python/cudf/cudf/_typing.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import TYPE_CHECKING, Any, TypeVar, Union + +import numpy as np +from pandas import Period, Timedelta, Timestamp +from pandas.api.extensions import ExtensionDtype + +if TYPE_CHECKING: + import cudf + +# Many of these are from +# https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py + +Dtype = Union["ExtensionDtype", str, np.dtype] +DtypeObj = Union["ExtensionDtype", np.dtype] + +# scalars +DatetimeLikeScalar = TypeVar( + "DatetimeLikeScalar", Period, Timestamp, Timedelta +) +ScalarLike = Any + +# columns +ColumnLike = Any + +# binary operation +BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index d6c232373c7..91a369c31f8 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
-from cudf.core import buffer, column, common +from cudf.core import buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 02150a79d57..0550b1d4de0 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -12,9 +12,9 @@ try: import pickle5 as pickle except ImportError: - import pickle + import pickle # type: ignore else: - import pickle + import pickle # type: ignore class Serializable(abc.ABC): diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 08bc068c28c..350346a87f9 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -1,7 +1,10 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + import functools import operator import pickle +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -12,7 +15,13 @@ class Buffer(Serializable): - def __init__(self, data=None, size=None, owner=None): + ptr: int + size: int + _owner: Any + + def __init__( + self, data: Any = None, size: Optional[int] = None, owner: Any = None + ): """ A Buffer represents a device memory allocation. @@ -36,7 +45,6 @@ def __init__(self, data=None, size=None, owner=None): elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): - self._init_from_array_like(data, owner) elif isinstance(data, memoryview): self._init_from_array_like(np.asarray(data), owner) @@ -57,15 +65,15 @@ def __init__(self, data=None, size=None, owner=None): raise TypeError("data must be Buffer, array-like or integer") self._init_from_array_like(np.asarray(data), owner) - def __len__(self): + def __len__(self) -> int: return self.size @property - def nbytes(self): + def nbytes(self) -> int: return self.size @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> dict: intf = { "data": (self.ptr, False), "shape": (self.size,), @@ -102,8 +110,8 @@ def _init_from_array_like(self, data, owner): f"Cannot construct Buffer from {data.__class__.__name__}" ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} header["desc"] = self.__cuda_array_interface__.copy() @@ -112,7 +120,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> Buffer: buf = cls(frames[0], **header["constructor-kwargs"]) if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: @@ -125,7 +133,7 @@ def deserialize(cls, header, frames): return buf @classmethod - def empty(cls, size): + def empty(cls, size: int) -> Buffer: dbuf = DeviceBuffer(size=size) return Buffer(dbuf) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 7e583ea4b2b..81dab52d353 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( @@ -21,3 +21,4 @@ from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 +from cudf.core.column.decimal import DecimalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index ff514e6c6f0..498851c47ee 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,12 +1,27 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. +from __future__ import annotations + import pickle +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Mapping, + Optional, + Tuple, + Union, + cast, +) import numpy as np import pandas as pd +from numba import cuda import cudf from cudf import _lib as libcudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethodsMixin @@ -18,9 +33,23 @@ min_unsigned_type, ) +if TYPE_CHECKING: + from cudf.core.column import ( + ColumnBase, + DatetimeColumn, + NumericalColumn, + StringColumn, + TimeDeltaColumn, + ) + + +ParentType = Union["cudf.Series", "cudf.Index"] + class CategoricalAccessor(ColumnMethodsMixin): - def __init__(self, column, parent=None): + _column: CategoricalColumn + + def __init__(self, column: Any, parent: ParentType = None): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is a inplace operation, @@ -28,7 +57,8 @@ def __init__(self, column, parent=None): Parameters ---------- - data : Series or CategoricalIndex + column : Column + parent : Series or CategoricalIndex Examples -------- @@ -77,34 +107,35 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property - def categories(self): + def categories(self) -> "cudf.Index": """ The categories of this categorical. """ return cudf.core.index.as_index(self._column.categories) @property - def codes(self): + def codes(self) -> "cudf.Series": """ Return Series of codes as well as the index. """ - return cudf.Series( - self._column.codes, - index=self._parent.index if self._parent is not None else None, + index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else None ) + return cudf.Series(self._column.codes, index=index) @property - def ordered(self): + def ordered(self) -> bool: """ Whether the categories have an ordered relationship. """ return self._column.ordered - def as_ordered(self, inplace=False): + def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be ordered. @@ -165,7 +196,7 @@ def as_ordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def as_unordered(self, inplace=False): + def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be unordered. 
@@ -237,7 +268,9 @@ def as_unordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def add_categories(self, new_categories, inplace=False): + def add_categories( + self, new_categories: Any, inplace: bool = False + ) -> Optional[ParentType]: """ Add new categories. @@ -320,7 +353,9 @@ def add_categories(self, new_categories, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def remove_categories(self, removals, inplace=False): + def remove_categories( + self, removals: Any, inplace: bool = False, + ) -> Optional[ParentType]: """ Remove the specified categories. @@ -411,8 +446,12 @@ def remove_categories(self, removals, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) def set_categories( - self, new_categories, ordered=None, rename=False, inplace=False, - ): + self, + new_categories: Any, + ordered: bool = False, + rename: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Set the categories to the specified new_categories. @@ -539,7 +578,12 @@ def set_categories( ) return self._return_or_inplace(out_col, inplace=inplace) - def reorder_categories(self, new_categories, ordered=False, inplace=False): + def reorder_categories( + self, + new_categories: Any, + ordered: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Reorder categories as specified in new_categories. @@ -621,9 +665,9 @@ def reorder_categories(self, new_categories, ordered=False, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def _categories_equal(self, new_categories, ordered=None): - ordered = ordered if ordered is not None else self.ordered - + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: cur_categories = self._column.categories if len(new_categories) != len(cur_categories): return False @@ -640,8 +684,12 @@ def _categories_equal(self, new_categories, ordered=None): return cur_categories.equals(new_categories) def _set_categories( - self, current_categories, new_categories, is_unique=False, ordered=None - ): + self, + current_categories: Any, + new_categories: Any, + is_unique: bool = False, + ordered: bool = False, + ) -> CategoricalColumn: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. @@ -705,14 +753,17 @@ class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type """ + _codes: Optional[NumericalColumn] + _children: Tuple[NumericalColumn] + def __init__( self, - dtype, - mask=None, - size=None, - offset=0, - null_count=None, - children=(), + dtype: CategoricalDtype, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] 
= (), ): """ Parameters @@ -722,7 +773,7 @@ def __init__( The validity mask offset : int Data offset - children : Tuple[Column] + children : Tuple[ColumnBase] Two non-null columns containing the categories and codes respectively """ @@ -745,24 +796,23 @@ def __init__( null_count=null_count, children=children, ) - self._codes = None @property - def base_size(self): + def base_size(self) -> int: return int( (self.base_children[0].size) / self.base_children[0].dtype.itemsize ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: self._encode(item) except ValueError: return False return self._encode(item) in self.as_numerical - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"], dtype_frames = self.dtype.serialize() @@ -771,7 +821,7 @@ def serialize(self): header["data"], data_frames = self.codes.serialize() header["data_frames_count"] = len(data_frames) frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -779,7 +829,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> CategoricalColumn: n_dtype_frames = header["dtype_frames_count"] dtype = CategoricalDtype.deserialize( header["dtype"], frames[:n_dtype_frames] @@ -796,11 +846,14 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize( header["mask"], [frames[n_dtype_frames + n_data_frames]] ) - return column.build_column( - data=None, - dtype=dtype, - mask=mask, - children=(column.as_column(data.base_data, dtype=data.dtype),), + return cast( + CategoricalColumn, + column.build_column( + data=None, + dtype=dtype, + mask=mask, + children=(column.as_column(data.base_data, dtype=data.dtype),), + ), ) def set_base_data(self, value): @@ -812,16 +865,16 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None - def set_base_children(self, value): + def set_base_children(self, value: Tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self): + def children(self) -> Tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] @@ -829,20 +882,26 @@ def children(self): buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize) buf.size = self.size * codes_column.dtype.itemsize - codes_column = column.build_column( - data=buf, dtype=codes_column.dtype, size=self.size, + codes_column = cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=buf, dtype=codes_column.dtype, size=self.size, + ), ) self._children = (codes_column,) return self._children @property - def as_numerical(self): - return column.build_column( - data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + def as_numerical(self) -> NumericalColumn: + return cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + ), ) @property - def categories(self): + def categories(self) -> ColumnBase: return self.dtype.categories._values @categories.setter @@ -852,30 +911,82 @@ def categories(self, value): ) @property - def codes(self): + def codes(self) -> 
NumericalColumn: if self._codes is None: self._codes = self.children[0].set_mask(self.mask) - return self._codes + return cast(cudf.core.column.NumericalColumn, self._codes) @property - def ordered(self): + def ordered(self) -> bool: return self.dtype.ordered @ordered.setter - def ordered(self, value): + def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent=None): + def cat(self, parent: ParentType = None): return CategoricalAccessor(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " f"{unaryop}" ) - def binary_operator(self, op, rhs, reflect=False): + def __setitem__(self, key, value): + if cudf.utils.dtypes.is_scalar(value): + value = self._encode(value) if value is not None else value + else: + value = cudf.core.column.as_column(value).astype(self.dtype) + value = value.codes + codes = self.codes + codes[key] = value + out = cudf.core.column.build_categorical_column( + categories=self.categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + offset=self.offset, + ordered=self.ordered, + ) + self._mimic_inplace(out, inplace=True) + + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> "column.ColumnBase": + if end <= begin or begin >= self.size: + return self if inplace else self.copy() + + fill_code = self._encode(fill_value) + fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + + result = self if inplace else self.copy() + + libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) + return result + + def slice( + self, start: int, stop: int, stride: int = None + ) -> "column.ColumnBase": + codes = self.codes.slice(start, stop, stride) + return cudf.core.column.build_categorical_column( + categories=self.categories, + codes=cudf.core.column.as_column( + codes.base_data, dtype=codes.dtype + ), + mask=codes.base_mask, + ordered=self.ordered, + size=codes.size, + offset=codes.offset, + ) + def binary_operator( + self, op: str, rhs, reflect: bool = False + ) -> ColumnBase: if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"): if op in ("lt", "gt", "le", "ge"): raise TypeError( @@ -889,7 +1000,7 @@ def binary_operator(self, op, rhs, reflect=False): raise TypeError("Categoricals can only compare with the same type") return self.as_numerical.binary_operator(op, rhs.as_numerical) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if isinstance(other, np.ndarray) and other.ndim == 0: other = other.item() @@ -905,7 +1016,9 @@ def normalize_binop_value(self, other): ) return col - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self, ascending: bool = True, na_position="last" + ) -> Tuple[CategoricalColumn, NumericalColumn]: codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories, @@ -916,19 +1029,21 @@ def sort_by_values(self, ascending=True, na_position="last"): ) return col, inds - def element_indexing(self, index): + def element_indexing(self, index: int) -> ScalarLike: val = self.as_numerical.element_indexing(index) - return self._decode(val) if val is not None else val + return self._decode(int(val)) if val is not None else val @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[str, Any]: raise TypeError( 
"Categorical does not support `__cuda_array_interface__`." " Please consider using `.codes` or `.categories`" " if you need this functionality." ) - def to_pandas(self, index=None, nullable=False): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> pd.Series: signed_dtype = min_signed_type(len(self.categories)) codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array() categories = self.categories.to_pandas() @@ -938,7 +1053,7 @@ def to_pandas(self, index=None, nullable=False): return pd.Series(data, index=index) @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the CategoricalColumn. """ @@ -951,7 +1066,16 @@ def values(self): """ raise NotImplementedError("cudf.Categorical is not yet implemented") - def unique(self): + def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": + return ( + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + ) + + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + return self.codes.data_array_view + + def unique(self) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, @@ -962,18 +1086,23 @@ def unique(self): ordered=self.ordered, ) - def _encode(self, value): + def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) - def _decode(self, value): + def _decode(self, value: int) -> ScalarLike: if value == self.default_na_value(): return None return self.categories.element_indexing(value) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return -1 - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> CategoricalColumn: """ Return col with *to_replace* replaced with *replacement*. 
""" @@ -1038,7 +1167,9 @@ def find_and_replace(self, to_replace, replacement, all_nan): ordered=self.dtype.ordered, ) - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: Any = None, dtype: Dtype = None + ) -> CategoricalColumn: """ Fill null values with *fill_value* """ @@ -1084,20 +1215,22 @@ def fillna(self, fill_value=None, method=None): return result - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ return self.as_numerical.find_first_value(self._encode(value)) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ return self.as_numerical.find_last_value(self._encode(value)) @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): self._is_monotonic_increasing = ( self.ordered and self.as_numerical.is_monotonic_increasing @@ -1105,14 +1238,16 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): self._is_monotonic_decreasing = ( self.ordered and self.as_numerical.is_monotonic_decreasing ) return self._is_monotonic_decreasing - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column( + self, dtype: Dtype, **kwargs + ) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self if ( @@ -1129,6 +1264,9 @@ def as_categorical_column(self, dtype, **kwargs): categories=dtype.categories, ordered=dtype.ordered ) + if not isinstance(dtype, CategoricalDtype): + raise ValueError("dtype must be CategoricalDtype") + if not isinstance(self.categories, type(dtype.categories._values)): # If both categories are of different Column types, # return a column full of Nulls. 
@@ -1138,25 +1276,25 @@ def as_categorical_column(self, dtype, **kwargs): new_categories=dtype.categories, ordered=dtype.ordered ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, **kwargs): + def as_string_column(self, dtype, format=None) -> StringColumn: return self._get_decategorized_column().as_string_column( - dtype, **kwargs + dtype, format=format ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype, **kwargs) -> DatetimeColumn: return self._get_decategorized_column().as_datetime_column( dtype, **kwargs ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: return self._get_decategorized_column().as_timedelta_column( dtype, **kwargs ) - def _get_decategorized_column(self): + def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.cat().codes._column @@ -1165,7 +1303,7 @@ def _get_decategorized_column(self): out = out.set_mask(self.mask) return out - def copy(self, deep=True): + def copy(self, deep: bool = True) -> CategoricalColumn: if deep: copied_col = libcudf.copying.copy_column(self) copied_cat = libcudf.copying.copy_column(self.dtype._categories) @@ -1192,12 +1330,13 @@ def copy(self, deep=True): size=self.size, ) - def __sizeof__(self): + def __sizeof__(self) -> int: return ( self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() ) - def _memory_usage(self, deep=False): + def _memory_usage(self, **kwargs) -> int: + deep = kwargs.get("deep", False) if deep: return self.__sizeof__() else: @@ -1206,22 +1345,25 @@ def _memory_usage(self, deep=False): + self.cat().codes.memory_usage() ) - def _mimic_inplace(self, other_col, inplace=False): + def _mimic_inplace( + self, other_col: ColumnBase, inplace: bool = False + ) -> Optional[ColumnBase]: out = super()._mimic_inplace(other_col, inplace=inplace) - if inplace: + if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes return out - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) -def _create_empty_categorical_column(categorical_column, dtype): - +def _create_empty_categorical_column( + categorical_column: CategoricalColumn, dtype: "CategoricalDtype" +) -> CategoricalColumn: return column.build_categorical_column( - categories=dtype.categories, + categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), @@ -1236,7 +1378,9 @@ def _create_empty_categorical_column(categorical_column, dtype): ) -def pandas_categorical_as_column(categorical, codes=None): +def pandas_categorical_as_column( + categorical: ColumnLike, codes: ColumnLike = None +) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1a32842b027..670dd456de9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,9 +1,24 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+from __future__ import annotations +import builtins import pickle import warnings -from numbers import Number +from collections.abc import MutableSequence from types import SimpleNamespace +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) import cupy import numpy as np @@ -22,6 +37,7 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count from cudf._lib.transform import bools_to_mask +from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.core.dtypes import CategoricalDtype @@ -32,6 +48,7 @@ cudf_dtypes_to_pandas_dtypes, get_time_unit, is_categorical_dtype, + is_decimal_dtype, is_list_dtype, is_numerical_dtype, is_scalar, @@ -43,68 +60,34 @@ ) from cudf.utils.utils import mask_dtype +T = TypeVar("T", bound="ColumnBase") -class ColumnBase(Column, Serializable): - def __init__( - self, - data, - size, - dtype, - mask=None, - offset=0, - null_count=None, - children=(), - ): - """ - Parameters - ---------- - data : Buffer - dtype - The type associated with the data Buffer - mask : Buffer, optional - children : tuple, optional - """ - super().__init__( - data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - children=children, - ) - def as_frame(self): +class ColumnBase(Column, Serializable): + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ return cudf.core.frame.Frame({None: self.copy(deep=False)}) @property - def data_array_view(self): + def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - if self.dtype == "object": - raise ValueError("Cannot get an array view of a StringColumn") - - if is_categorical_dtype(self.dtype): - return self.codes.data_array_view - else: - dtype = self.dtype - result = cuda.as_cuda_array(self.data) # Workaround until `.view(...)` can change itemsize # xref: https://github.com/numba/numba/issues/4829 result = cuda.devicearray.DeviceNDArray( - shape=(result.nbytes // dtype.itemsize,), - strides=(dtype.itemsize,), - dtype=dtype, + shape=(result.nbytes // self.dtype.itemsize,), + strides=(self.dtype.itemsize,), + dtype=self.dtype, gpu_data=result.gpu_data, ) return result @property - def mask_array_view(self): + def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array """ @@ -121,10 +104,12 @@ def mask_array_view(self): ) return result - def __len__(self): + def __len__(self) -> int: return self.size - def to_pandas(self, index=None, nullable=False, **kwargs): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() @@ -143,14 +128,14 @@ def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) @property - def values_host(self): + def values_host(self) -> "np.ndarray": """ Return a numpy representation of the Column. """ return self.data_array_view.copy_to_host() @property - def values(self): + def values(self) -> "cupy.ndarray": """ Return a CuPy representation of the Column.
""" @@ -162,14 +147,18 @@ def values(self): return cupy.asarray(self.data_array_view) - def clip(self, lo, hi): - if is_categorical_dtype(self): - input_col = self.astype(self.categories.dtype) - return libcudf.replace.clip(input_col, lo, hi).astype(self.dtype) - else: - return libcudf.replace.clip(self, lo, hi) + def find_and_replace( + self: T, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> T: + raise NotImplementedError + + def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: + return libcudf.replace.clip(self, lo, hi) - def equals(self, other, check_dtypes=False): + def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if self is other: return True if other is None or len(self) != len(other): @@ -179,21 +168,32 @@ def equals(self, other, check_dtypes=False): return False return (self == other).min() - def all(self): + def all(self) -> bool: return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) - def any(self): + def any(self) -> bool: return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) - def __sizeof__(self): - n = self.data.size + def __sizeof__(self) -> int: + n = 0 + if self.data is not None: + n += self.data.size if self.nullable: n += bitmask_allocation_size_bytes(self.size) return n - @classmethod - def _concat(cls, objs, dtype=None): + def cat( + self, parent=None + ) -> "cudf.core.column.categorical.CategoricalAccessor": + raise NotImplementedError() + + def str(self, parent=None) -> "cudf.core.column.string.StringMethods": + raise NotImplementedError() + @classmethod + def _concat( + cls, objs: "MutableSequence[ColumnBase]", dtype: Dtype = None + ) -> ColumnBase: if len(objs) == 0: dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): @@ -281,7 +281,7 @@ def _concat(cls, objs, dtype=None): if is_categorical: col = build_categorical_column( - categories=cats, + categories=as_column(cats), codes=as_column(col.base_data, dtype=col.dtype), mask=col.base_mask, size=col.size, @@ -290,11 +290,17 @@ def _concat(cls, objs, dtype=None): return col - def dropna(self): - dropped_col = self.as_frame().dropna()._as_column() + def dropna(self, drop_nan: bool = False) -> ColumnBase: + if drop_nan: + col = self.nans_to_nulls() + else: + col = self + dropped_col = ( + col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() + ) return dropped_col - def to_arrow(self): + def to_arrow(self) -> pa.Array: """Convert to PyArrow Array Examples @@ -343,7 +349,7 @@ def to_arrow(self): )["None"].chunk(0) @classmethod - def from_arrow(cls, array): + def from_arrow(cls, array: pa.Array) -> ColumnBase: """ Convert PyArrow Array/ChunkedArray to column @@ -405,15 +411,18 @@ def from_arrow(cls, array): "None" ] - def _get_mask_as_column(self): + def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( self.base_mask, self.offset, self.offset + len(self) ) - def _memory_usage(self, **kwargs): + def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() - def to_gpu_array(self, fillna=None): + def default_na_value(self) -> Any: + raise NotImplementedError() + + def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. 
Parameters @@ -430,9 +439,9 @@ if fillna: return self.fillna(self.default_na_value()).data_array_view else: - return self.dropna().data_array_view + return self.dropna(drop_nan=False).data_array_view - def to_array(self, fillna=None): + def to_array(self, fillna=None) -> "np.ndarray": """Get a dense numpy array for the data. Parameters @@ -451,13 +460,16 @@ return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill(self, fill_value, begin=0, end=-1, inplace=False): + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> Optional[ColumnBase]: if end <= begin or begin >= self.size: return self if inplace else self.copy() - if is_categorical_dtype(self.dtype): - return self._fill_categorical(fill_value, begin, end, inplace) - fill_scalar = as_device_scalar(fill_value, self.dtype) if not inplace: @@ -477,7 +489,6 @@ return self - def _fill_categorical(self, fill_value, begin, end, inplace): fill_code = self._encode(fill_value) fill_scalar = as_device_scalar(fill_code, self.codes.dtype) @@ -486,16 +497,16 @@ libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def shift(self, offset, fill_value): + def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) @property - def valid_count(self): + def valid_count(self) -> int: """Number of non-null values""" return len(self) - self.null_count @property - def nullmask(self): + def nullmask(self) -> Buffer: """The gpu buffer for the null-mask """ if self.nullable: @@ -503,7 +514,7 @@ else: raise ValueError("Column has no null mask") - def copy(self, deep=True): + def copy(self, deep: bool = True) -> ColumnBase: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. @@ -520,7 +531,7 @@ children=self.base_children, ) - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: """ View the data underlying a column as different dtype.
The source column must divide evenly into the size of @@ -562,6 +573,7 @@ def view(self, dtype): + f" total bytes into {dtype} with size {dtype.itemsize}" ) + assert self.base_data is not None new_buf_ptr = ( self.base_data.ptr + self.offset * self.dtype.itemsize ) @@ -573,7 +585,7 @@ def view(self, dtype): ) return build_column(view_buf, dtype=dtype) - def element_indexing(self, index): + def element_indexing(self, index: int): """Default implementation for indexing to an element Raises @@ -588,46 +600,29 @@ def element_indexing(self, index): return libcudf.copying.get_element(self, index).value - def __getitem__(self, arg): + def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: + if start < 0: + start = start + len(self) + if stop < 0: + stop = stop + len(self) + if start >= stop: + return column_empty(0, self.dtype, masked=True) + # compute mask slice + if stride == 1 or stride is None: + return libcudf.copying.column_slice(self, [start, stop])[0] + else: + # Need to create a gather map for given slice with stride + gather_map = arange( + start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + ) + return self.take(gather_map) - if isinstance(arg, Number): - arg = int(arg) - return self.element_indexing(arg) + def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]: + if is_scalar(arg): + return self.element_indexing(int(arg)) elif isinstance(arg, slice): - - if is_categorical_dtype(self): - codes = self.codes[arg] - return build_categorical_column( - categories=self.categories, - codes=as_column(codes.base_data, dtype=codes.dtype), - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, - ) - start, stop, stride = arg.indices(len(self)) - - if start < 0: - start = start + len(self) - if stop < 0: - stop = stop + len(self) - - if start >= stop: - return column_empty(0, self.dtype, masked=True) - # compute mask slice - if stride == 1 or stride is None: - - return libcudf.copying.column_slice(self, [start, stop])[0] - else: - # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, - dtype=np.dtype(np.int32), - ) - return self.take(gather_map) + return self.slice(start, stop, stride) else: arg = as_column(arg) if len(arg) == 0: @@ -638,7 +633,7 @@ def __getitem__(self, arg): return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): """ Set the value of self[key] to value. 
@@ -679,10 +674,7 @@ def __setitem__(self, key, value): nelem = len(key) if is_scalar(value): - if is_categorical_dtype(self.dtype): - value = self._encode(value) - else: - value = self.dtype.type(value) if value is not None else value + value = self.dtype.type(value) if value is not None else value else: if len(value) != nelem: msg = ( @@ -692,9 +684,6 @@ def __setitem__(self, key, value): ) raise ValueError(msg) value = as_column(value).astype(self.dtype) - if is_categorical_dtype(value.dtype): - value = value.cat().set_categories(self.categories) - assert self.dtype == value.dtype if ( isinstance(key, slice) @@ -705,34 +694,11 @@ def __setitem__(self, key, value): out = libcudf.copying.copy_range( value, self, 0, nelem, key_start, key_stop, False ) - if is_categorical_dtype(value.dtype): - out = build_categorical_column( - categories=value.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=value.ordered, - ) else: try: if is_scalar(value): input = self - if is_categorical_dtype(self.dtype): - input = self.codes - out = input.as_frame()._scatter(key, [value])._as_column() - - if is_categorical_dtype(self.dtype): - out = build_categorical_column( - categories=self.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=self.ordered, - ) - else: if not isinstance(value, Column): value = as_column(value) @@ -750,7 +716,12 @@ def __setitem__(self, key, value): self._mimic_inplace(out, inplace=True) - def fillna(self, value=None, method=None, dtype=None): + def fillna( + self: T, + value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> T: """Fill null values with ``value``. Returns a copy with null filled. @@ -759,7 +730,7 @@ def fillna(self, value=None, method=None, dtype=None): input_col=self, replacement=value, method=method, dtype=dtype ) - def isnull(self): + def isnull(self) -> ColumnBase: """Identify missing values in a Column. """ result = libcudf.unary.is_null(self) @@ -771,12 +742,12 @@ def isnull(self): return result - def isna(self): + def isna(self) -> ColumnBase: """Identify missing values in a Column. Alias for isnull. """ return self.isnull() - def notnull(self): + def notnull(self) -> ColumnBase: """Identify non-missing values in a Column. """ result = libcudf.unary.is_valid(self) @@ -788,12 +759,14 @@ def notnull(self): return result - def notna(self): + def notna(self) -> ColumnBase: """Identify non-missing values in a Column. Alias for notnull. 
""" return self.notnull() - def find_first_value(self, value): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -804,7 +777,7 @@ def find_first_value(self, value): raise ValueError("value not found") return indices[0] - def find_last_value(self, value): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -815,21 +788,26 @@ def find_last_value(self, value): raise ValueError("value not found") return indices[-1] - def append(self, other): + def append(self, other: ColumnBase) -> ColumnBase: return ColumnBase._concat([self, as_column(other)]) - def quantile(self, q, interpolation, exact): + def quantile( + self, + q: Union[float, Sequence[float]], + interpolation: builtins.str, + exact: bool, + ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna=None): + def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") - def take(self, indices, keep_index=True): + def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: """Return Column by taking values from the corresponding *indices*. """ # Handle zero size if indices.size == 0: - return column_empty_like(self, newsize=0) + return cast(T, column_empty_like(self, newsize=0)) try: return ( self.as_frame() @@ -843,7 +821,7 @@ def take(self, indices, keep_index=True): ) from e raise - def isin(self, values): + def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. Parameters @@ -898,17 +876,17 @@ def isin(self, values): rhs = as_column(pd.Categorical.from_codes([-1], categories=[])) rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype) - lhs = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) - rhs = cudf.DataFrame( + ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) + rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} ) - res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") + res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) res = res._data["bool"].fillna(False) return res - def as_mask(self): + def as_mask(self) -> Buffer: """Convert booleans to bitmask Returns @@ -928,15 +906,15 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) @property - def is_unique(self): + def is_unique(self) -> bool: return self.distinct_count() == len(self) @property - def is_monotonic(self): + def is_monotonic(self) -> bool: return self.is_monotonic_increasing @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): if self.has_nulls: self._is_monotonic_increasing = False @@ -947,7 +925,7 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): if self.has_nulls: self._is_monotonic_decreasing = False @@ -957,14 +935,16 @@ def is_monotonic_decreasing(self): ) return self._is_monotonic_decreasing - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: ScalarLike, side: builtins.str, kind: builtins.str + ) -> int: """ Calculate slice bound that corresponds to given label. 
Returns leftmost (one-past-the-rightmost if ``side=='right'``) position of given label. Parameters ---------- - label : object + label : Scalar side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} """ @@ -979,21 +959,29 @@ def get_slice_bound(self, label, side, kind): # Not currently using `kind` argument. if side == "left": return self.find_first_value(label, closest=True) - if side == "right": + elif side == "right": return self.find_last_value(label, closest=True) + 1 + else: + raise ValueError(f"Invalid value for side: {side}") - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self: ColumnBase, + ascending: bool = True, + na_position: builtins.str = "last", + ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: col_inds = self.as_frame()._get_sorted_inds(ascending, na_position) - col_keys = self[col_inds] + col_keys = self.take(col_inds) return col_keys, col_inds - def distinct_count(self, method="sort", dropna=True): + def distinct_count( + self, method: builtins.str = "sort", dropna: bool = True + ) -> int: if method != "sort": msg = "non sort based distinct_count() not implemented yet" raise NotImplementedError(msg) return cpp_distinct_count(self, ignore_nulls=dropna) - def astype(self, dtype, **kwargs): + def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { @@ -1015,7 +1003,7 @@ def astype(self, dtype, **kwargs): else: return self.as_numerical_column(dtype) - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: if "ordered" in kwargs: ordered = kwargs["ordered"] else: @@ -1058,26 +1046,36 @@ def as_categorical_column(self, dtype, **kwargs): ordered=ordered, ) - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise NotImplementedError - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": raise NotImplementedError - def apply_boolean_mask(self, mask): + def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") result = ( self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() ) return result - def argsort(self, ascending=True, na_position="last"): + def argsort( + self, ascending: bool = True, na_position: builtins.str = "last" + ) -> ColumnBase: sorted_indices = self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position @@ -1085,7 +1083,7 @@ def argsort(self, ascending=True, na_position="last"): return sorted_indices @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), @@ -1157,14 +1155,18 @@ def __ge__(self, other): return self.binary_operator("ge", other) def searchsorted( - self, value, side="left", ascending=True, na_position="last" + self, + value, + side: builtins.str = "left", + ascending: bool = True, + na_position: builtins.str = 
"last", ): values = as_column(value).as_frame() return self.as_frame().searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self): + def unique(self) -> ColumnBase: """ Get unique values in the data """ @@ -1174,17 +1176,18 @@ def unique(self): ._as_column() ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"] = self.dtype.str - data_header, data_frames = self.data.serialize() - header["data"] = data_header - frames.extend(data_frames) + if self.data is not None: + data_header, data_frames = self.data.serialize() + header["data"] = data_header + frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -1193,7 +1196,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> ColumnBase: dtype = header["dtype"] data = Buffer.deserialize(header["data"], [frames[0]]) mask = None @@ -1201,61 +1204,71 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize(header["mask"], [frames[1]]) return build_column(data=data, dtype=dtype, mask=mask) - def min(self, skipna=None, dtype=None): + def binary_operator( + self, op: builtins.str, other: BinaryOperand, reflect: bool = False + ) -> ColumnBase: + raise NotImplementedError + + def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("min", result_col, dtype=dtype) else: return result_col - def max(self, skipna=None, dtype=None): + def max(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("max", result_col, dtype=dtype) else: return result_col - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform sum with type {self.dtype}") - def product(self, skipna=None, dtype=None, min_count=0): + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform prod with type {self.dtype}") - def mean(self, skipna=None, dtype=None): + def mean(self, skipna: bool = None, dtype: Dtype = None): raise TypeError(f"cannot perform mean with type {self.dtype}") - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform std with type {self.dtype}") - def var(self, skipna=None, ddof=1, dtype=np.float64): + def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform var with type {self.dtype}") - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None): raise TypeError(f"cannot perform kurt with type {self.dtype}") - def skew(self, skipna=None): + def skew(self, skipna: bool = None): raise TypeError(f"cannot perform skew with type {self.dtype}") - def cov(self, other): + def cov(self, other: ColumnBase): raise TypeError( f"cannot perform covarience with types {self.dtype}, " f"{other.dtype}" ) - def corr(self, other): + def corr(self, other: ColumnBase): raise TypeError( f"cannot perform corr 
with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self): + def nans_to_nulls(self: T) -> T: if self.dtype.kind == "f": - col = self.fillna(np.nan) - newmask = libcudf.transform.nans_to_nulls(col) + newmask = libcudf.transform.nans_to_nulls(self) return self.set_mask(newmask) else: return self - def _process_for_reduction(self, skipna=None, min_count=0): + def _process_for_reduction( + self, skipna: bool = None, min_count: int = 0 + ) -> Union[ColumnBase, ScalarLike]: skipna = True if skipna is None else skipna if skipna: @@ -1280,8 +1293,13 @@ def _process_for_reduction(self, skipna=None, min_count=0): return result_col def scatter_to_table( - self, row_indices, column_indices, names, nrows=None, ncols=None - ): + self, + row_indices: ColumnBase, + column_indices: ColumnBase, + names: List[Any], + nrows: int = None, + ncols: int = None, + ) -> "cudf.core.frame.Frame": """ Scatters values from the column into a table. @@ -1326,7 +1344,12 @@ def scatter_to_table( ) -def column_empty_like(column, dtype=None, masked=False, newsize=None): +def column_empty_like( + column: ColumnBase, + dtype: Dtype = None, + masked: bool = False, + newsize: int = None, +) -> ColumnBase: """Allocate a new column like the given *column* """ if dtype is None: @@ -1338,6 +1361,7 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): and is_categorical_dtype(column.dtype) and dtype == column.dtype ): + column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) return build_column( data=None, @@ -1350,7 +1374,9 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): return column_empty(row_count, dtype, masked) -def column_empty_like_same_mask(column, dtype): +def column_empty_like_same_mask( + column: ColumnBase, dtype: Dtype +) -> ColumnBase: """Create a new empty Column with the same length and the same mask. Parameters @@ -1364,11 +1390,13 @@ def column_empty_like_same_mask(column, dtype): return result -def column_empty(row_count, dtype="object", masked=False): +def column_empty( + row_count: int, dtype: Dtype = "object", masked: bool = False +) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ dtype = pd.api.types.pandas_dtype(dtype) - children = () + children = () # type: Tuple[ColumnBase, ...] if is_categorical_dtype(dtype): data = None @@ -1401,8 +1429,15 @@ def column_empty(row_count, dtype="object", masked=False): def build_column( - data, dtype, mask=None, size=None, offset=0, null_count=None, children=() -): + data: Union[Buffer, None], + dtype: Dtype, + *, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, + children: Tuple[ColumnBase, ...] 
= (), +) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1437,6 +1472,7 @@ def build_column( children=children, ) elif dtype.type is np.datetime64: + assert data is not None return cudf.core.column.DatetimeColumn( data=data, dtype=dtype, @@ -1446,6 +1482,7 @@ def build_column( null_count=null_count, ) elif dtype.type is np.timedelta64: + assert data is not None return cudf.core.column.TimeDeltaColumn( data=data, dtype=dtype, @@ -1473,6 +1510,15 @@ def build_column( ) elif is_struct_dtype(dtype): return cudf.core.column.StructColumn( + data=data, + dtype=dtype, + size=size, + mask=mask, + null_count=null_count, + children=children, + ) + elif is_decimal_dtype(dtype): + return cudf.core.column.DecimalColumn( data=data, size=size, dtype=dtype, @@ -1481,6 +1527,7 @@ def build_column( children=children, ) else: + assert data is not None return cudf.core.column.NumericalColumn( data=data, dtype=dtype, @@ -1492,14 +1539,14 @@ def build_column( def build_categorical_column( - categories, - codes, - mask=None, - size=None, - offset=0, - null_count=None, - ordered=None, -): + categories: ColumnBase, + codes: ColumnBase, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + ordered: bool = None, +) -> "cudf.core.column.CategoricalColumn": """ Build a CategoricalColumn @@ -1523,9 +1570,9 @@ def build_categorical_column( if codes.dtype != codes_dtype: codes = codes.astype(codes_dtype) - dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered) + dtype = CategoricalDtype(categories=categories, ordered=ordered) - return build_column( + result = build_column( data=None, dtype=dtype, mask=mask, @@ -1534,9 +1581,15 @@ def build_categorical_column( null_count=null_count, children=(codes,), ) + return cast("cudf.core.column.CategoricalColumn", result) -def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): +def as_column( + arbitrary: Any, + nan_as_null: bool = None, + dtype: Dtype = None, + length: int = None, +): """Create a Column from an arbitrary object Parameters @@ -1773,7 +1826,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = data.mask data = cudf.core.column.timedelta.TimeDeltaColumn( - data=buffer, mask=mask, dtype=arbitrary.dtype + data=buffer, + size=len(arbitrary), + mask=mask, + dtype=arbitrary.dtype, ) elif arb_dtype.kind in ("O", "U"): data = as_column( @@ -1822,9 +1878,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) elif isinstance(arbitrary, cudf.Scalar): - data = libcudf.column.make_column_from_scalar( - arbitrary, length if length else 1 - ) + data = ColumnBase.from_scalar(arbitrary, length if length else 1) elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): cudf_dtype = arbitrary._data.dtype @@ -1853,6 +1907,14 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): "Cannot create list column from given data" ) return as_column(data, nan_as_null=nan_as_null) + if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.DecimalColumn.from_arrow(data) dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): raise TypeError @@ -1898,7 +1960,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): return data -def column_applymap(udf, column, out_dtype): +def 
column_applymap( + udf: Callable[[ScalarLike], ScalarLike], + column: ColumnBase, + out_dtype: Dtype, +) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. Parameters @@ -1946,7 +2012,7 @@ return as_column(results) -def _data_from_cuda_array_interface_desc(obj): +def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 @@ -1956,7 +2022,7 @@ return data -def _mask_from_cuda_array_interface_desc(obj): +def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: desc = obj.__cuda_array_interface__ mask = desc.get("mask", None) @@ -1979,7 +2045,7 @@ return mask -def serialize_columns(columns): +def serialize_columns(columns) -> Tuple[List[dict], List]: """ Return the headers and frames resulting from serializing a list of Column @@ -1994,7 +2060,7 @@ frames : list list of frames """ - headers = [] + headers = [] # type: List[Dict[Any, Any]] frames = [] if len(columns) > 0: @@ -2006,7 +2072,7 @@ return headers, frames -def deserialize_columns(headers, frames): +def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. @@ -2024,7 +2090,12 @@ return columns -def arange(start, stop=None, step=1, dtype=None): +def arange( + start: Union[int, float], + stop: Union[int, float] = None, + step: Union[int, float] = 1, + dtype=None, +) -> ColumnBase: """ Returns a column with evenly spaced values within a given interval. @@ -2077,7 +2148,7 @@ ) -def full(size, fill_value, dtype=None): +def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: """ Returns a column of given size and dtype, filled with a given value. @@ -2108,7 +2179,4 @@ 4 7 dtype: int8 """ - - return libcudf.column.make_column_from_scalar( - cudf.Scalar(fill_value, dtype), size - ) + return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 4561b1f68f2..8ae16288050 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,7 +1,10 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION.
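serialize_columns/deserialize_columns above split a list of columns into picklable headers plus raw frames, which is how Column objects travel between workers under dask and distributed. A hedged round-trip sketch (internal module-level API as shown in the hunks; assumes a CUDA-capable runtime):

    from cudf.core.column import column

    cols = [column.arange(0, 4), column.full(4, 7, dtype="int8")]
    headers, frames = column.serialize_columns(cols)
    restored = column.deserialize_columns(headers, frames)
    assert [c.dtype for c in restored] == [c.dtype for c in cols]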
+from __future__ import annotations + import datetime as dt import re from numbers import Number +from typing import Any, Sequence, Union, cast import numpy as np import pandas as pd @@ -9,7 +12,9 @@ import cudf from cudf import _lib as libcudf -from cudf.core.column import column, string +from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise @@ -34,7 +39,13 @@ class DatetimeColumn(column.ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -66,49 +77,51 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: - item = np.datetime64(item, self._time_unit) + item_as_dt64 = np.datetime64(item, self._time_unit) except ValueError: # If item cannot be converted to datetime type # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.astype("int64") in self.as_numerical + return item_as_dt64.astype("int64") in self.as_numerical @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit @property - def year(self): + def year(self) -> ColumnBase: return self.get_dt_field("year") @property - def month(self): + def month(self) -> ColumnBase: return self.get_dt_field("month") @property - def day(self): + def day(self) -> ColumnBase: return self.get_dt_field("day") @property - def hour(self): + def hour(self) -> ColumnBase: return self.get_dt_field("hour") @property - def minute(self): + def minute(self) -> ColumnBase: return self.get_dt_field("minute") @property - def second(self): + def second(self) -> ColumnBase: return self.get_dt_field("second") @property - def weekday(self): + def weekday(self) -> ColumnBase: return self.get_dt_field("weekday") - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index: "cudf.Index" = None, nullable: bool = False, **kwargs + ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -122,10 +135,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def get_dt_field(self, field): + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): return other @@ -162,30 +175,41 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def 
as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise TypeError( f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]" ) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_string_column(self, dtype, format=None): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( self.dtype.name, "%Y-%m-%d %H:%M:%S" @@ -195,20 +219,25 @@ def as_string_column(self, dtype, format=None): np.dtype(self.dtype) ](self, format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def default_na_value(self): + def default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column """ return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -216,18 +245,23 @@ def quantile(self, q, interpolation, exact): return pd.Timestamp(result, unit=self.time_unit) return result.astype(self.dtype) - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, + op: str, + rhs: Union[ColumnBase, "cudf.Scalar"], + reflect: bool = False, + ) -> ColumnBase: if isinstance(rhs, cudf.DateOffset): return binop_offset(self, rhs, op) lhs, rhs = self, rhs if op in ("eq", "ne", "lt", "gt", "le", "ge"): out_dtype = np.bool elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add( + out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs ) elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub( + out_dtype = cudf.core.column.timedelta._timedelta_sub_result_dtype( rhs if reflect else lhs, lhs if reflect else rhs ) elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -244,13 +278,11 @@ def binary_operator(self, op, rhs, reflect=False): f"Series of dtype {self.dtype} cannot perform " f" the operation {op}" ) + return binop(lhs, rhs, op=op, out_dtype=out_dtype, reflect=reflect) - if reflect: - lhs, rhs = rhs, lhs - - return binop(lhs, rhs, op=op, out_dtype=out_dtype) - - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> DatetimeColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) @@ -262,7 +294,9 @@ def fillna(self, fill_value=None, method=None): return super().fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -270,7 +304,7 @@ 
def find_first_value(self, value, closest=False): value = column.as_column(value, dtype=self.dtype).as_numerical[0] return self.as_numerical.find_first_value(value, closest=closest) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -279,10 +313,10 @@ def find_last_value(self, value, closest=False): return self.as_numerical.find_last_value(value, closest=closest) @property - def is_unique(self): + def is_unique(self) -> bool: return self.as_numerical.is_unique - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): to_res, _ = np.datetime_data(to_dtype) @@ -315,7 +349,15 @@ @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool, +) -> ColumnBase: + if reflect: + lhs, rhs = rhs, lhs out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out @@ -329,11 +371,10 @@ return out -def infer_format(element, **kwargs): +def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes care of `ms` and `ns` """ - fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) if fmt is not None: @@ -345,8 +386,8 @@ # There is a possibility that the element is of the following format # '00:00:03.333333 2016-01-01' - second_part = re.split(r"(\D+)", element_parts[1], maxsplit=1) - subsecond_fmt = ".%" + str(len(second_part[0])) + "f" + second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) + subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" first_part = pd.core.tools.datetimes._guess_datetime_format( element_parts[0], **kwargs @@ -360,16 +401,16 @@ if first_part is None: raise ValueError("Unable to infer the timestamp format from the data") - if len(second_part) > 1: + if len(second_parts) > 1: # "Z" indicates Zulu time (widely used in aviation), which is # the UTC timezone, the only timezone cudf currently supports. # Passing any other, unsupported timezone will make the code fail below # with a ValueError. - second_part.remove("Z") - second_part = "".join(second_part[1:]) + second_parts.remove("Z") + second_part = "".join(second_parts[1:]) if len(second_part) > 1: # Only infer if second_part is not an empty string. second_part = pd.core.tools.datetimes._guess_datetime_format( second_part, **kwargs ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py new file mode 100644 index 00000000000..58156c3826c --- /dev/null +++ b/python/cudf/cudf/core/column/decimal.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021, NVIDIA CORPORATION.
+ +import cupy as cp +import numpy as np +import pyarrow as pa + +from cudf import _lib as libcudf +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase +from cudf.core.dtypes import Decimal64Dtype +from cudf.utils.utils import pa_mask_buffer_to_mask + + +class DecimalColumn(ColumnBase): + @classmethod + def from_arrow(cls, data: pa.Array): + dtype = Decimal64Dtype.from_arrow(data.type) + mask_buf = data.buffers()[0] + mask = ( + mask_buf + if mask_buf is None + else pa_mask_buffer_to_mask(mask_buf, len(data)) + ) + data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) + data_64 = data_128[::2].copy() + return cls( + data=Buffer(data_64.view("uint8")), + size=len(data), + dtype=dtype, + mask=mask, + ) + + def to_arrow(self): + data_buf_64 = self.base_data.to_host_array().view("int64") + data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") + # use striding to set the first 64 bits of each 128-bit chunk: + data_buf_128[::2] = data_buf_64 + # use striding again to set the remaining bits of each 128-bit chunk: + # 0 for non-negative values, -1 for negative values: + data_buf_128[1::2] = np.piecewise( + data_buf_64, [data_buf_64 < 0], [-1, 0] + ) + data_buf = pa.py_buffer(data_buf_128) + mask_buf = ( + self.base_mask + if self.base_mask is None + else pa.py_buffer(self.base_mask.to_host_array()) + ) + return pa.Array.from_buffers( + type=self.dtype.to_arrow(), + length=self.size, + buffers=[mask_buf, data_buf], + ) + + def binary_operator(self, op, other, reflect=False): + if reflect: + self, other = other, self + result = libcudf.binaryop.binaryop(self, other, op, "int32") + result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) + return result + + +def _binop_precision(l_dtype, r_dtype, op): + """ + Returns the result precision when performing the + binary operation `op` for the given dtypes. + + See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql + """ # noqa: E501 + p1, p2 = l_dtype.precision, r_dtype.precision + s1, s2 = l_dtype.scale, r_dtype.scale + if op in ("add", "sub"): + return max(s1, s2) + max(p1 - s1, p2 - s2) + 1 + elif op == "mul": + return p1 + p2 + 1 + else: + raise NotImplementedError() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c2aa41a5de1..8641bc88806 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -173,8 +173,7 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property def leaves(self): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 8395c9c3da6..eec9c2a7860 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,9 +1,57 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
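The @overload stubs added below pair typing_extensions.Literal with the runtime inplace flag so that mypy knows _return_or_inplace returns None when inplace=True and a Series/Index otherwise. A minimal sketch of the same pattern with a hypothetical function:

from typing import Optional, overload
from typing_extensions import Literal

@overload
def double(x: int, inplace: Literal[True]) -> None: ...
@overload
def double(x: int, inplace: Literal[False] = ...) -> int: ...
def double(x: int, inplace: bool = False) -> Optional[int]:
    # only the stubs above are visible to mypy; this is the runtime body
    return None if inplace else 2 * x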
+from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, overload + +from typing_extensions import Literal + import cudf +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnMethodsMixin: + _column: ColumnBase + _parent: Optional[Union["cudf.Series", "cudf.Index"]] + + def __init__( + self, + column: ColumnBase, + parent: Union["cudf.Series", "cudf.Index"] = None, + ): + self._column = column + self._parent = parent + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[False], expand=False, retain_index=True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, expand: bool = False, retain_index: bool = True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[True], expand=False, retain_index=True + ) -> None: + ... + + @overload + def _return_or_inplace( + self, + new_col, + inplace: bool = False, + expand: bool = False, + retain_index: bool = True, + ) -> Optional[Union["cudf.Series", "cudf.Index"]]: + ... + def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True ): @@ -19,31 +67,29 @@ def _return_or_inplace( ), inplace=True, ) + return None else: self._column._mimic_inplace(new_col, inplace=True) + return None else: + if self._parent is None: + return new_col if expand or isinstance( self._parent, (cudf.DataFrame, cudf.MultiIndex) ): # This branch indicates that the value passed as new_col - # is actually a table-like data + # is a Table table = new_col - if isinstance(table, cudf._lib.table.Table): - if isinstance(self._parent, cudf.Index): - idx = self._parent._constructor_expanddim._from_table( - table=table - ) - idx.names = None - return idx - else: - return self._parent._constructor_expanddim( - data=table._data, index=self._parent.index - ) + if isinstance(self._parent, cudf.Index): + idx = self._parent._constructor_expanddim._from_table( + table=table + ) + idx.names = None + return idx else: return self._parent._constructor_expanddim( - {index: value for index, value in enumerate(table)}, - index=self._parent.index, + data=table._data, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): if retain_index: @@ -59,7 +105,4 @@ def _return_or_inplace( new_col, name=self._parent.name ) else: - if self._parent is None: - return new_col - else: - return self._parent._mimic_inplace(new_col, inplace=False) + return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 54a6d274843..f77c408f205 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION.
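For context on the binary_operator hunk below: the output dtype comes from np.result_type, with integer mod/floordiv promoted to float64 when the divisor can be zero, since that result is only representable as a float. Plain NumPy, for illustration:

import numpy as np

print(np.result_type(np.dtype("int32"), np.dtype("int64")))    # int64
print(np.result_type(np.dtype("int32"), np.dtype("float32")))  # float64
# integer division by zero has no integer representation,
# hence the float64 fallback in the code below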
+from __future__ import annotations from numbers import Number +from typing import Any, Callable, Sequence, Union, cast import numpy as np import pandas as pd @@ -10,8 +12,15 @@ import cudf from cudf import _lib as libcudf from cudf._lib.quantiles import quantile as cpp_quantile +from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer -from cudf.core.column import as_column, build_column, column, string +from cudf.core.column import ( + ColumnBase, + as_column, + build_column, + column, + string, +) from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, @@ -21,9 +30,15 @@ ) -class NumericalColumn(column.ColumnBase): +class NumericalColumn(ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -39,6 +54,7 @@ def __init__( if size is None: size = data.size // dtype.itemsize size = size - offset + super().__init__( data, size=size, @@ -48,7 +64,7 @@ def __init__( null_count=null_count, ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: """ Returns True if column contains item, else False. """ @@ -66,10 +82,12 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str) -> ColumnBase: return _numeric_column_unaryop(self, op=unaryop) - def binary_operator(self, binop, rhs, reflect=False): + def binary_operator( + self, binop: str, rhs: BinaryOperand, reflect: bool = False, + ) -> ColumnBase: int_dtypes = [ np.dtype("int8"), np.dtype("int16"), @@ -80,32 +98,33 @@ def binary_operator(self, binop, rhs, reflect=False): np.dtype("uint32"), np.dtype("uint64"), ] - tmp = rhs - if reflect: - tmp = self - if isinstance(rhs, (NumericalColumn, cudf.Scalar)) or np.isscalar(rhs): + if rhs is None: + out_dtype = self.dtype + else: + if not ( + isinstance(rhs, (NumericalColumn, cudf.Scalar,),) + or np.isscalar(rhs) + ): + msg = "{!r} operator not supported between {} and {}" + raise TypeError(msg.format(binop, type(self), type(rhs))) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: + tmp = self if reflect else rhs if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): out_dtype = np.dtype("float64") - elif rhs is None: - out_dtype = self.dtype - else: - raise TypeError( - f"'{binop}' operator not supported between " - f"{type(self).__name__} and {type(rhs).__name__}" - ) return _numeric_column_binop( lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect ) - def _apply_scan_op(self, op): + def _apply_scan_op(self, op: str) -> ColumnBase: return libcudf.reduce.scan(op, self, True) - def normalize_binop_value(self, other): + def normalize_binop_value( + self, other: ScalarLike + ) -> Union[ColumnBase, ScalarLike]: if other is None: return other if isinstance(other, cudf.Scalar): @@ -122,8 +141,8 @@ def normalize_binop_value(self, other): return other other_dtype = np.promote_types(self.dtype, other_dtype) if other_dtype == np.dtype("float16"): - other = np.dtype("float32").type(other) - other_dtype = other.dtype + other_dtype = np.dtype("float32") + other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): @@ 
-134,104 +153,110 @@ def normalize_binop_value(self, other): other, size=len(self), dtype=other_dtype ) return column.build_column( - data=Buffer.from_array_like(ary), - dtype=ary.dtype, - mask=self.mask, + data=Buffer(ary), dtype=ary.dtype, mask=self.mask, ) else: raise TypeError(f"cannot broadcast {type(other)}") - def int2ip(self): + def int2ip(self) -> "cudf.core.column.StringColumn": if self.dtype != np.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ np.dtype(self.dtype) ](self) else: - return as_column([], dtype="object") - - def as_datetime_column(self, dtype, **kwargs): + return cast( + "cudf.core.column.StringColumn", as_column([], dtype="object") + ) - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": + return cast( + "cudf.core.column.DatetimeColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_timedelta_column(self, dtype, **kwargs): - - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": + return cast( + "cudf.core.column.TimeDeltaColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) - def sum(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( + def reduce(self, op: str, skipna: bool = None, **kwargs) -> float: + min_count = kwargs.pop("min_count", 0) + preprocessed = self._process_for_reduction( skipna=skipna, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("sum", result_col, dtype=dtype) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) else: - return result_col + return cast(float, preprocessed) - def product(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( - skipna=skipna, min_count=min_count + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "sum", skipna=skipna, dtype=dtype, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("product", result_col, dtype=dtype) - else: - return result_col - def mean(self, skipna=None, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("mean", result_col, dtype=dtype) - else: - return result_col + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "product", skipna=skipna, dtype=dtype, min_count=min_count 
+ ) - def var(self, skipna=None, ddof=1, dtype=np.float64): - result = self._process_for_reduction(skipna=skipna) - if isinstance(result, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("var", result, dtype=dtype, ddof=ddof) - else: - return result + def mean(self, skipna: bool = None, dtype: Dtype = np.float64) -> float: + return self.reduce("mean", skipna=skipna, dtype=dtype) - def std(self, skipna=None, ddof=1, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce( - "std", result_col, dtype=dtype, ddof=ddof - ) - else: - return result_col + def var( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - def sum_of_squares(self, dtype=None): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) + + def sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -250,13 +275,13 @@ def kurtosis(self, skipna=None): kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna=None): + def skew(self, skipna: bool = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -273,7 +298,9 @@ def skew(self, skipna=None): skew = unbiased_coef * m3 / (m2 ** (3 / 2)) return skew - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: if isinstance(q, Number) or cudf.utils.dtypes.is_list_like(q): np_array_q = np.asarray(q) if np.logical_or(np_array_q < 0, np_array_q > 1).any(): @@ -284,15 +311,14 @@ def quantile(self, q, interpolation, exact): # will only have values in range [0, 1] result = self._numeric_quantile(q, interpolation, exact) if isinstance(q, Number): - result = result[0] return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if result is cudf.NA - else result + if result[0] is cudf.NA + else result[0] ) return result - def median(self, skipna=None): + def median(self, skipna: bool = None) -> NumericalColumn: skipna = True if skipna is None else skipna if not skipna and self.has_nulls: @@ -301,24 +327,17 @@ def median(self, skipna=None): # enforce linear in case the default ever changes return self.quantile(0.5, interpolation="linear", exact=True) - def _numeric_quantile(self, q, interpolation, exact): - is_number = isinstance(q, Number) - - if is_number: - quant = [float(q)] - elif isinstance(q, list) or isinstance(q, np.ndarray): - quant = q - else: - msg = "`q` must be either a single element, list or numpy array" - raise TypeError(msg) - + def _numeric_quantile( + 
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: + quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls sorted_indices = self.as_frame()._get_sorted_inds(True, "first") sorted_indices = sorted_indices[self.null_count :] return cpp_quantile(self, quant, interpolation, sorted_indices, exact) - def cov(self, other): + def cov(self, other: ColumnBase) -> float: if ( len(self) == 0 or len(other) == 0 @@ -330,7 +349,7 @@ def cov(self, other): cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other): + def corr(self, other: ColumnBase) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -341,12 +360,14 @@ def corr(self, other): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) return cov / lhs_std / rhs_std - def round(self, decimals=0): + def round(self, decimals: int = 0) -> NumericalColumn: """Round the values in the Column to the given number of decimals. """ return libcudf.round.round(self, decimal_places=decimals) - def applymap(self, udf, out_dtype=None): + def applymap( + self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None + ) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. Parameters @@ -367,7 +388,7 @@ def applymap(self, udf, out_dtype=None): out = column.column_applymap(udf=udf, column=self, out_dtype=out_dtype) return out - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ dkind = self.dtype.kind @@ -382,7 +403,12 @@ def default_na_value(self): else: raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> NumericalColumn: """ Return col with *to_replace* replaced with *replacement*.
""" @@ -409,38 +435,52 @@ def find_and_replace(self, to_replace, replacement, all_nan): replaced, to_replace_col, replacement_col ) - def fillna(self, fill_value=None, method=None): + def fillna( + self, + fill_value: Any = None, + method: str = None, + dtype: Dtype = None, + fill_nan: bool = True, + ) -> NumericalColumn: """ Fill null values with *fill_value* """ + if fill_nan: + col = self.nans_to_nulls() + else: + col = self + if method is not None: - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) if ( isinstance(fill_value, cudf.Scalar) - and fill_value.dtype == self.dtype + and fill_value.dtype == col.dtype ): - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) + if np.isscalar(fill_value): # castsafely to the same dtype as self - fill_value_casted = self.dtype.type(fill_value) + fill_value_casted = col.dtype.type(fill_value) if not np.isnan(fill_value) and (fill_value_casted != fill_value): raise TypeError( f"Cannot safely cast non-equivalent " - f"{type(fill_value).__name__} to {self.dtype.name}" + f"{type(fill_value).__name__} to {col.dtype.name}" ) fill_value = cudf.Scalar(fill_value_casted) else: fill_value = column.as_column(fill_value, nan_as_null=False) # cast safely to the same dtype as self - if is_integer_dtype(self.dtype): - fill_value = _safe_cast_to_int(fill_value, self.dtype) + if is_integer_dtype(col.dtype): + fill_value = _safe_cast_to_int(fill_value, col.dtype) else: - fill_value = fill_value.astype(self.dtype) + fill_value = fill_value.astype(col.dtype) - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches. For monotonic columns, returns the offset of the first larger value @@ -469,7 +509,7 @@ def find_first_value(self, value, closest=False): raise ValueError("value not found") return found - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches. 
For monotonic columns, returns the offset of the last smaller value @@ -498,7 +538,7 @@ def find_last_value(self, value, closest=False): raise ValueError("value not found") return found - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ Returns true if all the values in self can be safely cast to dtype """ @@ -585,10 +625,10 @@ def can_cast_safely(self, to_dtype): elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: info = np.iinfo(to_dtype) min_, max_ = info.min, info.max + # the cast is only safe if the values are within the target + # integer range and are all whole numbers; check that here if (self.min() >= min_) and (self.max() <= max_): - - filled = self.fillna(0) + filled = self.fillna(0, fill_nan=False) if (cudf.Series(filled) % 1 == 0).all(): return True else: @@ -596,9 +636,17 @@ def can_cast_safely(self, to_dtype): else: return False + return False + @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): +def _numeric_column_binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool = False, +) -> ColumnBase: if reflect: lhs, rhs = rhs, lhs @@ -615,7 +663,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): return out -def _numeric_column_unaryop(operand, op): +def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase: if callable(op): return libcudf.transform.transform(operand, op) @@ -623,7 +671,7 @@ def _numeric_column_unaryop(operand, op): return libcudf.unary.unary_operation(operand, op) -def _safe_cast_to_int(col, dtype): +def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: """ Cast given NumericalColumn to given integer dtype safely. """ @@ -642,7 +690,9 @@ def _safe_cast_to_int(col, dtype): ) -def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): +def _normalize_find_and_replace_input( + input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] +) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, dtype=input_column_dtype if len(col_to_normalize) <= 0 else None, @@ -684,7 +734,9 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): return normalized_column.astype(input_column_dtype) -def digitize(column, bins, right=False): +def digitize( + column: ColumnBase, bins: np.ndarray, right: bool = False +) -> ColumnBase: """Return the indices of the bins to which each value in column belongs. Parameters @@ -699,7 +751,7 @@ def digitize(column, bins, right=False): Returns ------- - A device array containing the indices + A column containing the indices """ if not column.dtype == bins.dtype: raise ValueError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f5df440b865..0124b421266 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,10 +1,15 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION.
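The float-to-integer branch of NumericalColumn.can_cast_safely above reduces to a range check plus a wholeness check; roughly, as a plain NumPy sketch:

import numpy as np

vals = np.array([1.0, 2.0, 3.5])
info = np.iinfo(np.int32)
in_range = vals.min() >= info.min and vals.max() <= info.max
whole = bool((vals % 1 == 0).all())
print(in_range and whole)  # False: 3.5 is not a whole number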
+from __future__ import annotations + +import builtins import pickle import warnings +from typing import Any, Dict, Optional, Sequence, Tuple, Union, cast, overload import cupy import numpy as np import pandas as pd +from numba import cuda from nvtx import annotate import cudf @@ -140,6 +145,7 @@ translate as cpp_translate, ) from cudf._lib.strings.wrap import wrap as cpp_wrap +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.methods import ColumnMethodsMixin @@ -197,6 +203,9 @@ } +ParentType = Union["cudf.Series", "cudf.Index"] + + class StringMethods(ColumnMethodsMixin): def __init__(self, column, parent=None): """ @@ -214,10 +223,9 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .str accessor with string values" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) - def htoi(self): + def htoi(self) -> ParentType: """ Returns integer value represented by each hex string. String is interpreted to have hex (base-16) characters. @@ -242,7 +250,7 @@ def htoi(self): return self._return_or_inplace(out, inplace=False) - def ip2int(self): + def ip2int(self) -> ParentType: """ This converts IP strings to integers @@ -279,7 +287,7 @@ def __getitem__(self, key): else: return self.get(key) - def len(self): + def len(self) -> ParentType: """ Computes the length of each element in the Series/Index. @@ -301,7 +309,7 @@ def len(self): return self._return_or_inplace(cpp_count_characters(self._column)) - def byte_count(self): + def byte_count(self) -> ParentType: """ Computes the number of bytes of each string in the Series/Index. @@ -328,6 +336,16 @@ def byte_count(self): """ return self._return_or_inplace(cpp_count_bytes(self._column),) + @overload + def cat(self, sep: str = None, na_rep: str = None) -> str: + ... + + @overload + def cat( + self, others, sep: str = None, na_rep: str = None + ) -> Union[ParentType, "cudf.core.column.StringColumn"]: + ... + def cat(self, others=None, sep=None, na_rep=None): """ Concatenate strings in the Series/Index with given separator. @@ -339,28 +357,28 @@ def cat(self, others=None, sep=None, na_rep=None): Parameters ---------- - others : Series or List of str - Strings to be appended. - The number of strings must match ``size()`` of this instance. - This must be either a Series of string dtype or a Python - list of strings. + others : Series or List of str + Strings to be appended. + The number of strings must match ``size()`` of this instance. + This must be either a Series of string dtype or a Python + list of strings. - sep : str - If specified, this separator will be appended to each string - before appending the others. + sep : str + If specified, this separator will be appended to each string + before appending the others. - na_rep : str - This character will take the place of any null strings - (not empty strings) in either list. + na_rep : str + This character will take the place of any null strings + (not empty strings) in either list. - - If ``na_rep`` is ``None``, and ``others`` is ``None``, - missing values in the Series/Index are - omitted from the result. + - If ``na_rep`` is ``None``, and ``others`` is ``None``, + missing values in the Series/Index are + omitted from the result. - - If ``na_rep`` is ``None``, and ``others`` is - not ``None``, a row containing a missing value - in any of the columns (before concatenation) - will have a missing value in the result.
+ - If ``na_rep`` is ``None``, and ``others`` is + not ``None``, a row containing a missing value + in any of the columns (before concatenation) + will have a missing value in the result. Returns ------- @@ -441,7 +459,7 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join(self, sep): + def join(self, sep) -> ParentType: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -453,7 +471,9 @@ def join(self, sep): "Columns of arrays / lists are not yet " "supported" ) - def extract(self, pat, flags=0, expand=True): + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -517,7 +537,14 @@ def extract(self, pat, flags=0, expand=True): else: return self._return_or_inplace(out, expand=expand) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def contains( + self, + pat: Union[str, Sequence], + case: bool = True, + flags: int = 0, + na=np.nan, + regex: bool = True, + ) -> ParentType: """ Test if pattern or regex is contained within a string of a Series or Index. @@ -646,7 +673,15 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): ) return self._return_or_inplace(result_col) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def replace( + self, + pat: Union[str, Sequence], + repl: Union[str, Sequence], + n: int = -1, + case=None, + flags: int = 0, + regex: bool = True, + ) -> ParentType: """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to `str.replace() @@ -748,7 +783,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): ), ) - def replace_with_backrefs(self, pat, repl): + def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: """ Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. @@ -778,7 +813,9 @@ def replace_with_backrefs(self, pat, repl): cpp_replace_with_backrefs(self._column, pat, repl) ) - def slice(self, start=None, stop=None, step=None): + def slice( + self, start: int = None, stop: int = None, step: int = None + ) -> ParentType: """ Slice substrings from each element in the Series or Index. @@ -847,7 +884,7 @@ def slice(self, start=None, stop=None, step=None): cpp_slice_strings(self._column, start, stop, step), ) - def isinteger(self): + def isinteger(self) -> ParentType: """ Check whether all characters in each string form an integer. @@ -907,7 +944,7 @@ def isinteger(self): """ return self._return_or_inplace(cpp_is_integer(self._column)) - def ishex(self): + def ishex(self) -> ParentType: """ Check whether all characters in each string form a hex integer. @@ -946,7 +983,7 @@ def ishex(self): """ return self._return_or_inplace(str_cast.is_hex(self._column)) - def istimestamp(self, format): + def istimestamp(self, format: str) -> ParentType: """ Check whether all characters in each string can be converted to a timestamp using the given format. @@ -970,7 +1007,7 @@ def istimestamp(self, format): str_cast.istimestamp(self._column, format) ) - def isfloat(self): + def isfloat(self) -> ParentType: """ Check whether all characters in each string form a floating value. @@ -1033,7 +1070,7 @@ def isfloat(self): """ return self._return_or_inplace(cpp_is_float(self._column)) - def isdecimal(self): + def isdecimal(self) -> ParentType: """ Check whether all characters in each string are decimal.
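As a usage illustration of the contains signature typed above (expected outputs shown as comments; nulls propagate):

import cudf

s = cudf.Series(["Mouse", "dog", "house and parrot", "23", None])
print(s.str.contains("house|dog", regex=True))
# False, True, True, False, <NA>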
@@ -1094,7 +1131,7 @@ def isdecimal(self): """ return self._return_or_inplace(cpp_is_decimal(self._column)) - def isalnum(self): + def isalnum(self) -> ParentType: """ Check whether all characters in each string are alphanumeric. @@ -1163,7 +1200,7 @@ def isalnum(self): """ return self._return_or_inplace(cpp_is_alnum(self._column)) - def isalpha(self): + def isalpha(self) -> ParentType: """ Check whether all characters in each string are alphabetic. @@ -1219,7 +1256,7 @@ def isalpha(self): """ return self._return_or_inplace(cpp_is_alpha(self._column)) - def isdigit(self): + def isdigit(self) -> ParentType: """ Check whether all characters in each string are digits. @@ -1281,7 +1318,7 @@ def isdigit(self): """ return self._return_or_inplace(cpp_is_digit(self._column)) - def isnumeric(self): + def isnumeric(self) -> ParentType: """ Check whether all characters in each string are numeric. @@ -1349,7 +1386,7 @@ def isnumeric(self): """ return self._return_or_inplace(cpp_is_numeric(self._column)) - def isupper(self): + def isupper(self) -> ParentType: """ Check whether all characters in each string are uppercase. @@ -1406,7 +1443,7 @@ def isupper(self): """ return self._return_or_inplace(cpp_is_upper(self._column)) - def islower(self): + def islower(self) -> ParentType: """ Check whether all characters in each string are lowercase. @@ -1463,7 +1500,7 @@ def islower(self): """ return self._return_or_inplace(cpp_is_lower(self._column)) - def isipv4(self): + def isipv4(self) -> ParentType: """ Check whether all characters in each string form an IPv4 address. @@ -1487,7 +1524,7 @@ def isipv4(self): """ return self._return_or_inplace(str_cast.is_ipv4(self._column)) - def lower(self): + def lower(self) -> ParentType: """ Converts all characters to lowercase. @@ -1526,7 +1563,7 @@ def lower(self): """ return self._return_or_inplace(cpp_to_lower(self._column)) - def upper(self): + def upper(self) -> ParentType: """ Convert each string to uppercase. This only applies to ASCII characters at this time. @@ -1575,7 +1612,7 @@ def upper(self): """ return self._return_or_inplace(cpp_to_upper(self._column)) - def capitalize(self): + def capitalize(self) -> ParentType: """ Convert strings in the Series/Index to be capitalized. This only applies to ASCII characters at this time. @@ -1603,7 +1640,7 @@ def capitalize(self): """ return self._return_or_inplace(cpp_capitalize(self._column)) - def swapcase(self): + def swapcase(self) -> ParentType: """ Change each lowercase character to uppercase and vice versa. This only applies to ASCII characters at this time. @@ -1648,7 +1685,7 @@ def swapcase(self): """ return self._return_or_inplace(cpp_swapcase(self._column)) - def title(self): + def title(self) -> ParentType: """ Uppercase the first letter of each word (each letter that follows a space) and lowercase the rest. @@ -1693,7 +1730,7 @@ def title(self): """ return self._return_or_inplace(cpp_title(self._column)) - def filter_alphanum(self, repl=None, keep=True): + def filter_alphanum( + self, repl: str = None, keep: bool = True + ) -> ParentType: """ Remove non-alphanumeric characters from strings in this column. @@ -1728,7 +1767,9 @@ def filter_alphanum(self, repl=None, keep=True): cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), ) - def slice_from(self, starts, stops): + def slice_from( + self, starts: "cudf.Series", stops: "cudf.Series" + ) -> ParentType: """ Return substring of each string using positions for each string.
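A quick illustration of the title/capitalize distinction documented above (hypothetical values, expected outputs as comments):

import cudf

s = cudf.Series(["hello world", "gpu STRINGS"])
print(s.str.title())       # 'Hello World', 'Gpu Strings'
print(s.str.capitalize())  # 'Hello world', 'Gpu strings'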
@@ -1771,7 +1812,9 @@ def slice_from(self, starts, stops): ), ) - def slice_replace(self, start=None, stop=None, repl=None): + def slice_replace( + self, start: int = None, stop: int = None, repl: str = None + ) -> ParentType: """ Replace the specified section of each string with a new string. @@ -1856,7 +1899,7 @@ def slice_replace(self, start=None, stop=None, repl=None): cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), ) - def insert(self, start=0, repl=None): + def insert(self, start: int = 0, repl: str = None) -> ParentType: """ Insert the specified string into each string in the specified position. @@ -1906,7 +1949,7 @@ def insert(self, start=0, repl=None): cpp_string_insert(self._column, start, cudf.Scalar(repl)), ) - def get(self, i=0): + def get(self, i: int = 0) -> ParentType: """ Extract element from each component at specified position. @@ -1950,7 +1993,9 @@ def get(self, i=0): return self._return_or_inplace(cpp_string_get(self._column, i)) - def split(self, pat=None, n=-1, expand=None): + def split( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2079,14 +2124,14 @@ def split(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_split( self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2094,7 +2139,9 @@ def split(self, pat=None, n=-1, expand=None): return self._return_or_inplace(result_table, expand=expand) - def rsplit(self, pat=None, n=-1, expand=None): + def rsplit( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2232,18 +2279,18 @@ def rsplit(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) return self._return_or_inplace(result_table, expand=expand) - def partition(self, sep=" ", expand=True): + def partition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the first occurrence of sep. @@ -2323,7 +2370,7 @@ def partition(self, sep=" ", expand=True): cpp_partition(self._column, cudf.Scalar(sep)), expand=expand ) - def rpartition(self, sep=" ", expand=True): + def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the last occurrence of sep. @@ -2387,7 +2434,9 @@ def rpartition(self, sep=" ", expand=True): cpp_rpartition(self._column, cudf.Scalar(sep)), expand=expand ) - def pad(self, width, side="left", fillchar=" "): + def pad( + self, width: int, side: str = "left", fillchar: str = " " + ) -> ParentType: """ Pad strings in the Series/Index up to width. 
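The split/rsplit hunks above now wrap expanded results in a cudf Frame; at the API level, expand=True still yields a DataFrame while the default returns one list per row. A sketch, with expected outputs as comments:

import cudf

s = cudf.Series(["a b", "c d e"])
print(s.str.split(" ", expand=True))  # DataFrame with columns 0, 1, 2; short rows null-padded
print(s.str.split(" "))               # [['a', 'b'], ['c', 'd', 'e']]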
@@ -2472,7 +2521,7 @@ def pad(self, width, side="left", fillchar=" "): cpp_pad(self._column, width, fillchar, side) ) - def zfill(self, width): + def zfill(self, width: int) -> ParentType: """ Pad strings in the Series/Index by prepending ‘0’ characters. @@ -2545,7 +2594,7 @@ def zfill(self, width): return self._return_or_inplace(cpp_zfill(self._column, width)) - def center(self, width, fillchar=" "): + def center(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left and right side of strings in the Series/Index with an additional character. @@ -2617,7 +2666,7 @@ def center(self, width, fillchar=" "): cpp_center(self._column, width, fillchar) ) - def ljust(self, width, fillchar=" "): + def ljust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling right side of strings in the Series/Index with an additional character. Equivalent to `str.ljust() @@ -2671,7 +2720,7 @@ def ljust(self, width, fillchar=" "): cpp_ljust(self._column, width, fillchar) ) - def rjust(self, width, fillchar=" "): + def rjust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left side of strings in the Series/Index with an additional character. Equivalent to `str.rjust() @@ -2725,7 +2774,7 @@ def rjust(self, width, fillchar=" "): cpp_rjust(self._column, width, fillchar) ) - def strip(self, to_strip=None): + def strip(self, to_strip: str = None) -> ParentType: """ Remove leading and trailing characters. @@ -2784,7 +2833,7 @@ def strip(self, to_strip=None): cpp_strip(self._column, cudf.Scalar(to_strip)) ) - def lstrip(self, to_strip=None): + def lstrip(self, to_strip: str = None) -> ParentType: """ Remove leading characters. @@ -2831,7 +2880,7 @@ def lstrip(self, to_strip=None): cpp_lstrip(self._column, cudf.Scalar(to_strip)) ) - def rstrip(self, to_strip=None): + def rstrip(self, to_strip: str = None) -> ParentType: """ Remove trailing characters. @@ -2886,7 +2935,7 @@ def rstrip(self, to_strip=None): cpp_rstrip(self._column, cudf.Scalar(to_strip)) ) - def wrap(self, width, **kwargs): + def wrap(self, width: int, **kwargs) -> ParentType: """ Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. @@ -2980,7 +3029,7 @@ def wrap(self, width, **kwargs): return self._return_or_inplace(cpp_wrap(self._column, width)) - def count(self, pat, flags=0): + def count(self, pat: str, flags: int = 0) -> ParentType: """ Count occurrences of pattern in each string of the Series/Index. @@ -3040,7 +3089,9 @@ def count(self, pat, flags=0): return self._return_or_inplace(cpp_count_re(self._column, pat)) - def findall(self, pat, flags=0, expand=True): + def findall( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Find all occurrences of pattern or regular expression in the Series/Index. @@ -3108,7 +3159,7 @@ def findall(self, pat, flags=0, expand=True): cpp_findall(self._column, pat), expand=expand ) - def isempty(self): + def isempty(self) -> ParentType: """ Check whether each string is an empty string. @@ -3128,9 +3179,9 @@ def isempty(self): 4 False dtype: bool """ - return self._return_or_inplace((self._parent == "").fillna(False)) + return self._return_or_inplace((self._column == "").fillna(False)) - def isspace(self): + def isspace(self) -> ParentType: """ Check whether all characters in each string are whitespace.
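The padding helpers typed above are related: pad(side=...) generalizes ljust/rjust/center. For example (expected outputs as comments):

import cudf

s = cudf.Series(["dog", "bird"])
print(s.str.ljust(6, fillchar="."))             # 'dog...', 'bird..'
print(s.str.rjust(6, fillchar="."))             # '...dog', '..bird'
print(s.str.pad(6, side="left", fillchar="."))  # same result as rjust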
@@ -3186,7 +3237,7 @@ """ return self._return_or_inplace(cpp_isspace(self._column)) - def endswith(self, pat): + def endswith(self, pat: str) -> ParentType: """ Test if the end of each string element matches a pattern. @@ -3240,7 +3291,7 @@ def endswith(self, pat): return self._return_or_inplace(result_col) - def startswith(self, pat): + def startswith(self, pat: Union[str, Sequence]) -> ParentType: """ Test if the start of each string element matches a pattern. @@ -3300,7 +3351,7 @@ def startswith(self, pat): return self._return_or_inplace(result_col) - def find(self, sub, start=0, end=None): + def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each string in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3355,7 +3406,7 @@ def find(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def rfind(self, sub, start=0, end=None): + def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each string in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3414,7 +3465,7 @@ def rfind(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def index(self, sub, start=0, end=None): + def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each string where the substring is fully contained between ``[start:end]``. This is the same @@ -3474,7 +3525,7 @@ def index(self, sub, start=0, end=None): else: return result - def rindex(self, sub, start=0, end=None): + def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each string where the substring is fully contained between ``[start:end]``. This is the same @@ -3534,7 +3585,7 @@ def rindex(self, sub, start=0, end=None): else: return result - def match(self, pat, case=True, flags=0): + def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: """ Determine if each string matches a regular expression. @@ -3579,7 +3630,7 @@ def match(self, pat, case=True, flags=0): return self._return_or_inplace(cpp_match_re(self._column, pat)) - def url_decode(self): + def url_decode(self) -> ParentType: """ Returns a URL-decoded format of each string. No format checking is performed. All characters @@ -3609,7 +3660,7 @@ def url_decode(self): return self._return_or_inplace(cpp_url_decode(self._column)) - def url_encode(self): + def url_encode(self) -> ParentType: """ Returns a URL-encoded format of each string. No format checking is performed. @@ -3640,7 +3691,7 @@ def url_encode(self): """ return self._return_or_inplace(cpp_url_encode(self._column)) - def code_points(self): + def code_points(self) -> ParentType: """ Returns an array by filling it with the UTF-8 code point values for each character of each string. @@ -3673,14 +3724,14 @@ def code_points(self): """ new_col = cpp_code_points(self._column) - if self._parent is None: - return new_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(new_col, name=self._parent.name) + else: + return new_col - def translate(self, table): + def translate(self, table: dict) -> ParentType: """ Map all characters in the string through the given mapping table.
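find/rfind/index/rindex above differ mainly in how a miss is reported: find returns -1 while index raises. A short sketch, expected outputs as comments:

import cudf

s = cudf.Series(["hello", "world"])
print(s.str.find("o"))  # 4, 1
print(s.str.find("z"))  # -1, -1
# s.str.index("z") raises ValueError instead of returning -1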
@@ -3723,7 +3774,9 @@ def translate(self, table): table = str.maketrans(table) return self._return_or_inplace(cpp_translate(self._column, table)) - def filter_characters(self, table, keep=True, repl=None): + def filter_characters( + self, table: dict, keep: bool = True, repl: str = None + ) -> ParentType: """ Remove characters from each string using the character ranges in the given mapping table. @@ -3774,7 +3827,7 @@ def filter_characters(self, table, keep=True, repl=None): ), ) - def normalize_spaces(self): + def normalize_spaces(self) -> ParentType: """ Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -3794,7 +3847,7 @@ def normalize_spaces(self): """ return self._return_or_inplace(cpp_normalize_spaces(self._column)) - def normalize_characters(self, do_lower=True): + def normalize_characters(self, do_lower: bool = True) -> ParentType: """ Normalizes string characters for tokenizing. @@ -3843,7 +3896,7 @@ def normalize_characters(self, do_lower=True): cpp_normalize_characters(self._column, do_lower) ) - def tokenize(self, delimiter=" "): + def tokenize(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter(s). The sequence returned contains the tokens in the order @@ -3890,7 +3943,9 @@ def tokenize(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def detokenize(self, indices, separator=" "): + def detokenize( + self, indices: "cudf.Series", separator: str = " " + ) -> ParentType: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -3898,7 +3953,7 @@ def detokenize(self, indices, separator=" "): Parameters ---------- - indices : list of ints + indices : Series Each value identifies the output row for the corresponding token. separator : str The string concatenated between each token in an output row. @@ -3925,7 +3980,7 @@ def detokenize(self, indices, separator=" "): retain_index=False, ) - def character_tokenize(self): + def character_tokenize(self) -> ParentType: """ Each string is split into individual characters. The sequence returned contains each character as an individual string. @@ -3973,14 +4028,14 @@ def character_tokenize(self): dtype: object """ result_col = cpp_character_tokenize(self._column) - if self._parent is None: - return result_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(result_col, name=self._parent.name) + else: + return result_col - def token_count(self, delimiter=" "): + def token_count(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter. The returned integer sequence is the number of tokens in each string. @@ -4022,7 +4077,7 @@ def token_count(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def ngrams(self, n=2, separator="_"): + def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ Generate the n-grams from a set of tokens; each record in the series is treated as a token. @@ -4059,7 +4114,7 @@ def ngrams(self, n=2, separator="_"): cpp_generate_ngrams(self._column, n, separator), retain_index=False ) - def character_ngrams(self, n=2): + def character_ngrams(self, n: int = 2) -> ParentType: """ Generate the n-grams from characters in a column of strings.
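Unlike split, the tokenizer helpers above flatten their results: tokenize returns every token in a single Series while token_count stays row-aligned. A sketch, expected outputs as comments:

import cudf

s = cudf.Series(["the quick fox", "jumped"])
print(s.str.token_count())  # 3, 1
print(s.str.tokenize())     # 'the', 'quick', 'fox', 'jumped'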
@@ -4095,7 +4150,9 @@ def character_ngrams(self, n=2): cpp_generate_character_ngrams(self._column, n), retain_index=False ) - def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): + def ngrams_tokenize( + self, n: int = 2, delimiter: str = " ", separator: str = "_" + ) -> ParentType: """ Generate the n-grams using tokens from each string. This will tokenize each string and then generate ngrams for each @@ -4131,7 +4188,9 @@ def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): retain_index=False, ) - def replace_tokens(self, targets, replacements, delimiter=None): + def replace_tokens( + self, targets, replacements, delimiter: str = None + ) -> ParentType: """ The target tokens are searched for within each string in the series and replaced with the corresponding replacements if found. @@ -4213,8 +4272,11 @@ def replace_tokens(self, targets, replacements, delimiter=None): ) def filter_tokens( - self, min_token_length, replacement=None, delimiter=None - ): + self, + min_token_length: int, + replacement: str = None, + delimiter: str = None, + ) -> ParentType: """ Remove tokens from within each string in the series that are smaller than min_token_length and optionally replace them @@ -4282,13 +4344,13 @@ def filter_tokens( def subword_tokenize( self, - hash_file, - max_length=64, - stride=48, - do_lower=True, - do_truncate=False, - max_rows_tensor=500, - ): + hash_file: str, + max_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + max_rows_tensor: int = 500, + ) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]: """ Run CUDA BERT subword tokenizer on cuDF strings column. Encodes words to token ids using vocabulary from a pretrained @@ -4337,12 +4399,12 @@ def subword_tokenize( Returns ------- - token-ids : Column + token-ids : cupy.ndarray The token-ids for each string padded with 0s to max_length. - attention-mask : Column + attention-mask : cupy.ndarray The mask for token-ids result where corresponding positions identify valid token-id values. - metadata : Column + metadata : cupy.ndarray Each row contains the index id of the original string and the first and last index of the token-ids that are non-padded and non-overlapping. @@ -4383,7 +4445,7 @@ def subword_tokenize( cupy.asarray(metadata), ) - def porter_stemmer_measure(self): + def porter_stemmer_measure(self) -> ParentType: """ Compute the Porter Stemmer measure for each string. The Porter Stemmer algorithm is described `here @@ -4406,7 +4468,7 @@ def porter_stemmer_measure(self): cpp_porter_stemmer_measure(self._column) ) - def is_consonant(self, position): + def is_consonant(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a consonant. The ``position`` parameter may also be a list of integers @@ -4450,7 +4512,7 @@ def is_consonant(self, position): cpp_is_letter(self._column, ltype, position) ) - def is_vowel(self, position): + def is_vowel(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a vowel -- not a consonant. The ``position`` parameter may also be @@ -4494,7 +4556,7 @@ def is_vowel(self, position): cpp_is_letter(self._column, ltype, position) ) - def edit_distance(self, targets): + def edit_distance(self, targets) -> ParentType: """ The ``targets`` strings are measured against the strings in this instance using the Levenshtein edit distance algorithm.
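subword_tokenize above is now annotated to return three cupy.ndarrays; a typical caller reshapes the flat buffers before feeding a BERT-style model. A sketch only: 'vocab_hash.txt' is a hypothetical pretrained vocabulary hash file.

import cudf

s = cudf.Series(["hello world"])
tokens, masks, metadata = s.str.subword_tokenize(
    "vocab_hash.txt", max_length=8, stride=8
)
# flat device buffers; reshape to (num_rows, max_length)
token_ids = tokens.reshape(-1, 8)
attention = masks.reshape(-1, 8)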
@@ -4576,8 +4638,17 @@ class StringColumn(column.ColumnBase): """Implements operations for Columns of String type """ + _start_offset: Optional[int] + _end_offset: Optional[int] + _cached_sizeof: Optional[int] + def __init__( - self, mask=None, size=None, offset=0, null_count=None, children=() + self, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] = (), ): """ Parameters @@ -4627,34 +4698,38 @@ def __init__( self._end_offset = None @property - def start_offset(self): + def start_offset(self) -> int: if self._start_offset is None: if ( len(self.base_children) == 2 and self.offset < self.base_children[0].size ): - self._start_offset = int(self.base_children[0][self.offset]) + self._start_offset = int( + self.base_children[0].element_indexing(self.offset) + ) else: self._start_offset = 0 return self._start_offset @property - def end_offset(self): + def end_offset(self) -> int: if self._end_offset is None: if ( len(self.base_children) == 2 and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( - self.base_children[0][self.offset + self.size] + self.base_children[0].element_indexing( + self.offset + self.size + ) ) else: self._end_offset = 0 return self._end_offset - def __sizeof__(self): + def __sizeof__(self) -> int: if self._cached_sizeof is None: n = 0 if len(self.base_children) == 2: @@ -4676,7 +4751,7 @@ def __sizeof__(self): return self._cached_sizeof @property - def base_size(self): + def base_size(self) -> int: if len(self.base_children) == 0: return 0 else: @@ -4685,7 +4760,13 @@ def base_size(self): / self.base_children[0].dtype.itemsize ) - def sum(self, skipna=None, dtype=None, min_count=0): + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + raise ValueError("Cannot get an array view of a StringColumn") + + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -4703,39 +4784,38 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) - def set_base_children(self, value): + def set_base_children(self, value: Tuple["column.ColumnBase", ...]): # TODO: Implement dtype validation of the children here somehow super().set_base_children(value) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: return True in self.str().contains(f"^{item}$") - def str(self, parent=None): + def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: builtins.str): raise TypeError( f"Series of dtype `str` cannot perform the operation: " f"{unaryop}" ) - def __len__(self): + def __len__(self) -> int: return self.size - def _set_mask(self, value): - super()._set_mask(value) - @property - def _nbytes(self): + def _nbytes(self) -> int: if self.size == 0: return 0 else: return self.children[1].size - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": out_dtype = np.dtype(dtype) if out_dtype.kind in {"i", "u"}: @@ -4775,42 +4855,49 @@ def _as_datetime_or_timedelta_column(self, dtype, format): return result_col - def as_datetime_column(self, dtype, format=None): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) 
-> "cudf.core.column.DatetimeColumn": out_dtype = np.dtype(dtype) + # infer on host from the first not na element + # or return all null column if all values + # are null in current column + format = kwargs.get("format", None) if format is None: - # infer on host from the first not na element - # or return all null column if all values - # are null in current column if self.null_count == len(self): - return column.column_empty( - len(self), dtype=out_dtype, masked=True + return cast( + "cudf.core.column.DatetimeColumn", + column.column_empty( + len(self), dtype=out_dtype, masked=True + ), ) else: - format = datetime.infer_format(self[self.notna()][0]) + format = datetime.infer_format( + self.apply_boolean_mask(self.notna()).element_indexing(0) + ) return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_timedelta_column(self, dtype, format=None): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": out_dtype = np.dtype(dtype) - - if format is None: - format = "%D days %H:%M:%S" - + format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_string_column(self, dtype): + def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: return self @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the StringColumn. """ return self.to_pandas().values @property - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. """ @@ -4818,7 +4905,7 @@ def values(self): "String Arrays is not yet implemented in cudf" ) - def to_array(self, fillna=None): + def to_array(self, fillna: bool = None) -> np.ndarray: """Get a dense numpy array for the data. 
Notes @@ -4851,8 +4938,8 @@ def __arrow_array__(self, type=None): "consider using .to_arrow()" ) - def serialize(self): - header = {"null_count": self.null_count} + def serialize(self) -> Tuple[dict, list]: + header = {"null_count": self.null_count} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["size"] = self.size @@ -4872,7 +4959,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> StringColumn: size = header["size"] if not isinstance(size, int): size = pickle.loads(size) @@ -4880,26 +4967,28 @@ def deserialize(cls, header, frames): # Deserialize the mask, value, and offset frames buffers = [Buffer(each_frame) for each_frame in frames] + nbuf = None if header["null_count"] > 0: nbuf = buffers[2] - else: - nbuf = None children = [] for h, b in zip(header["subheaders"], buffers[:2]): column_type = pickle.loads(h["type-serialized"]) children.append(column_type.deserialize(h, [b])) - col = column.build_column( - data=None, - dtype="str", - mask=nbuf, - children=tuple(children), - size=size, + col = cast( + StringColumn, + column.build_column( + data=None, + dtype="str", + mask=nbuf, + children=tuple(children), + size=size, + ), ) return col - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = np.dtype(to_dtype) if self.dtype == to_dtype: @@ -4911,7 +5000,12 @@ def can_cast_safely(self, to_dtype): else: return True - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> StringColumn: """ Return col with *to_replace* replaced with *value* """ @@ -4919,7 +5013,12 @@ def find_and_replace(self, to_replace, replacement, all_nan): replacement = column.as_column(replacement, dtype=self.dtype) return libcudf.replace.replace(self, to_replace, replacement) - def fillna(self, fill_value=None, method=None): + def fillna( + self, + fill_value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) @@ -4927,24 +5026,26 @@ def fillna(self, fill_value=None, method=None): else: return super().fillna(method=method) - def _find_first_and_last(self, value): + def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: found_indices = self.str().contains(f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) return first, last - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: return self._find_first_and_last(value)[0] - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return self._find_first_and_last(value)[1] - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> "column.ColumnBase": # fastpath: gpu scalar if isinstance(other, cudf.Scalar) and other.dtype == "object": return column.as_column(other, length=len(self)) - if isinstance(other, column.Column): + if isinstance(other, column.ColumnBase): return other.astype(self.dtype) elif isinstance(other, str) or other is None: col = utils.scalar_broadcast_to( @@ 
-4959,16 +5060,18 @@ def normalize_binop_value(self, other): else: raise TypeError(f"cannot broadcast {type(other)}") - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return None - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: builtins.str, rhs, reflect: bool = False + ) -> "column.ColumnBase": lhs = self if reflect: lhs, rhs = rhs, lhs if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": - return lhs.str().cat(others=rhs) + return cast("column.ColumnBase", lhs.str().cat(others=rhs)) elif op in ("eq", "ne", "gt", "lt", "ge", "le"): return _string_column_binop(self, rhs, op=op, out_dtype="bool") @@ -4977,7 +5080,7 @@ def binary_operator(self, op, rhs, reflect=False): ) @property - def is_unique(self): + def is_unique(self) -> bool: return len(self.unique()) == len(self) @property @@ -4986,19 +5089,17 @@ def __cuda_array_interface__(self): "Strings are not yet supported via `__cuda_array_interface__`" ) - def _mimic_inplace(self, other_col, inplace=False): - out = super()._mimic_inplace(other_col, inplace=inplace) - return out - @copy_docstring(column.ColumnBase.view) - def view(self, dtype): + def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) dtype = np.dtype(dtype) - str_byte_offset = self.base_children[0][self.offset] - str_end_byte_offset = self.base_children[0][self.offset + self.size] + str_byte_offset = self.base_children[0].element_indexing(self.offset) + str_end_byte_offset = self.base_children[0].element_indexing( + self.offset + self.size + ) char_dtype_size = self.base_children[1].dtype.itemsize n_bytes_to_view = ( @@ -5016,7 +5117,12 @@ def view(self, dtype): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _string_column_binop(lhs, rhs, op, out_dtype): +def _string_column_binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: Dtype, +) -> "column.ColumnBase": out = libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype) return out diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 9036f1e2962..f797bdf9635 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,6 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+from __future__ import annotations + import datetime as dt from numbers import Number +from typing import Any, Sequence, Tuple, Union, cast import numpy as np import pandas as pd @@ -9,6 +12,14 @@ import cudf from cudf import _lib as libcudf +from cudf._typing import ( + BinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, +) +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype @@ -24,7 +35,13 @@ class TimeDeltaColumn(column.ColumnBase): def __init__( - self, data, dtype, size, mask=None, offset=0, null_count=None + self, + data: Buffer, + dtype: Dtype, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -46,7 +63,9 @@ def __init__( dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") - + if size is None: + size = data.size // dtype.itemsize + size = size - offset super().__init__( data, size=size, @@ -61,7 +80,7 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: DatetimeLikeScalar) -> bool: try: item = np.timedelta64(item, self._time_unit) except ValueError: @@ -71,7 +90,7 @@ def __contains__(self, item): return False return item.view("int64") in self.as_numerical - def to_arrow(self): + def to_arrow(self) -> pa.Array: mask = None if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) @@ -84,7 +103,9 @@ def to_arrow(self): null_count=self.null_count, ) - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index=None, nullable: bool = False, **kwargs + ) -> pd.Series: # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -98,8 +119,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def _binary_op_floordiv(self, rhs): - lhs, rhs = self, rhs + def _binary_op_floordiv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -122,7 +145,7 @@ def _binary_op_floordiv(self, rhs): return lhs, rhs, out_dtype - def _binary_op_mul(self, rhs): + def _binary_op_mul(self, rhs: BinaryOperand) -> DtypeObj: if rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -132,7 +155,7 @@ def _binary_op_mul(self, rhs): ) return out_dtype - def _binary_op_mod(self, rhs): + def _binary_op_mod(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(self.dtype, rhs.dtype) elif rhs.dtype.kind in ("f", "i", "u"): @@ -144,7 +167,7 @@ def _binary_op_mod(self, rhs): ) return out_dtype - def _binary_op_eq_ne(self, rhs): + def _binary_op_eq_ne(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = np.bool else: @@ -154,7 +177,7 @@ def _binary_op_eq_ne(self, rhs): ) return out_dtype - def _binary_op_lt_gt_le_ge(self, rhs): + def _binary_op_lt_gt_le_ge(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): return np.bool else: @@ -163,8 +186,10 @@ def _binary_op_lt_gt_le_ge(self, rhs): f" and {rhs.dtype}" ) - def _binary_op_truediv(self, rhs): - lhs, rhs = self, rhs + def 
_binary_op_truediv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -187,7 +212,9 @@ def _binary_op_truediv(self, rhs): return lhs, rhs, out_dtype - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: str, rhs: BinaryOperand, reflect: bool = False + ) -> "column.ColumnBase": lhs, rhs = self, rhs if op in ("eq", "ne"): @@ -199,14 +226,14 @@ def binary_operator(self, op, rhs, reflect=False): elif op == "mod": out_dtype = self._binary_op_mod(rhs) elif op == "truediv": - lhs, rhs, out_dtype = self._binary_op_truediv(rhs) + lhs, rhs, out_dtype = self._binary_op_truediv(rhs) # type: ignore elif op == "floordiv": - lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) + lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) # type: ignore op = "truediv" elif op == "add": - out_dtype = _timedelta_binary_op_add(lhs, rhs) + out_dtype = _timedelta_add_result_dtype(lhs, rhs) elif op == "sub": - out_dtype = _timedelta_binary_op_sub(lhs, rhs) + out_dtype = _timedelta_sub_result_dtype(lhs, rhs) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " @@ -214,10 +241,11 @@ def binary_operator(self, op, rhs, reflect=False): ) if reflect: - lhs, rhs = rhs, lhs + lhs, rhs = rhs, lhs # type: ignore + return binop(lhs, rhs, op=op, out_dtype=out_dtype) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> BinaryOperand: if isinstance(other, cudf.Scalar): return other @@ -247,30 +275,34 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ return np.timedelta64("nat", self.time_unit) @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> TimeDeltaColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) - col = self + col = self # type: column.ColumnBase if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): dtype = determine_out_dtype(self.dtype, fill_value.dtype) @@ -280,51 +312,61 @@ def fillna(self, fill_value=None, method=None): fill_value = cudf.Scalar(fill_value, dtype=dtype) else: fill_value = column.as_column(fill_value, nan_as_null=False) - - return ColumnBase.fillna(col, fill_value) + return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) else: return super().fillna(method=method) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_datetime_column(self, dtype, 
**kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise TypeError( f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]" ) - def as_string_column(self, dtype, **kwargs): - - if not kwargs.get("format"): - fmt = _dtype_to_format_conversion.get( + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": + if format is None: + format = _dtype_to_format_conversion.get( self.dtype.name, "%D days %H:%M:%S" ) - kwargs["format"] = fmt if len(self) > 0: return string._timedelta_to_str_typecast_functions[ np.dtype(self.dtype) - ](self, **kwargs) + ](self, format=format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def median(self, skipna=None): + def median(self, skipna: bool = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -332,7 +374,9 @@ def quantile(self, q, interpolation, exact): return pd.Timedelta(result, unit=self.time_unit) return result.astype(self.dtype) - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count=0 + ) -> pd.Timedelta: if len(self) == 0: return pd.Timedelta(None, unit=self.time_unit) else: @@ -343,13 +387,15 @@ def sum(self, skipna=None, dtype=None, min_count=0): unit=self.time_unit, ) - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), unit=self.time_unit, ) - def components(self, index=None): + def components(self, index=None) -> "cudf.DataFrame": """ Return a Dataframe of the components of the Timedeltas. @@ -443,7 +489,7 @@ def components(self, index=None): ) @property - def days(self): + def days(self) -> "cudf.core.column.NumericalColumn": """ Number of days for each element. @@ -456,7 +502,7 @@ def days(self): ) @property - def seconds(self): + def seconds(self) -> "cudf.core.column.NumericalColumn": """ Number of seconds (>= 0 and less than 1 day). @@ -479,7 +525,7 @@ def seconds(self): ) @property - def microseconds(self): + def microseconds(self) -> "cudf.core.column.NumericalColumn": """ Number of microseconds (>= 0 and less than 1 second). @@ -499,7 +545,7 @@ def microseconds(self): ) @property - def nanoseconds(self): + def nanoseconds(self) -> "cudf.core.column.NumericalColumn": """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. 
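
For orientation, the reductions annotated above (mean, median, std, sum) are computed through as_numerical and returned as host-side pandas.Timedelta scalars, while the component accessors (days, seconds, microseconds, nanoseconds) stay on device as NumericalColumns. A minimal sketch of the resulting user-visible behaviour, assuming a small millisecond-resolution series (values chosen only for illustration):

import cudf

s = cudf.Series([1000, 2000, 3000], dtype="timedelta64[ms]")
s.mean()         # host-side pandas Timedelta (here 2 seconds)
s.dt.seconds     # device-side cudf Series of integer seconds
s.dt.components  # cudf DataFrame with days, hours, ..., nanoseconds columns
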
@@ -524,12 +570,17 @@ def nanoseconds(self): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: DtypeObj, +) -> "cudf.core.column.ColumnBase": out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out -def determine_out_dtype(lhs_dtype, rhs_dtype): +def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): return rhs_dtype elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): @@ -538,7 +589,9 @@ def determine_out_dtype(lhs_dtype, rhs_dtype): raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") -def _timedelta_binary_op_add(lhs, rhs): +def _timedelta_add_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) elif pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -557,7 +610,9 @@ def _timedelta_binary_op_add(lhs, rhs): return out_dtype -def _timedelta_binary_op_sub(lhs, rhs): +def _timedelta_sub_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype( lhs.dtype ) and pd.api.types.is_timedelta64_dtype(rhs.dtype): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c750cc92f30..f5823528d02 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,5 +1,11 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations + import itertools +from collections import OrderedDict from collections.abc import MutableMapping +from typing import TYPE_CHECKING, Any, Tuple, Union import pandas as pd @@ -11,9 +17,22 @@ to_nested_dict, ) +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnAccessor(MutableMapping): - def __init__(self, data=None, multiindex=False, level_names=None): + + _data: "OrderedDict[Any, ColumnBase]" + multiindex: bool + _level_names: Tuple[Any, ...] 
+ + def __init__( + self, + data: Union[MutableMapping, ColumnAccessor] = None, + multiindex: bool = False, + level_names=None, + ): """ Parameters ---------- @@ -33,7 +52,7 @@ def __init__(self, data=None, multiindex=False, level_names=None): if isinstance(data, ColumnAccessor): multiindex = multiindex or data.multiindex level_names = level_names or data.level_names - self._data = data + self._data = data._data self.multiindex = multiindex self._level_names = level_names @@ -44,21 +63,21 @@ def __init__(self, data=None, multiindex=False, level_names=None): def __iter__(self): return self._data.__iter__() - def __getitem__(self, key): + def __getitem__(self, key: Any) -> ColumnBase: return self._data[key] - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) self._clear_cache() - def __delitem__(self, key): + def __delitem__(self, key: Any): self._data.__delitem__(key) self._clear_cache() - def __len__(self): + def __len__(self) -> int: return len(self._data) - def __repr__(self): + def __repr__(self) -> str: data_repr = self._data.__repr__() multiindex_repr = self.multiindex.__repr__() level_names_repr = self.level_names.__repr__() @@ -70,14 +89,14 @@ def __repr__(self): ) @property - def level_names(self): + def level_names(self) -> Tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: return self._level_names @property - def nlevels(self): + def nlevels(self) -> int: if len(self._data) == 0: return 0 if not self.multiindex: @@ -86,28 +105,28 @@ def nlevels(self): return len(next(iter(self.keys()))) @property - def name(self): + def name(self) -> Any: if len(self._data) == 0: return None return self.level_names[-1] @property - def nrows(self): + def nrows(self) -> int: if len(self._data) == 0: return 0 else: return len(next(iter(self.values()))) @cached_property - def names(self): + def names(self) -> Tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self): + def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self): + def _grouped_data(self) -> MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. @@ -125,7 +144,7 @@ def _clear_cache(self): except AttributeError: pass - def to_pandas_index(self): + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. """ @@ -142,7 +161,7 @@ def to_pandas_index(self): result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name, value, loc=-1): + def insert(self, name: Any, value: Any, loc: int = -1): """ Insert column into the ColumnAccessor at the specified location. @@ -176,10 +195,10 @@ def insert(self, name, value, loc=-1): else: new_keys = self.names[:loc] + (name,) + self.names[loc:] new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = self._data.__class__(zip(new_keys, new_values),) + self._data = self._data.__class__(zip(new_keys, new_values)) self._clear_cache() - def copy(self, deep=False): + def copy(self, deep=False) -> ColumnAccessor: """ Make a copy of this ColumnAccessor. """ @@ -195,7 +214,7 @@ def copy(self, deep=False): level_names=self.level_names, ) - def select_by_label(self, key): + def select_by_label(self, key: Any) -> ColumnAccessor: """ Return a subset of this column accessor, composed of the keys specified by `key`. 
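
ColumnAccessor is cudf's internal ordered mapping from column labels to columns, which is what the annotations above describe. A rough sketch of that mapping surface (internal API, subject to change; as_column is the existing cudf column constructor, and the commented results are what the annotated signatures imply):

from cudf.core.column import as_column
from cudf.core.column_accessor import ColumnAccessor

ca = ColumnAccessor({"a": as_column([1, 2]), "b": as_column([3, 4])})
ca.names              # ("a", "b"), a Tuple[Any, ...]
ca.nrows              # 2
ca.to_pandas_index()  # pd.Index(["a", "b"])
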
@@ -218,7 +237,7 @@ def select_by_label(self, key): return self._select_by_label_with_wildcard(key) return self._select_by_label_grouped(key) - def select_by_index(self, index): + def select_by_index(self, index: Any) -> ColumnAccessor: """ Return a ColumnAccessor composed of the columns specified by index. @@ -243,7 +262,7 @@ def select_by_index(self, index): data, multiindex=self.multiindex, level_names=self.level_names, ) - def set_by_label(self, key, value): + def set_by_label(self, key: Any, value: Any): """ Add (or modify) column by name. @@ -256,14 +275,14 @@ def set_by_label(self, key, value): self._data[key] = value self._clear_cache() - def _select_by_label_list_like(self, key): + def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: return self.__class__( to_flat_dict({k: self._grouped_data[k] for k in key}), multiindex=self.multiindex, level_names=self.level_names, ) - def _select_by_label_grouped(self, key): + def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result = self._grouped_data[key] if isinstance(result, cudf.core.column.ColumnBase): return self.__class__({key: result}) @@ -277,7 +296,7 @@ def _select_by_label_grouped(self, key): level_names=self.level_names[len(key) :], ) - def _select_by_label_slice(self, key): + def _select_by_label_slice(self, key: slice) -> ColumnAccessor: start, stop = key.start, key.stop if key.step is not None: raise TypeError("Label slicing with step is not supported") @@ -303,7 +322,7 @@ def _select_by_label_slice(self, key): level_names=self.level_names, ) - def _select_by_label_with_wildcard(self, key): + def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor: key = self._pad_key(key, slice(None)) return self.__class__( {k: self._data[k] for k in self._data if _compare_keys(k, key)}, @@ -311,7 +330,7 @@ def _select_by_label_with_wildcard(self, key): level_names=self.level_names, ) - def _pad_key(self, key, pad_value=""): + def _pad_key(self, key: Any, pad_value="") -> Any: """ Pad the provided key to a length equal to the number of levels. @@ -323,7 +342,7 @@ def _pad_key(self, key, pad_value=""): return key + (pad_value,) * (self.nlevels - len(key)) -def _compare_keys(target, key): +def _compare_keys(target: Any, key: Any) -> bool: """ Compare `key` to `target`. 
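
The wildcard selection path above pads a partial key with slice(None) via _pad_key and then matches it against each stored tuple key with _compare_keys. Since _compare_keys is only shown by signature here, the following is a simplified standalone sketch of that matching logic under those assumptions, not the exact implementation:

def pad_key(key, nlevels, pad_value=""):
    # Mirrors _pad_key: promote scalars to tuples, then pad to nlevels.
    if not isinstance(key, tuple):
        key = (key,)
    return key + (pad_value,) * (nlevels - len(key))

def compare_keys(target, key):
    # A slice(None) component acts as a wildcard at that level.
    return all(k == slice(None) or t == k for t, k in zip(target, key))

# Selecting "a" from two-level keys matches ("a", "x") but not ("b", "x"):
assert compare_keys(("a", "x"), pad_key("a", 2, slice(None)))
assert not compare_keys(("b", "x"), pad_key("a", 2, slice(None)))
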
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9b61a60830..e5626190098 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -9,6 +9,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Set import cupy import numpy as np @@ -2364,7 +2365,7 @@ def iteritems(self): for k in self: yield (k, self[k]) - @property + @property # type: ignore @annotate("DATAFRAME_LOC", color="blue", domain="cudf_python") def loc(self): """ @@ -2535,14 +2536,14 @@ def at(self): """ return self.loc - @property + @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): """Returns a tuple of columns """ return self._data.to_pandas_index() - @columns.setter + @columns.setter # type: ignore @annotate("DATAFRAME_COLUMNS_SETTER", color="yellow", domain="cudf_python") def columns(self, columns): if isinstance(columns, (cudf.MultiIndex, cudf.Index)): @@ -4229,14 +4230,13 @@ def join( ) return df - @copy_docstring(DataFrameGroupBy.__init__) def groupby( self, by=None, axis=0, level=None, as_index=True, - sort=True, + sort=False, group_keys=True, squeeze=False, observed=False, @@ -4274,7 +4274,6 @@ def groupby( sort=sort, ) - @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -7272,7 +7271,7 @@ def equals(self, other): return False return super().equals(other) - _accessors = set() + _accessors = set() # type: Set[Any] def from_pandas(obj, nan_as_null=None): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index ccd92de69fc..b89b3ddb2be 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -2,6 +2,7 @@ import decimal import pickle +from typing import Any import numpy as np import pandas as pd @@ -9,10 +10,11 @@ from pandas.api.extensions import ExtensionDtype import cudf +from cudf._typing import Dtype class CategoricalDtype(ExtensionDtype): - def __init__(self, categories=None, ordered=None): + def __init__(self, categories=None, ordered: bool = None) -> None: """ dtype similar to pd.CategoricalDtype with the categories stored on the GPU. 
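
The dataframe.py hunks above change DataFrame.groupby to default to sort=False, matching the GroupBy, DataFrameGroupBy and SeriesGroupBy changes later in this diff: group keys are no longer sorted unless explicitly requested. A minimal usage sketch of the new default (data chosen for illustration; group order in the unsorted case is unspecified):

import cudf

df = cudf.DataFrame({"a": [2, 1, 1], "b": [10, 20, 30]})
df.groupby("a").sum()             # faster; key order not guaranteed
df.groupby("a", sort=True).sum()  # pandas-like sorted group keys
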
@@ -21,7 +23,7 @@ def __init__(self, categories=None, ordered=None): self.ordered = ordered @property - def categories(self): + def categories(self) -> "cudf.core.index.Index": if self._categories is None: return cudf.core.index.as_index( cudf.core.column.column_empty(0, dtype="object", masked=False) @@ -41,23 +43,23 @@ def str(self): return "|O08" @classmethod - def from_pandas(cls, dtype): + def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) - def to_pandas(self): + def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: categories = self.categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - def _init_categories(self, categories): + def _init_categories(self, categories: Any): if categories is None: return categories if len(categories) == 0: - dtype = "object" + dtype = "object" # type: Any else: dtype = None @@ -68,7 +70,7 @@ def _init_categories(self, categories): else: return column - def __eq__(self, other): + def __eq__(self, other: Dtype) -> bool: if isinstance(other, str): return other == self.name elif other is self: @@ -111,10 +113,10 @@ def deserialize(cls, header, frames): class ListDtype(ExtensionDtype): + _typ: pa.ListType + name: str = "list" - name = "list" - - def __init__(self, element_type): + def __init__(self, element_type: Any) -> None: if isinstance(element_type, ListDtype): self._typ = pa.list_(element_type._typ) else: @@ -124,7 +126,7 @@ def __init__(self, element_type): self._typ = pa.list_(element_type) @property - def element_type(self): + def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) else: @@ -220,18 +222,47 @@ def __hash__(self): return hash(self._typ) -class DecimalDtype(ExtensionDtype): +class Decimal64Dtype(ExtensionDtype): name = "decimal" _metadata = ("precision", "scale") + _MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) - def __init__(self, precision, scale): + def __init__(self, precision, scale=0): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. 
+ """ + self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @property def precision(self): return self._typ.precision + @precision.setter + def precision(self, value): + self._validate(value, self.scale) + self._typ = pa.decimal128(precision=value, scale=self.scale) + @property def scale(self): return self._typ.scale @@ -248,5 +279,25 @@ def to_arrow(self): def from_arrow(cls, typ): return cls(typ.precision, typ.scale) + @property + def itemsize(self): + return 8 + + def __repr__(self): + return ( + f"{self.__class__.__name__}" + f"(precision={self.precision}, scale={self.scale})" + ) + def __hash__(self): return hash(self._typ) + + @classmethod + def _validate(cls, precision, scale=0): + if precision > Decimal64Dtype._MAX_PRECISION: + raise ValueError( + f"Cannot construct a {cls.__name__}" + f" with precision > {cls._MAX_PRECISION}" + ) + if abs(scale) > precision: + raise ValueError(f"scale={scale} exceeds precision={precision}") diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ad4069dfb68..3d12ac2e6cc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -5,6 +5,7 @@ import operator import warnings from collections import OrderedDict, abc as abc +from typing import overload import cupy import numpy as np @@ -12,6 +13,7 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal +from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -39,9 +41,23 @@ class Frame(libcudf.table.Table): """ @classmethod - def _from_table(cls, table): + def _from_table(cls, table: "Frame"): return cls(table._data, index=table._index) + @overload + def _mimic_inplace(self, result: "Frame") -> "Frame": + ... + + @overload + def _mimic_inplace(self, result: "Frame", inplace: Literal[True]): + ... + + @overload + def _mimic_inplace( + self, result: "Frame", inplace: Literal[False] + ) -> "Frame": + ... + def _mimic_inplace(self, result, inplace=False): if inplace: for col in self._data: @@ -1296,7 +1312,9 @@ def dropna( 0 Alfred Batmobile 1940-04-25 """ if axis == 0: - result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + result = self._drop_na_rows( + how=how, subset=subset, thresh=thresh, drop_nan=True + ) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -1443,7 +1461,9 @@ def fillna( return self._mimic_inplace(result, inplace=inplace) - def _drop_na_rows(self, how="any", subset=None, thresh=None): + def _drop_na_rows( + self, how="any", subset=None, thresh=None, drop_nan=False + ): """ Drops null rows from `self`. 
@@ -1475,12 +1495,23 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
             ]
             if len(subset_cols) == 0:
                 return self.copy(deep=True)
-        result = self.__class__._from_table(
+
+        frame = self.copy(deep=False)
+        if drop_nan:
+            for name, col in frame._data.items():
+                if name in subset and isinstance(
+                    col, cudf.core.column.NumericalColumn
+                ):
+                    frame._data[name] = col.nans_to_nulls()
+                else:
+                    frame._data[name] = col
+
+        result = frame.__class__._from_table(
             libcudf.stream_compaction.drop_nulls(
-                self, how=how, keys=subset, thresh=thresh
+                frame, how=how, keys=subset, thresh=thresh
             )
         )
-        result._postprocess_columns(self)
+        result._postprocess_columns(frame)
         return result

     def _drop_na_columns(self, how="any", subset=None, thresh=None):
@@ -1501,7 +1532,10 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
             thresh = len(df)

         for col in self._data.names:
-            if (len(df[col]) - df[col].null_count) < thresh:
+            no_threshold_valid_count = (
+                len(df[col]) - df[col].nans_to_nulls().null_count
+            ) < thresh
+            if no_threshold_valid_count:
                 continue
             out_cols.append(col)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b60815722c6..8af3b6f1d81 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -18,7 +18,7 @@ class GroupBy(Serializable):
     _MAX_GROUPS_BEFORE_WARN = 100

     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group a DataFrame or Series by a set of columns.
@@ -37,9 +37,9 @@ def __init__(
         level : int, level_name or list, optional
             For objects with a MultiIndex, `level` can be used to specify
             grouping by one or more levels of the MultiIndex.
-        sort : True, optional
-            If True (default), sort results by group9s). Note that
-            unlike Pandas, this also sorts values within each group.
+        sort : bool, default False
+            Sort the result by group keys. Unlike Pandas, cudf defaults
+            to ``False`` for better performance.
         as_index : bool, optional
             If as_index=True (default), the group names appear
             as the keys of the resulting DataFrame.
@@ -101,7 +101,7 @@ def size(self):
                     len(self.obj), "int8", masked=False
                 )
             )
-            .groupby(self.grouping)
+            .groupby(self.grouping, sort=self._sort)
             .agg("size")
         )

@@ -126,12 +126,13 @@ def agg(self, func):
         Examples
         --------
         >>> import cudf
-        >>> a = cudf.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3]})
+        >>> a = cudf.DataFrame(
+            {'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [2, 2, 1]})
         >>> a.groupby('a').agg('sum')
            b
         a
-        1  3
         2  3
+        1  3

         Specifying a list of aggregations to perform on each column.

@@ -139,8 +140,8 @@ def agg(self, func):
            b       c
            sum min sum min
         a
-        1  3   1   4   2
         2  3   3   1   1
+        1  3   1   4   2

         Using a dict to specify aggregations to perform per column.

@@ -148,8 +149,8 @@ def agg(self, func):
            a       b
            max min mean
         a
-        1  1   1   1.5
         2  2   3   3.0
+        1  1   1   1.5

         Using lambdas/callables to specify aggregations taking parameters.

@@ -591,7 +592,7 @@ def rolling(self, *args, **kwargs):

 class DataFrameGroupBy(GroupBy):
     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group DataFrame using a mapper or by a Series of columns.
@@ -618,10 +619,11 @@ def __init__(
             For aggregated output, return object with group labels as the
             index. Only relevant for DataFrame input.
             as_index=False is effectively “SQL-style” grouped output.
-        sort : bool, default True
-            Sort group keys. Get better performance by turning this off.
-            Note this does not influence the order of observations within each
-            group. Groupby preserves the order of rows within each group.
+        sort : bool, default False
+            Sort result by group key. Unlike Pandas, cudf defaults to
+            ``False`` for better performance. Note this does not influence
+            the order of observations within each group. Groupby preserves
+            the order of rows within each group.
         dropna : bool, optional
             If True (default), do not include the "null" group.
@@ -670,8 +672,8 @@ def __init__(
        >>> df.groupby(level="Type").mean()
                Max Speed
        Type
-       Captive      210.0
        Wild         185.0
+       Captive      210.0

        """

        super().__init__(
@@ -689,12 +691,14 @@ def __getattribute__(self, key):
         except AttributeError:
             if key in self.obj:
                 return self.obj[key].groupby(
-                    self.grouping, dropna=self._dropna
+                    self.grouping, dropna=self._dropna, sort=self._sort
                 )
             raise

     def __getitem__(self, key):
-        return self.obj[key].groupby(self.grouping, dropna=self._dropna)
+        return self.obj[key].groupby(
+            self.grouping, dropna=self._dropna, sort=self._sort
+        )

     def nunique(self):
         """
@@ -705,7 +709,7 @@ class SeriesGroupBy(GroupBy):
     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group Series using a mapper or by a Series of columns.
@@ -732,10 +736,11 @@ def __init__(
             For aggregated output, return object with group labels as the
             index. Only relevant for DataFrame input.
             as_index=False is effectively “SQL-style” grouped output.
-        sort : bool, default True
-            Sort group keys. Get better performance by turning this off.
-            Note this does not influence the order of observations within each
-            group. Groupby preserves the order of rows within each group.
+        sort : bool, default False
+            Sort result by group key. Unlike Pandas, cudf defaults to
+            ``False`` for better performance. Note this does not influence
+            the order of observations within each group. Groupby preserves
+            the order of rows within each group.

         Returns
         -------
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 219d355d3cc..e3899a403f1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2018-2020, NVIDIA CORPORATION.
-from __future__ import division, print_function +from __future__ import annotations, division, print_function import pickle from numbers import Number +from typing import Any, Dict, Set, Type import cupy import numpy as np @@ -132,6 +133,13 @@ def __init__( """ pass + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError() + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed @@ -1485,7 +1493,11 @@ def _from_table(cls, table): else: return as_index(table) - _accessors = set() + _accessors = set() # type: Set[Any] + + @property + def _constructor_expanddim(self): + return cudf.MultiIndex class RangeIndex(Index): @@ -1773,7 +1785,7 @@ def find_label_range(self, first=None, last=None): return begin, end - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2028,7 +2040,7 @@ def __getitem__(self, index): else: return res - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2705,15 +2717,11 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._values, parent=self) - @property - def _constructor_expanddim(self): - return cudf.MultiIndex - def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object @@ -2725,7 +2733,7 @@ def _clean_nulls_from_index(self): return self -def as_index(arbitrary, **kwargs): +def as_index(arbitrary, **kwargs) -> Index: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2794,7 +2802,7 @@ def as_index(arbitrary, **kwargs): np.uint64: UInt64Index, np.float32: Float32Index, np.float64: Float64Index, -} +} # type: Dict[Any, Type[Index]] _index_to_dtype = { Int8Index: np.int8, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 3872e296ed5..4ea32c77724 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -329,6 +329,9 @@ def _dispatch_scalar_unaop(self, op): return np.ceil(self.value) return getattr(self.value, op)() + def astype(self, dtype): + return Scalar(self.device_value, dtype) + class _NAType(object): def __init__(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 147262be08d..dfc687eb76d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5,6 +5,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size +from typing import Any, Set from uuid import uuid4 import cupy @@ -43,6 +44,7 @@ from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, + is_decimal_dtype, is_list_dtype, is_list_like, is_mixed_with_object_dtype, @@ -1099,6 +1101,7 @@ def __repr__(self): preprocess._column, cudf.core.column.CategoricalColumn ) and not is_list_dtype(preprocess.dtype) + and not is_decimal_dtype(preprocess.dtype) ) or isinstance( preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn ): @@ -1705,17 +1708,17 @@ def __neg__(self): """ return self.__mul__(-1) - @copy_docstring(CategoricalAccessor.__init__) + @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): return CategoricalAccessor(column=self._column, 
parent=self) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._column, parent=self) - @copy_docstring(ListMethods.__init__) + @copy_docstring(ListMethods.__init__) # type: ignore @property def list(self): return ListMethods(column=self._column, parent=self) @@ -4180,7 +4183,7 @@ def groupby( axis=0, level=None, as_index=True, - sort=True, + sort=False, group_keys=True, squeeze=False, observed=False, @@ -4442,7 +4445,7 @@ def keys(self): """ return self.index - _accessors = set() + _accessors = set() # type: Set[Any] truediv_int_dtype_corrections = { diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index a6ce2c85e42..7d1ab3a5435 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -374,7 +374,7 @@ def __init__(self, groupby, window, min_periods=None, center=False): self._group_keys = groupby.grouping.keys.take(sort_order) obj = groupby.obj.take(sort_order) - gb_size = groupby.size() + gb_size = groupby.size().sort_index() self._group_starts = ( gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index d6e0fedf8e0..7c8455b6575 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -7,9 +7,11 @@ from pyarrow import orc as orc import cudf -from cudf import _lib as libcudf +from cudf._lib import orc as liborc from cudf.utils import ioutils -from cudf.utils.metadata import orc_column_statistics_pb2 as cs_pb2 +from cudf.utils.metadata import ( # type: ignore + orc_column_statistics_pb2 as cs_pb2, +) def _make_empty_df(filepath_or_buffer, columns): @@ -127,7 +129,7 @@ def read_orc_statistics( column_names, raw_file_statistics, raw_stripes_statistics, - ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer) + ) = liborc.read_raw_orc_statistics(filepath_or_buffer) # Parse column names column_names = [ @@ -257,7 +259,7 @@ def read_orc( if engine == "cudf": df = DataFrame._from_table( - libcudf.orc.read_orc( + liborc.read_orc( filepath_or_buffer, columns, stripes, @@ -324,9 +326,9 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.orc.write_orc(df, file_obj, compression, enable_statistics) + liborc.write_orc(df, file_obj, compression, enable_statistics) else: - libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics) + liborc.write_orc(df, path_or_buf, compression, enable_statistics) -ORCWriter = libcudf.orc.ORCWriter +ORCWriter = liborc.ORCWriter diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index bf8898825c0..2048e574acc 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,5 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + from typing import Union import numpy as np diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 93bc6d1c573..c821755f670 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1,7 +1,8 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
from __future__ import division +import decimal import operator import random from itertools import product @@ -1567,3 +1568,161 @@ def test_binops_with_NA_consistent(dtype, op): assert (result == expect_all).all() elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES: assert result._column.null_count == len(data) + + +@pytest.mark.parametrize( + "args", + [ + ( + operator.add, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["3.0", "4.0"], + cudf.Decimal64Dtype(scale=2, precision=3), + ), + ( + operator.add, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["3.75", "3.005"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["100.1", "200.2"], + cudf.Decimal64Dtype(scale=3, precision=9), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", "0.995"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", "0.995"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["99.9", "199.8"], + cudf.Decimal64Dtype(scale=3, precision=9), + ), + ( + operator.mul, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", "3.0"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["2.25", "6.0"], + cudf.Decimal64Dtype(scale=5, precision=7), + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["10.0", "40.0"], + cudf.Decimal64Dtype(scale=1, precision=8), + ), + ( + operator.mul, + ["1000", "2000"], + cudf.Decimal64Dtype(scale=-3, precision=4), + ["0.343", "0.500"], + cudf.Decimal64Dtype(scale=3, precision=3), + ["343.0", "1000.0"], + cudf.Decimal64Dtype(scale=0, precision=8), + ), + ( + operator.add, + ["1.5", None, "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", None, "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["3.0", None, "4.0"], + cudf.Decimal64Dtype(scale=2, precision=3), + ), + ( + operator.add, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["3.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.mul, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["2.25", None], + cudf.Decimal64Dtype(scale=5, precision=7), + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", None], + cudf.Decimal64Dtype(scale=3, 
precision=4), + ["10.0", None], + cudf.Decimal64Dtype(scale=1, precision=8), + ), + ], +) +def test_binops_decimal(args): + op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args + + def decimal_series(input, dtype): + return cudf.Series( + [x if x is None else decimal.Decimal(x) for x in input], + dtype=dtype, + ) + + a = decimal_series(lhs, l_dtype) + b = decimal_series(rhs, r_dtype) + expect = decimal_series(expect, expect_dtype) + + got = op(a, b) + assert expect.dtype == got.dtype + utils.assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 62427cc593e..964e79a57b0 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. + import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py new file mode 100644 index 00000000000..f73a785727b --- /dev/null +++ b/python/cudf/cudf/tests/test_decimal.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from decimal import Decimal + +import pyarrow as pa +import pytest + +from cudf.core.column import DecimalColumn + + +@pytest.mark.parametrize( + "data", + [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 1729, 4104], + [1, 2, None, 4], + [None, None, None], + [], + ], +) +@pytest.mark.parametrize( + "typ", + [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), + ], +) +def test_round_trip_decimal_column(data, typ): + pa_arr = pa.array(data, type=typ) + col = DecimalColumn.from_arrow(pa_arr) + assert pa_arr.equals(col.to_arrow()) + + +def test_from_arrow_max_precision(): + with pytest.raises(ValueError): + DecimalColumn.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) + ) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3482b314fee..08378361188 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -108,7 +108,7 @@ def test_dropna_with_all_nulls(how, data, axis): def test_dropna_nan_as_null(): sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) - assert_eq(sr.dropna(), sr[:3]) + assert_eq(sr.dropna(), sr[:2]) sr = sr.nans_to_nulls() assert_eq(sr.dropna(), sr[:2]) @@ -120,7 +120,7 @@ def test_dropna_nan_as_null(): ) got = df.dropna() - expected = df[:3] + expected = df[:2] assert_eq(expected, got) df = df.nans_to_nulls() @@ -210,13 +210,28 @@ def test_dropna_thresh_cols(thresh, subset, inplace): ) -def test_dropna_dataframe_np_nan(): - import numpy as np - - import cudf - - data = {"key": [1, 2], "val": [np.nan, 3]} +@pytest.mark.parametrize( + "data", + [ + { + "key": [1, 2, 10], + "val": cudf.Series([np.nan, 3, 1], nan_as_null=False), + "abc": [np.nan, None, 1], + }, + { + "key": [None, 2, 1], + "val": cudf.Series([3, np.nan, 0.1], nan_as_null=True), + "abc": [None, 1, None], + }, + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +def test_dropna_dataframe_np_nan(data, axis): gdf = cudf.DataFrame(data) - pdf = pd.DataFrame(data) + pd_data = { + key: value.to_pandas() if isinstance(value, cudf.Series) else value + for key, value in data.items() + } + pdf = pd.DataFrame(pd_data) - assert_eq(pdf.dropna(), gdf.dropna(), check_dtype=False) + 
assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 4b5867c073f..32cecec3f60 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -8,9 +8,9 @@ import cudf from cudf.core.dtypes import ( CategoricalDtype, + Decimal64Dtype, ListDtype, StructDtype, - DecimalDtype, ) from cudf.tests.utils import assert_eq @@ -136,6 +136,12 @@ def test_struct_dtype_fields(fields): def test_decimal_dtype(): - dt = DecimalDtype(4, 2) + dt = Decimal64Dtype(4, 2) assert dt.to_arrow() == pa.decimal128(4, 2) - assert dt == DecimalDtype.from_arrow(pa.decimal128(4, 2)) + assert dt == Decimal64Dtype.from_arrow(pa.decimal128(4, 2)) + + +def test_max_precision(): + Decimal64Dtype(scale=0, precision=18) + with pytest.raises(ValueError): + Decimal64Dtype(scale=0, precision=19) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b42586f4137..294443500a9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -65,9 +65,15 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .mean() + ) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .mean() ) assert_eq(got_df, expect_df) @@ -78,12 +84,12 @@ def test_groupby_mean_3level(nelem): bys = list("xyz") got_df = ( make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) + .groupby(bys, sort=True) .mean() ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) + .groupby(bys, sort=True) .mean() ) assert_eq(got_df, expect_df) @@ -93,12 +99,12 @@ def test_groupby_mean_3level(nelem): def test_groupby_agg_mean_min(nelem): got_df = ( make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg(["mean", "min"]) ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg(["mean", "min"]) ) assert_eq(got_df, expect_df) @@ -108,12 +114,12 @@ def test_groupby_agg_mean_min(nelem): def test_groupby_agg_min_max_dictargs(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": "min", "b": "max"}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": "min", "b": "max"}) ) assert_eq(expect_df, got_df) @@ -123,12 +129,12 @@ def test_groupby_agg_min_max_dictargs(nelem): def test_groupby_agg_min_max_dictlist(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) assert_eq(got_df, expect_df) @@ -141,17 +147,23 @@ def test_groupby_agg_min_max_dictlist(nelem): def test_groupby_2keys_agg(nelem, func): # gdf 
(Note: lack of multiIndex) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) + ) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) ) - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) + pdf = pdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) assert_eq(pdf, gdf) @@ -162,8 +174,12 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): ) gdf = cudf.from_pandas(pdf) - gdf = gdf.groupby(["a", "b"], as_index=as_index).agg({"c": "mean"}) - pdf = pdf.groupby(["a", "b"], as_index=as_index).agg({"c": "mean"}) + gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) + pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) if as_index: assert_eq(pdf, gdf) @@ -174,14 +190,14 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y").agg({"x": "mean"}) - pdf = pdf.groupby("y").agg({"x": "mean"}) + gdf = gdf.groupby("y", sort=True).agg({"x": "mean"}) + pdf = pdf.groupby("y", sort=True).agg({"x": "mean"}) assert_eq(pdf, gdf) def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True).sum() - pdf = pdf.groupby("y", group_keys=True).sum() + gdf = gdf.groupby("y", group_keys=True, sort=True).sum() + pdf = pdf.groupby("y", group_keys=True, sort=True).sum() assert_eq(pdf, gdf) @@ -189,12 +205,21 @@ def test_group_keys_true(pdf, gdf): def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum()) - assert_eq(pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum()) - assert_eq(pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum()) assert_eq( - pdf.groupby(["x", "y"], as_index=as_index).sum(), - gdf.groupby(["x", "y"], as_index=as_index).sum(), + pdf.groupby("x", sort=True)["y"].sum(), + gdf.groupby("x", sort=True)["y"].sum(), + ) + assert_eq( + pdf.groupby("x", sort=True).y.sum(), + gdf.groupby("x", sort=True).y.sum(), + ) + assert_eq( + pdf.groupby("x", sort=True)[["y"]].sum(), + gdf.groupby("x", sort=True)[["y"]].sum(), + ) + assert_eq( + pdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), + gdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), ) @@ -244,8 +269,10 @@ def test_groupby_apply(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) - got_grpby = df.groupby(["key1", "key2"]) + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, sort=True + ) + got_grpby = df.groupby(["key1", "key2"], sort=True) def foo(df): df["out"] = df["val1"] + df["val2"] @@ -267,8 +294,10 @@ def test_groupby_apply_grouped(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) - got_grpby = 
df.groupby(["key1", "key2"]) + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, sort=True + ) + got_grpby = df.groupby(["key1", "key2"], sort=True) def foo(key1, val1, com1, com2): for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): @@ -302,11 +331,17 @@ def emulate(df): ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_cudf_2keys_agg(nelem, func): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) + ) # pandas expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) ) check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) @@ -399,8 +434,8 @@ def test_groupby_series_level_zero(agg): def test_groupby_column_name(): pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy") - p = pdf.groupby("yy") + g = gdf.groupby("yy", sort=True) + p = pdf.groupby("yy", sort=True) gxx = g["xx"].sum() pxx = p["xx"].sum() assert_eq(pxx, gxx) @@ -433,16 +468,16 @@ def test_groupby_column_name(): def test_groupby_column_numeral(): pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1) - g = gdf.groupby(1) + p = pdf.groupby(1, sort=True) + g = gdf.groupby(1, sort=True) pxx = p[0].sum() gxx = g[0].sum() assert_eq(pxx, gxx) pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5) - g = gdf.groupby(1.5) + p = pdf.groupby(1.5, sort=True) + g = gdf.groupby(1.5, sort=True) pxx = p[0.5].sum() gxx = g[0.5].sum() assert_eq(pxx, gxx) @@ -455,8 +490,8 @@ def test_groupby_column_numeral(): def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() + pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() + gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() assert_eq(pxx, gxx) @@ -464,8 +499,8 @@ def test_groupby_external_series(series): def test_groupby_external_series_incorrect_length(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() + pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() + gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() assert_eq(pxx, gxx) @@ -476,49 +511,52 @@ def test_groupby_levels(level): idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby(level=level).sum(), gdf.groupby(level=level).sum()) + assert_eq( + pdf.groupby(level=level, sort=True).sum(), + gdf.groupby(level=level, sort=True).sum(), + ) def test_advanced_groupby_levels(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() assert_eq(pdg, gdg) - pdh = pdg.groupby(level=1).sum() - gdh = gdg.groupby(level=1).sum() + pdh = 
pdg.groupby(level=1, sort=True).sum() + gdh = gdg.groupby(level=1, sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y", "z"]).sum() - gdg = gdf.groupby(["x", "y", "z"]).sum() + pdg = pdf.groupby(["x", "y", "z"], sort=True).sum() + gdg = gdf.groupby(["x", "y", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["z"]).sum() - gdg = gdf.groupby(["z"]).sum() + pdg = pdf.groupby(["z"], sort=True).sum() + gdg = gdf.groupby(["z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["y", "z"]).sum() - gdg = gdf.groupby(["y", "z"]).sum() + pdg = pdf.groupby(["y", "z"], sort=True).sum() + gdg = gdf.groupby(["y", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["x", "z"]).sum() - gdg = gdf.groupby(["x", "z"]).sum() + pdg = pdf.groupby(["x", "z"], sort=True).sum() + gdg = gdf.groupby(["x", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["y"]).sum() - gdg = gdf.groupby(["y"]).sum() + pdg = pdf.groupby(["y"], sort=True).sum() + gdg = gdf.groupby(["y"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["x"]).sum() - gdg = gdf.groupby(["x"]).sum() + pdg = pdf.groupby(["x"], sort=True).sum() + gdg = gdf.groupby(["x"], sort=True).sum() assert_eq(pdg, gdg) - pdh = pdg.groupby(level=0).sum() - gdh = gdg.groupby(level=0).sum() + pdh = pdg.groupby(level=0, sort=True).sum() + gdh = gdg.groupby(level=0, sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - pdh = pdg.groupby(level=[0, 1]).sum() - gdh = gdg.groupby(level=[0, 1]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdh = pdg.groupby(level=[0, 1], sort=True).sum() + gdh = gdg.groupby(level=[0, 1], sort=True).sum() assert_eq(pdh, gdh) - pdh = pdg.groupby(level=[1, 0]).sum() - gdh = gdg.groupby(level=[1, 0]).sum() + pdh = pdg.groupby(level=[1, 0], sort=True).sum() + gdh = gdg.groupby(level=[1, 0], sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() assert_exceptions_equal( lfunc=pdg.groupby, @@ -569,19 +607,19 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").sum() - gdg = gdf.groupby("x").sum() + pdg = pdf.groupby("x", sort=True).sum() + gdg = gdf.groupby("x", sort=True).sum() assert_eq(pdg, gdg) def test_list_of_series(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x]).y.sum() - gdg = gdf.groupby([gdf.x]).y.sum() + pdg = pdf.groupby([pdf.x], sort=True).y.sum() + gdg = gdf.groupby([gdf.x], sort=True).y.sum() assert_eq(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() + pdg = pdf.groupby([pdf.x, pdf.y], sort=True).y.sum() + gdg = gdf.groupby([gdf.x, gdf.y], sort=True).y.sum() pytest.skip() assert_eq(pdg, gdg) @@ -602,10 +640,10 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( + gdg = gdf.groupby("a", as_index=True, sort=True).agg( {"b": ["min", "max"], "c": "max"} ) - pdg = pdf.groupby("a", as_index=True).agg( + pdg = pdf.groupby("a", as_index=True, sort=True).agg( {"b": ["min", "max"], "c": "max"} ) assert_eq(gdg, pdg) @@ -617,10 +655,10 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 
12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( + gdg = gdf.groupby("a", as_index=True, sort=True).agg( {"b": "min", "c": ["max", "min"]} ) - pdg = pdf.groupby("a", as_index=True).agg( + pdg = pdf.groupby("a", as_index=True, sort=True).agg( {"b": "min", "c": ["max", "min"]} ) assert_eq(pdg, gdg) @@ -632,8 +670,8 @@ def test_groupby_single_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + gdg = gdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) + pdg = pdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) assert_eq(pdg, gdg) @@ -643,8 +681,12 @@ def test_groupby_double_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + gdg = gdf.groupby(["a", "b"], as_index=True, sort=True).agg( + {"c": ["min", "max"]} + ) + pdg = pdf.groupby(["a", "b"], as_index=True, sort=True).agg( + {"c": ["min", "max"]} + ) assert_eq(pdg, gdg) @@ -655,8 +697,8 @@ def test_groupby_apply_basic_agg_single_column(): gdf["mult"] = gdf["key"] * gdf["val"] pdf = gdf.to_pandas() - gdg = gdf.groupby(["key", "val"]).mult.sum() - pdg = pdf.groupby(["key", "val"]).mult.sum() + gdg = gdf.groupby(["key", "val"], sort=True).mult.sum() + pdg = pdf.groupby(["key", "val"], sort=True).mult.sum() assert_eq(pdg, gdg) @@ -668,8 +710,8 @@ def test_groupby_multi_agg_single_groupby_series(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").y.agg(["sum", "max"]) - gdg = gdf.groupby("x").y.agg(["sum", "max"]) + pdg = pdf.groupby("x", sort=True).y.agg(["sum", "max"]) + gdg = gdf.groupby("x", sort=True).y.agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -684,8 +726,8 @@ def test_groupby_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -703,8 +745,8 @@ def test_groupby_datetime_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -730,8 +772,8 @@ def test_groupby_multi_agg_hash_groupby(agg): ).reset_index(drop=True) pdf = gdf.to_pandas() check_dtype = False if "count" in agg else True - pdg = pdf.groupby("id").agg(agg) - gdg = gdf.groupby("id").agg(agg) + pdg = pdf.groupby("id", sort=True).agg(agg) + gdg = gdf.groupby("id", sort=True).agg(agg) assert_eq(pdg, gdg, check_dtype=check_dtype) @@ -744,8 +786,8 @@ def test_groupby_nulls_basic(agg): pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) gdf = cudf.from_pandas(pdf) assert_eq( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), + getattr(pdf.groupby("a", sort=True), agg)(), + getattr(gdf.groupby("a", sort=True), agg)(), check_dtype=check_dtype, ) @@ -758,8 +800,8 @@ def test_groupby_nulls_basic(agg): ) gdf = cudf.from_pandas(pdf) assert_eq( - getattr(pdf.groupby("a"), agg)(), 
- getattr(gdf.groupby("a"), agg)(), + getattr(pdf.groupby("a", sort=True), agg)(), + getattr(gdf.groupby("a", sort=True), agg)(), check_dtype=check_dtype, ) @@ -775,8 +817,8 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? assert_eq( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0), + getattr(pdf.groupby("a", sort=True), agg)().fillna(0), + getattr(gdf.groupby("a", sort=True), agg)().fillna(0), check_dtype=check_dtype, ) @@ -805,13 +847,15 @@ def test_groupby_all_nulls_index(): assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) -def test_groupby_sort(): +@pytest.mark.parametrize("sort", [True, False]) +def test_groupby_sort(sort): pdf = pd.DataFrame({"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a", sort=False).sum().sort_index(), - gdf.groupby("a", sort=False).sum().sort_index(), + pdf.groupby("a", sort=sort).sum(), + gdf.groupby("a", sort=sort).sum(), + check_like=not sort, ) pdf = pd.DataFrame( @@ -820,8 +864,30 @@ def test_groupby_sort(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby(["c", "b"], sort=False).sum().sort_index(), - gdf.groupby(["c", "b"], sort=False).sum().to_pandas().sort_index(), + pdf.groupby(["c", "b"], sort=sort).sum(), + gdf.groupby(["c", "b"], sort=sort).sum(), + check_like=not sort, + ) + + ps = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=[2, 2, 2, 3, 3, 1, 1, 1]) + gs = cudf.from_pandas(ps) + + assert_eq( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, + ) + + ps = pd.Series( + [1, 2, 3, 4, 5, 6, 7, 8], + index=pd.MultiIndex.from_product([(1, 2), ("a", "b"), (42, 84)]), + ) + gs = cudf.from_pandas(ps) + + assert_eq( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, ) @@ -831,7 +897,9 @@ def test_groupby_cat(): ) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").count(), gdf.groupby("a").count(), check_dtype=False + pdf.groupby("a", sort=True).count(), + gdf.groupby("a", sort=True).count(), + check_dtype=False, ) @@ -883,8 +951,8 @@ def test_groupby_std(): } pdf = pd.DataFrame(raw_data) gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") + pdg = pdf.groupby("x", sort=True) + gdg = gdf.groupby("x", sort=True) pdresult = pdg.std() gdresult = gdg.std() @@ -906,18 +974,22 @@ def test_groupby_size(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").size(), gdf.groupby("a").size(), check_dtype=False + pdf.groupby("a", sort=True).size(), + gdf.groupby("a", sort=True).size(), + check_dtype=False, ) assert_eq( - pdf.groupby(["a", "b", "c"]).size(), - gdf.groupby(["a", "b", "c"]).size(), + pdf.groupby(["a", "b", "c"], sort=True).size(), + gdf.groupby(["a", "b", "c"], sort=True).size(), check_dtype=False, ) sr = pd.Series(range(len(pdf))) assert_eq( - pdf.groupby(sr).size(), gdf.groupby(sr).size(), check_dtype=False + pdf.groupby(sr, sort=True).size(), + gdf.groupby(sr, sort=True).size(), + check_dtype=False, ) @@ -932,8 +1004,8 @@ def test_groupby_datetime(nelem, as_index, agg): check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index) - gdg = gdf.groupby("datetime", as_index=as_index) + pdg = 
pdf.groupby("datetime", as_index=as_index, sort=True) + gdg = gdf.groupby("datetime", as_index=as_index, sort=True) if as_index is False: pdres = getattr(pdg, agg)() gdres = getattr(gdg, agg)() @@ -948,7 +1020,7 @@ def test_groupby_dropna(): expect = cudf.DataFrame( {"b": [3, 3]}, index=cudf.Series([1, None], name="a") ) - got = df.groupby("a", dropna=False).sum() + got = df.groupby("a", dropna=False, sort=True).sum() assert_eq(expect, got) df = cudf.DataFrame( @@ -959,7 +1031,7 @@ def test_groupby_dropna(): names=["a", "b"], ) expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False).sum() + got = df.groupby(["a", "b"], dropna=False, sort=True).sum() assert_eq(expect, got) @@ -968,7 +1040,7 @@ def test_groupby_dropna_getattr(): df = cudf.DataFrame() df["id"] = [0, 1, 1, None, None, 3, 3] df["val"] = [0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False).val.sum() + got = df.groupby("id", dropna=False, sort=True).val.sum() expect = cudf.Series( [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") @@ -984,7 +1056,7 @@ def test_groupby_categorical_from_string(): gdf["id"] = gdf["id"].astype("category") assert_eq( cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id").sum(), + gdf.groupby("id", sort=True).sum(), ) @@ -1069,8 +1141,8 @@ def test_groupby_count(agg, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) + expect = pdf.groupby(by, sort=True).agg(agg) + got = gdf.groupby(by, sort=True).agg(agg) assert_eq(expect, got, check_dtype=False) @@ -1120,8 +1192,8 @@ def test_groupby_nth(n, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by).nth(n) - got = gdf.groupby(by).nth(n) + expect = pdf.groupby(by, sort=True).nth(n) + got = gdf.groupby(by, sort=True).nth(n) assert_eq(expect, got, check_dtype=False) @@ -1168,7 +1240,9 @@ def test_groupby_agg_combinations(agg): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").agg(agg), gdf.groupby("a").agg(agg), check_dtype=False + pdf.groupby("a", sort=True).agg(agg), + gdf.groupby("a", sort=True).agg(agg), + check_dtype=False, ) @@ -1193,8 +1267,8 @@ def test_reset_index_after_empty_groupby(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").sum().reset_index(), - gdf.groupby("a").sum().reset_index(), + pdf.groupby("a", sort=True).sum().reset_index(), + gdf.groupby("a", sort=True).sum().reset_index(), ) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b0f1cfed2c0..b8e157b12ae 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -460,19 +460,19 @@ def test_multiindex_multiple_groupby(): } ) gdf = cudf.DataFrame.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).sum() - gdg = gdf.groupby(["a", "b"]).sum() + pdg = pdf.groupby(["a", "b"], sort=True).sum() + gdg = gdf.groupby(["a", "b"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["a", "b"]).x.sum() - gdg = gdf.groupby(["a", "b"]).x.sum() + pdg = pdf.groupby(["a", "b"], sort=True).x.sum() + gdg = gdf.groupby(["a", "b"], sort=True).x.sum() assert_eq(pdg, gdg) @pytest.mark.parametrize( "func", [ - lambda df: df.groupby(["x", "y"]).z.sum(), - lambda df: df.groupby(["x", "y"]).sum(), + lambda df: df.groupby(["x", "y"], sort=True).z.sum(), + lambda df: df.groupby(["x", "y"], sort=True).sum(), ], ) def test_multi_column(func): @@ -498,7 +498,7 @@ def test_multiindex_equality(): gdf = cudf.DataFrame( {"x": [1, 5, 3, 4, 1], 
"y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) - mi1 = gdf.groupby(["x", "y"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index mi2 = cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], @@ -507,7 +507,7 @@ def test_multiindex_equality(): assert_eq(mi1, mi2) # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"]).max().index + mi2 = gdf.groupby(["x", "y"], sort=True).max().index assert_eq(mi1, mi2) # mi made manually twice are they equal? @@ -549,7 +549,7 @@ def test_multiindex_equals(): gdf = cudf.DataFrame( {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) - mi1 = gdf.groupby(["x", "y"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index mi2 = cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], @@ -558,7 +558,7 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), True) # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"]).max().index + mi2 = gdf.groupby(["x", "y"], sort=True).max().index assert_eq(mi1.equals(mi2), True) # mi made manually twice are they equal? @@ -575,8 +575,8 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), True) # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"]).mean().index - mi2 = gdf.groupby(["x", "z"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index + mi2 = gdf.groupby(["x", "z"], sort=True).mean().index assert_eq(mi1.equals(mi2), False) # mi made from different manuals are they not equal? @@ -647,8 +647,8 @@ def test_multiindex_copy_sem(data, levels, codes, names): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - gdf = gdf.groupby(["Date", "Symbol"]).mean() - pdf = pdf.groupby(["Date", "Symbol"]).mean() + gdf = gdf.groupby(["Date", "Symbol"], sort=True).mean() + pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() gmi = gdf.index gmi_copy = gmi.copy(levels=levels, codes=codes, names=names) @@ -882,8 +882,8 @@ def test_multiindex_groupby_to_frame(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).count() - pdg = pdf.groupby(["x", "y"]).count() + gdg = gdf.groupby(["x", "y"], sort=True).count() + pdg = pdf.groupby(["x", "y"], sort=True).count() assert_eq(pdg.index.to_frame(), gdg.index.to_frame()) @@ -899,22 +899,22 @@ def test_multiindex_groupby_reset_index(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).sum() - pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() assert_eq(pdg.reset_index(), gdg.reset_index()) def test_multicolumn_reset_index(): gdf = cudf.DataFrame({"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5]}) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"]).agg({"y": ["count"]}) - pdg = pdf.groupby(["x"]).agg({"y": ["count"]}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count"]}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"]).agg({"y": "count"}) - pdg = 
pdf.groupby(["x"]).agg({"y": "count"}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": "count"}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": "count"}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) @@ -923,11 +923,11 @@ def test_multiindex_multicolumn_reset_index(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [1, 2, 3, 4, 5]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "y"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x", "z"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "z"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index d590a3ddb52..85e61acd8e6 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,8 +1,7 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. -import os import datetime -import math +import os from io import BytesIO import numpy as np @@ -12,9 +11,8 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, supported_numpy_dtypes, gen_rand_series - from cudf.io.orc import ORCWriter +from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes @pytest.fixture(scope="module") @@ -565,7 +563,7 @@ def normalized_equals(value1, value2): # Compare integers with floats now if isinstance(value1, float) or isinstance(value2, float): - return math.isclose(value1, value2) + return np.isclose(value1, value2) return value1 == value2 diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 55cff5ae6dd..8c09dc91253 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -244,8 +244,8 @@ def test_groupby_MI(nrows, ncols): {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"]).count() - pdg = pdf.groupby(["a", "b"]).count() + gdg = gdf.groupby(["a", "b"], sort=True).count() + pdg = pdf.groupby(["a", "b"], sort=True).count() pd.options.display.max_rows = nrows pd.options.display.max_columns = ncols assert gdg.__repr__() == pdg.__repr__() diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index f8e7fc5b4f3..1ae5bab0da4 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -321,6 +321,18 @@ def test_rolling_groupby_simple(agg): got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 1, 2, 2, 3], "c": [1, 2, 3, 4, 5]} + ) + gdf = cudf.from_pandas(pdf) + + for window_size in range(1, len(pdf) + 1): + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( + -1 + ) + got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) + assert_eq(expect, got, check_dtype=False) + @pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"]) def test_rolling_groupby_multi(agg): @@ -335,10 +347,10 @@ def test_rolling_groupby_multi(agg): for window_size in range(1, 
len(pdf) + 1): expect = getattr( - pdf.groupby(["a", "b"]).rolling(window_size), agg + pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg )().fillna(-1) got = getattr( - gdf.groupby(["a", "b"]).rolling(window_size), agg + gdf.groupby(["a", "b"], sort=True).rolling(window_size), agg )().fillna(-1) assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index f4d04f84097..656b66bf793 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -146,11 +146,11 @@ def test_serialize_groupby_df(): df["key_1"] = np.random.randint(0, 20, 100) df["key_2"] = np.random.randint(0, 20, 100) df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(["key_1", "key_2"]) + gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_external(): @@ -160,7 +160,7 @@ def test_serialize_groupby_external(): outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_level(): @@ -171,7 +171,7 @@ def test_serialize_groupby_level(): expect = gb.mean() outgb = gb.deserialize(*gb.serialize()) got = outgb.mean() - assert_eq(expect, got) + assert_eq(expect.sort_index(), got.sort_index()) def test_serialize_groupby_sr(): @@ -180,7 +180,7 @@ def test_serialize_groupby_sr(): outgb = gb.deserialize(*gb.serialize()) got = gb.mean() expect = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_datetime(): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 980dcb5a13b..a19b88caf4c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -913,3 +913,24 @@ def custom_add_func(sr, val): lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), ) + + +@pytest.mark.parametrize( + "data", + [ + [1, None, 11, 2.0, np.nan], + [np.nan], + [None, None, None], + [np.nan, 1, 10, 393.32, np.nan], + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +@pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) +def test_fillna_with_nan(data, nan_as_null, fill_value): + gs = cudf.Series(data, nan_as_null=nan_as_null) + ps = gs.to_pandas() + + expected = ps.fillna(fill_value) + actual = gs.fillna(fill_value) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 974892cb8e7..080420c8f75 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1117,8 +1117,8 @@ def test_string_groupby_key_index(): pdf["b"] = other_data gdf["b"] = other_data - expect = pdf.groupby("a").count() - got = gdf.groupby("a").count() + expect = pdf.groupby("a", sort=True).count() + got = gdf.groupby("a", sort=True).count() assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index cc580bedc08..1e8beb18234 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,5 +1,7 @@ # Copyright (c) 2018, NVIDIA CORPORATION. 
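# A minimal illustrative sketch (not from the patch; `_example_cache` is a
# hypothetical stand-in for the module-level `_cache` annotated in the hunk
# below): mypy 0.782 reads a PEP 484 `# type:` comment as if it were a
# variable annotation, so the object stays a plain dict at runtime while the
# checker sees Dict[Any, Any].
from typing import Any, Dict

_example_cache = dict()  # type: Dict[Any, Any]
# Under PEP 526 syntax (Python 3.6+) the equivalent spelling would be:
#     _example_cache: Dict[Any, Any] = {}
# The comment form checks identically without changing any runtime code.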
+ import functools +from typing import Any, Dict from numba import cuda @@ -332,7 +334,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache = dict() # WeakKeyDictionary() +_cache = dict() # type: Dict[Any, Any] @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 3c15e1b2ad5..d49b4abd399 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import datetime as dt import numbers @@ -234,13 +234,23 @@ def is_struct_dtype(obj): return ( type(obj) is cudf.core.dtypes.StructDtype or obj is cudf.core.dtypes.StructDtype - # or type(obj) is cudf.core.column.StructColumn - # or obj is cudf.core.column.StructColumn or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) or (hasattr(obj, "dtype") and is_struct_dtype(obj.dtype)) ) +def is_decimal_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal64Dtype + or obj is cudf.core.dtypes.Decimal64Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal_dtype(obj.dtype)) + ) + + def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 82a51b3f9b4..c71a6dbccb1 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -2,6 +2,7 @@ import ast import datetime as dt +from typing import Any, Dict import numpy as np import six @@ -101,7 +102,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache = {} +_cache = {} # type: Dict[Any, Any] def query_compile(expr): diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg index 0b2711155d7..3067d2daafd 100644 --- a/python/cudf/setup.cfg +++ b/python/cudf/setup.cfg @@ -46,6 +46,21 @@ skip= dist __init__.py +[mypy] +ignore_missing_imports = True + +[mypy-cudf._lib.*] +ignore_errors = True + +[mypy-cudf._version] +ignore_errors = True + +[mypy-cudf.utils.metadata.orc_column_statistics_pb2] +ignore_errors = True + +[mypy-cudf.tests.*] +ignore_errors = True + [tool:pytest] addopts = --benchmark-warmup=off @@ -60,4 +75,3 @@ python_files = python_functions = bench_* test_* - diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index f0cf5bddca0..0ba35460835 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -424,6 +424,11 @@ def var( result.divisions = (min(self.columns), max(self.columns)) return handle_out(out, result) + def groupby(self, *args, **kwargs): + from .groupby import CudfSeriesGroupBy + + return CudfSeriesGroupBy(self, *args, **kwargs) + class Index(Series, dd.core.Index): _partition_type = cudf.Index diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 494da4927d2..2803212a502 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -13,7 +13,7 @@ new_dd_object, split_out_on_cols, ) -from dask.dataframe.groupby import DataFrameGroupBy +from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy from dask.highlevelgraph import HighLevelGraph @@ -23,6 +23,40 @@ def __init__(self, *args, **kwargs): self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + def 
__getitem__(self, key): + if isinstance(key, list): + g = CudfDataFrameGroupBy( + self.obj, + by=self.index, + slice=key, + sort=self.sort, + **self.dropna, + ) + else: + g = CudfSeriesGroupBy( + self.obj, + by=self.index, + slice=key, + sort=self.sort, + **self.dropna, + ) + + g._meta = g._meta[key] + return g + + def mean(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {c: "mean" for c in self.obj.columns if c not in self.index}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + ) + def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -50,6 +84,52 @@ def aggregate(self, arg, split_every=None, split_out=1): ) +class CudfSeriesGroupBy(SeriesGroupBy): + def __init__(self, *args, **kwargs): + self.sep = kwargs.pop("sep", "___") + self.as_index = kwargs.pop("as_index", True) + super().__init__(*args, **kwargs) + + def mean(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {self._slice: "mean"}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + )[self._slice] + + def aggregate(self, arg, split_every=None, split_out=1): + if arg == "size": + return self.size() + + _supported = {"count", "mean", "std", "var", "sum", "min", "max"} + if ( + isinstance(self.obj, DaskDataFrame) + and isinstance(self.index, (str, list)) + and _is_supported({self._slice: arg}, _supported) + ): + return groupby_agg( + self.obj, + self.index, + {self._slice: arg}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + ) + + return super().aggregate( + arg, split_every=split_every, split_out=split_out + ) + + def groupby_agg( ddf, gb_cols,