diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 08cc6ec4a14..0ac54113278 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.971' + rev: 'v1.3.0' hooks: - id: mypy additional_dependencies: [types-cachetools] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 5b8a0ea53ba..4031f1aa1c3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,8 +31,8 @@ dependencies: - fmt>=9.1.0,<10 - fsspec>=0.6.0 - gcc_linux-64=11.* -- gmock>=1.13.0.* -- gtest>=1.13.0.* +- gmock>=1.13.0 +- gtest>=1.13.0 - hypothesis - ipython - libarrow==11.0.0.* diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index a236b62c5fd..275b8f9332f 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -396,6 +396,8 @@ outputs: - {{ pin_subpackage('libcudf', exact=True) }} - {{ pin_subpackage('libcudf_kafka', exact=True) }} - cudatoolkit {{ cuda_spec }} + - gtest {{ gtest_version }} + - gmock {{ gtest_version }} - libcurand {{ libcurand_run_version }} about: home: https://rapids.ai/ diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 565a396d913..c3db3370c62 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -153,8 +153,8 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp - stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp + stream_compaction/unique.cpp stream_compaction/unique_count.cpp ) # ################################################################################################## @@ -195,8 +195,7 @@ ConfigureBench( reduction/reduce.cpp reduction/scan.cpp ) ConfigureNVBench( - REDUCTION_NVBENCH reduction/distinct_count.cpp reduction/rank.cpp reduction/scan_structs.cpp - reduction/segmented_reduce.cpp + REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp ) # ################################################################################################## @@ -280,7 +279,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp) # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH - string/case.cpp string/combine.cpp string/contains.cpp string/convert_datetime.cpp @@ -301,7 +299,10 @@ ConfigureBench( string/url_decode.cu ) -ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp) +ConfigureNVBench( + STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp + string/reverse.cpp +) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/reduction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp similarity index 75% rename from cpp/benchmarks/reduction/distinct_count.cpp rename to cpp/benchmarks/stream_compaction/distinct_count.cpp index d2218c270a8..2b2c901b90f 100644 --- a/cpp/benchmarks/reduction/distinct_count.cpp +++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp @@ -15,14 +15,14 @@ */ #include -#include +#include -#include +#include #include template -static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_list) +static void bench_distinct_count(nvbench::state& state, nvbench::type_list) { auto const dtype = cudf::type_to_id(); auto const size = static_cast(state.get_int64("num_rows")); @@ -40,16 +40,19 @@ static void bench_reduction_distinct_count(nvbench::state& state, nvbench::type_ auto const& data_column = data_table->get_column(0); auto const input_table = cudf::table_view{{data_column, data_column, data_column}}; + auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream{launch.get_stream()}; - cudf::detail::distinct_count(input_table, cudf::null_equality::EQUAL, stream); + cudf::distinct_count(input_table, cudf::null_equality::EQUAL); }); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } using data_type = nvbench::type_list; -NVBENCH_BENCH_TYPES(bench_reduction_distinct_count, NVBENCH_TYPE_AXES(data_type)) - .set_name("reduction_distinct_count") +NVBENCH_BENCH_TYPES(bench_distinct_count, NVBENCH_TYPE_AXES(data_type)) + .set_name("distinct_count") .add_int64_axis("num_rows", { 10000, // 10k diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 6152ea741a3..0cdd5fbac32 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -15,36 +15,64 @@ */ #include -#include -#include +#include #include #include #include -class StringCase : public cudf::benchmark {}; +#include -static void BM_case(benchmark::State& state) +void bench_case(nvbench::state& state) { - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}); - cudf::strings_column_view input(column->view()); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("width")); + auto const encoding = state.get_string("encoding"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::to_lower(input); + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + + auto col_view = column->view(); + + cudf::column::contents ascii_contents; + if (encoding == "ascii") { + data_profile ascii_profile = data_profile_builder().no_validity().distribution( + cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range + auto input = cudf::strings_column_view(col_view); + auto ascii_column = + create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile); + auto ascii_data = ascii_column->view(); -#define SORT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCase, name) \ - (::benchmark::State & st) { BM_case(st); } \ - BENCHMARK_REGISTER_F(StringCase, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 12, 1 << 24}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + col_view = cudf::column_view(col_view.type(), + col_view.size(), + nullptr, + col_view.null_mask(), + col_view.null_count(), + 0, + {input.offsets(), ascii_data}); + + ascii_contents = ascii_column->release(); + } + auto input = cudf::strings_column_view(col_view); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + state.add_element_count(input.chars_size(), "chars_size"); + state.add_global_memory_reads(input.chars_size()); + state.add_global_memory_writes(input.chars_size()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); }); +} -SORT_BENCHMARK_DEFINE(to_lower) +NVBENCH_BENCH(bench_case) + .set_name("strings_case") + .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp new file mode 100644 index 00000000000..8e9e595fcef --- /dev/null +++ b/cpp/benchmarks/string/char_types.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_char_types(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const api_type = state.get_string("api"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto input_types = cudf::strings::string_character_types::SPACE; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + if (api_type == "all") { + state.add_global_memory_writes(num_rows); // output is a bool8 per row + } else { + state.add_global_memory_writes(chars_size); + } + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api_type == "all") { + auto result = cudf::strings::all_characters_of_type(input, input_types); + } else { + auto result = cudf::strings::filter_characters_of_type(input, input_types); + } + }); +} + +NVBENCH_BENCH(bench_char_types) + .set_name("char_types") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..d5df222ae37 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -12,6 +12,11 @@ "file" : "nvbench/use_existing_fmt.diff", "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", "fixed_in" : "" + }, + { + "file" : "nvbench/public_fmt_dep_in_conda.diff", + "issue" : "Propagate fmt requirement in conda envs [https://github.com/NVIDIA/nvbench/pull/127]", + "fixed_in" : "" } ] } diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 921ef5f65f1..cdb8a1c8a09 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -228,6 +228,9 @@ std::unique_ptr empty_like(scalar const& input); * * Supports only fixed-width types. * + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * * @param[in] input Immutable view of input column to emulate * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN * @param[in] mr Device memory resource used to allocate the returned column's device memory @@ -244,6 +247,9 @@ std::unique_ptr allocate_like( * * Supports only fixed-width types. * + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * * @param[in] input Immutable view of input column to emulate * @param[in] size The desired number of elements that the new column should have capacity for * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index 4c6d19739cf..d9a35470b7d 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -67,7 +67,7 @@ class metadata_builder { * @brief Destructor that will be implemented as default, required because metadata_builder_impl * is incomplete at this stage. */ - ~metadata_builder() = default; + ~metadata_builder(); /** * @brief Add a column to this metadata builder. @@ -105,9 +105,23 @@ class metadata_builder { */ std::vector build() const; + /** + * @brief Clear the internal buffer containing all added metadata. + */ + void clear(); + private: std::unique_ptr impl; }; +/** + * @copydoc pack_metadata + * @param builder The reusable builder object to create packed column metadata. + */ +std::vector pack_metadata(table_view const& table, + uint8_t const* contiguous_buffer, + size_t buffer_size, + metadata_builder& builder); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 360006c1eea..e8bc97e95b3 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -36,6 +36,8 @@ namespace cudf { * @brief Returns the null count for a null mask of the specified `state` * representing `size` elements. * + * @throw std::invalid_argument if state is UNINITIALIZED + * * @param state The state of the null mask * @param size The number of elements represented by the mask * @return The count of null elements diff --git a/cpp/include/cudf_test/iterator_utilities.hpp b/cpp/include/cudf_test/iterator_utilities.hpp index c2c6b3ae83d..10f6e77d889 100644 --- a/cpp/include/cudf_test/iterator_utilities.hpp +++ b/cpp/include/cudf_test/iterator_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,6 +121,9 @@ template * The returned iterator yields `false` (to mark `null`) at the indices corresponding to the * pointers having `nullptr` values and `true` for the remaining indices. * + * @note The input vector is referenced by the transform iterator, so the + * lifespan must be just as long as the iterator. + * * @tparam T the data type * @param ptrs The data pointers for which the validity iterator is computed * @return auto Validity iterator @@ -128,8 +131,7 @@ template template [[maybe_unused]] static auto nulls_from_nullptrs(std::vector const& ptrs) { - // The vector `indices` is copied into the lambda as it can be destroyed at the caller site. - return thrust::make_transform_iterator(ptrs.begin(), [ptrs](auto ptr) { return ptr != nullptr; }); + return thrust::make_transform_iterator(ptrs.begin(), [](auto ptr) { return ptr != nullptr; }); } } // namespace iterators diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 6ea4311c8cb..2404cf0d134 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -84,11 +84,13 @@ std::enable_if_t() && !cudf::is_timestamp_t> make_type_param_vector(std::initializer_list const& init_list) { - thrust::host_vector vec(init_list.size()); - std::transform(std::cbegin(init_list), std::cend(init_list), std::begin(vec), [](auto const& e) { - if constexpr (std::is_unsigned_v) { return static_cast(std::abs(e)); } - return static_cast(e); - }); + std::vector input{init_list}; + std::vector vec(init_list.size()); + std::transform( + std::cbegin(input), std::cend(input), std::begin(vec), [](auto const& e) -> TypeParam { + if constexpr (std::is_unsigned_v) { return static_cast(std::abs(e)); } + return static_cast(e); + }); return vec; } diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index daec9b5b199..8482769b569 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -217,6 +217,8 @@ std::unique_ptr binary_operation(LhsType const& lhs, auto out_view = out->mutable_view(); cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream); + // TODO: consider having the binary_operation count nulls instead + out->set_null_count(cudf::detail::null_count(out_view.null_mask(), 0, out->size(), stream)); return out; } } // namespace compiled @@ -373,6 +375,7 @@ std::unique_ptr binary_operation(column_view const& lhs, auto out_view = out->mutable_view(); binops::jit::binary_operation(out_view, lhs, rhs, ptx, stream); + out->set_null_count(cudf::detail::null_count(out_view.null_mask(), 0, out->size(), stream)); return out; } } // namespace detail diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index b98a2196748..4c22988900b 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,10 +50,9 @@ size_type state_null_count(mask_state state, size_type size) { switch (state) { case mask_state::UNALLOCATED: return 0; - case mask_state::UNINITIALIZED: return UNKNOWN_NULL_COUNT; case mask_state::ALL_NULL: return size; case mask_state::ALL_VALID: return 0; - default: CUDF_FAIL("Invalid null mask state."); + default: CUDF_FAIL("Invalid null mask state.", std::invalid_argument); } } diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 5f455e26e52..e147b12ad99 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,12 +81,13 @@ std::unique_ptr make_numeric_column(data_type type, CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - return std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - detail::create_null_mask(size, state, stream, mr), - state_null_count(state, size), - std::vector>{}); + return std::make_unique( + type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::create_null_mask(size, state, stream, mr), + state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size), + std::vector>{}); } // Allocate storage for a specified number of numeric elements @@ -100,12 +101,13 @@ std::unique_ptr make_fixed_point_column(data_type type, CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - return std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - detail::create_null_mask(size, state, stream, mr), - state_null_count(state, size), - std::vector>{}); + return std::make_unique( + type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::create_null_mask(size, state, stream, mr), + state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size), + std::vector>{}); } // Allocate storage for a specified number of timestamp elements @@ -119,12 +121,13 @@ std::unique_ptr make_timestamp_column(data_type type, CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - return std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - detail::create_null_mask(size, state, stream, mr), - state_null_count(state, size), - std::vector>{}); + return std::make_unique( + type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::create_null_mask(size, state, stream, mr), + state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size), + std::vector>{}); } // Allocate storage for a specified number of duration elements @@ -138,12 +141,13 @@ std::unique_ptr make_duration_column(data_type type, CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - return std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - detail::create_null_mask(size, state, stream, mr), - state_null_count(state, size), - std::vector>{}); + return std::make_unique( + type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::create_null_mask(size, state, stream, mr), + state == mask_state::UNINITIALIZED ? 0 : state_null_count(state, size), + std::vector>{}); } // Allocate storage for a specified number of fixed width elements diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4c3b4eddb8d..e7ac424001c 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1251,6 +1251,8 @@ std::vector contiguous_split(cudf::table_view const& input, std::vector cols; cols.reserve(num_root_columns); auto cur_dst_buf_info = h_dst_buf_info; + cudf::detail::metadata_builder meta_builder(num_root_columns); + for (std::size_t idx = 0; idx < num_partitions; idx++) { // traverse the buffers and build the columns. cur_dst_buf_info = build_output_columns( @@ -1258,14 +1260,18 @@ std::vector contiguous_split(cudf::table_view const& input, // pack the columns cudf::table_view t{cols}; - result.push_back(packed_table{ - t, - packed_columns{ - std::make_unique>(cudf::pack_metadata( - t, reinterpret_cast(out_buffers[idx].data()), out_buffers[idx].size())), - std::make_unique(std::move(out_buffers[idx]))}}); - cols.clear(); + + cudf::packed_columns packed_cols{ + std::make_unique>( + cudf::detail::pack_metadata(t, + reinterpret_cast(out_buffers[idx].data()), + out_buffers[idx].size(), + meta_builder)), + std::make_unique(std::move(out_buffers[idx]))}; + meta_builder.clear(); + + result.emplace_back(packed_table{std::move(t), std::move(packed_cols)}); } return result; } diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 2bcfeae20c4..9d4b02ffb4f 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -129,7 +129,7 @@ std::unique_ptr allocate_like(column_view const& input, size, rmm::device_buffer(size * size_of(input.type()), stream, mr), detail::create_null_mask(size, allocate_mask, stream, mr), - state_null_count(allocate_mask, input.size())); + 0); } } // namespace detail diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 9a506c866bc..64599716765 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -108,6 +108,7 @@ struct out_of_place_copy_range_dispatch { if (source_end != source_begin) { // otherwise no-op auto ret_view = p_ret->mutable_view(); in_place_copy_range(source, ret_view, source_begin, source_end, target_begin, stream); + p_ret->set_null_count(ret_view.null_count()); } return p_ret; diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index bac9aac1886..e4de4a43b68 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -35,6 +35,8 @@ namespace { * and unpack. */ struct serialized_column { + serialized_column() = default; + serialized_column(data_type _type, size_type _size, size_type _null_count, @@ -150,24 +152,22 @@ packed_columns pack(cudf::table_view const& input, return contig_split_result.empty() ? packed_columns{} : std::move(contig_split_result[0].data); } -template -std::vector pack_metadata(ColumnIter begin, - ColumnIter end, +std::vector pack_metadata(table_view const& table, uint8_t const* contiguous_buffer, - size_t buffer_size) + size_t buffer_size, + metadata_builder& builder) { - auto mb = metadata_builder(std::distance(begin, end)); - - std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) { - build_column_metadata(mb, col, contiguous_buffer, buffer_size); - }); + std::for_each( + table.begin(), table.end(), [&builder, contiguous_buffer, buffer_size](column_view const& col) { + build_column_metadata(builder, col, contiguous_buffer, buffer_size); + }); - return mb.build(); + return builder.build(); } class metadata_builder_impl { public: - metadata_builder_impl() = default; + metadata_builder_impl(size_type const num_root_columns) { metadata.reserve(num_root_columns); } void add_column_info_to_meta(data_type const col_type, size_type const col_size, @@ -182,14 +182,16 @@ class metadata_builder_impl { std::vector build() const { - // convert to anonymous bytes - std::vector metadata_bytes; - auto const metadata_begin = reinterpret_cast(metadata.data()); - std::copy(metadata_begin, - metadata_begin + (metadata.size() * sizeof(detail::serialized_column)), - std::back_inserter(metadata_bytes)); - - return metadata_bytes; + auto output = std::vector(metadata.size() * sizeof(detail::serialized_column)); + std::memcpy(output.data(), metadata.data(), output.size()); + return output; + } + + void clear() + { + // Clear all, except the first metadata entry storing the number of top level columns that + // was added upon object construction. + metadata.resize(1); } private: @@ -228,13 +230,16 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) } metadata_builder::metadata_builder(size_type const num_root_columns) - : impl(std::make_unique()) + : impl(std::make_unique(num_root_columns + + 1 /*one more extra metadata entry as below*/)) { // first metadata entry is a stub indicating how many total (top level) columns // there are impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } +metadata_builder::~metadata_builder() = default; + void metadata_builder::add_column_info_to_meta(data_type const col_type, size_type const col_size, size_type const col_null_count, @@ -248,6 +253,8 @@ void metadata_builder::add_column_info_to_meta(data_type const col_type, std::vector metadata_builder::build() const { return impl->build(); } +void metadata_builder::clear() { return impl->clear(); } + } // namespace detail /** @@ -267,9 +274,10 @@ std::vector pack_metadata(table_view const& table, size_t buffer_size) { CUDF_FUNC_RANGE(); - return table.is_empty() - ? std::vector{} - : detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); + if (table.is_empty()) { return std::vector{}; } + + auto builder = cudf::detail::metadata_builder(table.num_columns()); + return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder); } /** diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index a747cc195ae..342392c773e 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -122,6 +122,7 @@ struct out_of_place_fill_range_dispatch { auto ret_view = p_ret->mutable_view(); using DeviceType = cudf::device_storage_type_t; in_place_fill(ret_view, begin, end, value, stream); + p_ret->set_null_count(ret_view.null_count()); } return p_ret; diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 6312fbf93a9..3c8524588af 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -27,8 +27,9 @@ template <> uint64_t container::get_encoded() { uint64_t val = 0; - for (uint64_t len = 0; len < 64; len += 7) { - auto const byte = get_raw(); + for (auto len = 0; len < 64; len += 7) { + // 64-bit int since shift left is upto 64. + uint64_t const byte = get_raw(); val |= (byte & 0x7f) << len; if (byte < 0x80) break; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 65c93105304..0fdcd33ada4 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -813,10 +813,10 @@ std::pair, std::vector> device_json_co // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), {{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema - return {std::move(col), {}}; + return {std::move(col), std::vector{}}; } case json_col_t::StructColumn: { std::vector> child_columns; @@ -860,7 +860,7 @@ std::pair, std::vector> device_json_co data_type{type_id::INT8}, 0, rmm::device_buffer{0, stream, mr}), - {}} + std::vector{}} : device_json_column_to_cudf_column( json_col.child_columns.begin()->second, d_input, diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c29bf2f8866..d8ca0411910 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1678,11 +1678,11 @@ std::pair, std::vector> json_column_to // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), {{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { - return {std::move(col), {}}; + return {std::move(col), std::vector{}}; } break; } @@ -1724,7 +1724,8 @@ std::pair, std::vector> json_column_to auto [child_column, names] = json_col.child_columns.empty() ? std::pair, - std::vector>{std::make_unique(), {}} + std::vector>{std::make_unique(), + std::vector{}} : json_column_to_cudf_column(json_col.child_columns.begin()->second, d_input, options, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index fbe44eff5ad..1561737da48 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -102,27 +102,18 @@ constexpr type_id to_type_id(const orc::SchemaType& schema, return type_id::EMPTY; } -constexpr std::pair get_index_type_and_pos( - const orc::StreamKind kind, uint32_t skip_count, bool non_child) +gpu::StreamIndexType get_stream_index_type(orc::StreamKind kind) { switch (kind) { - case orc::DATA: - skip_count += 1; - skip_count |= (skip_count & 0xff) << 8; - return std::pair(gpu::CI_DATA, skip_count); + case orc::DATA: return gpu::CI_DATA; case orc::LENGTH: - case orc::SECONDARY: - skip_count += 1; - skip_count |= (skip_count & 0xff) << 16; - return std::pair(gpu::CI_DATA2, skip_count); - case orc::DICTIONARY_DATA: return std::pair(gpu::CI_DICTIONARY, skip_count); - case orc::PRESENT: - skip_count += (non_child ? 1 : 0); - return std::pair(gpu::CI_PRESENT, skip_count); - case orc::ROW_INDEX: return std::pair(gpu::CI_INDEX, skip_count); + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; default: // Skip this stream as it's not strictly required - return std::pair(gpu::CI_NUM_STREAMS, 0); + return gpu::CI_NUM_STREAMS; } } @@ -213,16 +204,15 @@ size_t gather_stream_info(const size_t stripe_index, } if (col != -1) { if (src_offset >= stripeinfo->indexLength || use_index) { - // NOTE: skip_count field is temporarily used to track index ordering - auto& chunk = chunks[stripe_index][col]; - const auto idx = - get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]); - if (idx.first < gpu::CI_NUM_STREAMS) { - chunk.strm_id[idx.first] = stream_info.size(); - chunk.strm_len[idx.first] = stream.length; - chunk.skip_count = idx.second; - - if (idx.first == gpu::CI_DICTIONARY) { + auto& chunk = chunks[stripe_index][col]; + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + chunk.strm_id[index_type] = stream_info.size(); + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { chunk.dictionary_start = *num_dictionary_entries; chunk.dict_len = stripefooter->columns[column_id].dictionarySize; *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 11813677b95..6c0f0767b73 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -21,6 +21,8 @@ #include #include +#include +#include namespace cudf { namespace io { @@ -226,6 +228,30 @@ enum row_entry_state_e { STORE_INDEX2, }; +/** + * @brief Calculates the order of index streams based on the index types present in the column. + * + * @param index_types_bitmap The bitmap of index types showing which index streams are present + * + * @return The order of index streams + */ +static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap) +{ + constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; + + std::array partial_order; + thrust::copy_if(thrust::seq, + full_order.cbegin(), + full_order.cend(), + partial_order.begin(), + [index_types_bitmap] __device__(auto index_type) { + // Check if the index type is present + return index_types_bitmap & (1 << index_type); + }); + + return partial_order; +} + /** * @brief Decode a single row group index entry * @@ -239,11 +265,14 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s, uint8_t const* const end) { constexpr uint32_t pb_rowindexentry_id = ProtofType::FIXEDLEN + 8; + auto const stream_order = index_order_from_index_types(s->chunk.skip_count); const uint8_t* cur = start; row_entry_state_e state = NOT_FOUND; - uint32_t length = 0, strm_idx_id = s->chunk.skip_count >> 8, idx_id = 1, ci_id = CI_PRESENT, - pos_end = 0; + uint32_t length = 0; + uint32_t idx_id = 0; + uint32_t pos_end = 0; + uint32_t ci_id = CI_NUM_STREAMS; while (cur < end) { uint32_t v = 0; for (uint32_t l = 0; l <= 28; l += 7) { @@ -283,10 +312,8 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s, } break; case STORE_INDEX0: - ci_id = (idx_id == (strm_idx_id & 0xff)) ? CI_DATA - : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2 - : CI_PRESENT; - idx_id++; + // Start of a new entry; determine the stream index types + ci_id = stream_order[idx_id++]; if (s->is_compressed) { if (ci_id < CI_PRESENT) s->row_index_entry[0][ci_id] = v; if (cur >= start + pos_end) return length; diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 02c4532bb79..0997983c95e 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,10 @@ #include #include #include -#include #include #include +#include +#include #include #include #include @@ -38,31 +39,33 @@ namespace detail { namespace { /** - * @brief Per string logic for case conversion functions. + * @brief Threshold to decide on using string or warp parallel functions. * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used to compute the output sizes. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the strings_lengths benchmark results. */ -struct upper_lower_fn { - const column_device_view d_column; - character_flags_table_type case_flag; // flag to check with on each character - const character_flags_table_type* d_flags; - const character_cases_table_type* d_case_table; - const special_case_mapping* d_special_case_mapping; - int32_t* d_offsets{}; - char* d_chars{}; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; - __device__ special_case_mapping get_special_case_mapping(uint32_t code_point) - { - return d_special_case_mapping[get_special_case_hash_index(code_point)]; - } +/** + * @brief Utility functions for converting characters to upper or lower case + */ +struct convert_char_fn { + character_flags_table_type case_flag; + character_flags_table_type const* d_flags; + character_cases_table_type const* d_case_table; + special_case_mapping const* d_special_case_mapping; - // compute-size / copy the bytes representing the special case mapping for this codepoint - __device__ int32_t handle_special_case_bytes(uint32_t code_point, - char* d_buffer, - detail::character_flags_table_type flag) + // compute size or copy the bytes representing the special case mapping for this codepoint + __device__ size_type handle_special_case_bytes(uint32_t code_point, + detail::character_flags_table_type flag, + char* d_buffer = nullptr) const { - special_case_mapping m = get_special_case_mapping(code_point); - size_type bytes = 0; + special_case_mapping m = d_special_case_mapping[get_special_case_hash_index(code_point)]; + size_type bytes = 0; auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars; auto const* chars = IS_LOWER(flag) ? m.upper : m.lower; for (uint16_t idx = 0; idx < count; idx++) { @@ -73,76 +76,200 @@ struct upper_lower_fn { return bytes; } - __device__ void operator()(size_type idx) + // this is called for converting any UTF-8 characters + __device__ size_type process_character(char_utf8 chr, char* d_buffer = nullptr) const + { + auto const code_point = detail::utf8_to_codepoint(chr); + + detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; + + // we apply special mapping in two cases: + // - uncased characters with the special mapping flag: always + // - cased characters with the special mapping flag: when matching the input case_flag + if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) { + return handle_special_case_bytes(code_point, case_flag, d_buffer); + } + + char_utf8 const new_char = + (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : chr; + return (d_buffer) ? detail::from_char_utf8(new_char, d_buffer) + : detail::bytes_in_char_utf8(new_char); + } + + // special function for converting ASCII-only characters + __device__ char process_ascii(char chr) { - if (d_column.is_null(idx)) { + return (case_flag & d_flags[chr]) ? static_cast(d_case_table[chr]) : chr; + } +}; + +/** + * @brief Per string logic for case conversion functions + * + * This can be used in calls to make_strings_children. + */ +struct upper_lower_fn { + convert_char_fn converter; + column_device_view d_strings; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; return; } - auto const d_str = d_column.template element(idx); - int32_t bytes = 0; + auto const d_str = d_strings.element(idx); + size_type bytes = 0; char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - uint32_t code_point = detail::utf8_to_codepoint(*itr); - - detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - - // we apply special mapping in two cases: - // - uncased characters with the special mapping flag, always - // - cased characters with the special mapping flag, when matching the input case_flag - // - if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) { - auto const new_bytes = handle_special_case_bytes(code_point, d_buffer, case_flag); - bytes += new_bytes; - if (d_buffer) d_buffer += new_bytes; + auto const size = converter.process_character(*itr, d_buffer); + if (d_buffer) { + d_buffer += size; } else { - char_utf8 new_char = - (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : *itr; - if (!d_buffer) - bytes += detail::bytes_in_char_utf8(new_char); - else - d_buffer += detail::from_char_utf8(new_char, d_buffer); + bytes += size; } } - if (!d_buffer) d_offsets[idx] = bytes; + if (!d_buffer) { d_offsets[idx] = bytes; } + } +}; + +/** + * @brief Count output bytes in warp-parallel threads + * + * This executes as one warp per string and just computes the output sizes. + */ +struct count_bytes_fn { + convert_char_fn converter; + column_device_view d_strings; + size_type* d_offsets; + + __device__ void operator()(size_type idx) const + { + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize the output for the atomicAdd + if (lane_idx == 0) { d_offsets[str_idx] = 0; } + __syncwarp(); + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + auto const str_ptr = d_str.data(); + + size_type size = 0; + for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { + auto const chr = str_ptr[i]; + if (is_utf8_continuation_char(chr)) { continue; } + char_utf8 u8 = 0; + to_char_utf8(str_ptr + i, u8); + size += converter.process_character(u8); + } + // this is every so slightly faster than using the cub::warp_reduce + if (size > 0) atomicAdd(d_offsets + str_idx, size); } }; +/** + * @brief Special functor for processing ASCII-only data + */ +struct ascii_converter_fn { + convert_char_fn converter; + __device__ char operator()(char chr) { return converter.process_ascii(chr); } +}; + /** * @brief Utility method for converting upper and lower case characters - * in a strings column. + * in a strings column * - * @param strings Strings to convert. + * @param input Strings to convert * @param case_flag The character type to convert (upper, lower, or both) - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with characters converted. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with characters converted */ -std::unique_ptr convert_case(strings_column_view const& strings, +std::unique_ptr convert_case(strings_column_view const& input, character_flags_table_type case_flag, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // build functor with lookup tables used for case conversion - upper_lower_fn functor{d_column, - case_flag, - get_character_flags_table(), - get_character_cases_table(), - get_special_case_mapping_table()}; - - // this utility calls the functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(functor, strings.size(), stream, mr); - - return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + if (input.size() == input.null_count()) { + return std::make_unique(input.parent(), stream, mr); + } + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_flags = get_character_flags_table(); + auto const d_cases = get_character_cases_table(); + auto const d_special = get_special_case_mapping_table(); + + convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special}; + upper_lower_fn converter{ccfn, *d_strings}; + + // For smaller strings, use the regular string-parallel algorithm + if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + auto [offsets, chars] = + cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); + return make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } + + // Check if the input contains any multi-byte characters. + // This check incurs ~20% performance hit for smaller strings and so we only use it + // after the threshold check above. The check makes very little impact for larger strings + // but results in a large performance gain when the input contains only single-byte characters. + // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016 + bool const multi_byte_chars = + thrust::count_if( + rmm::exec_policy(stream), input.chars_begin(), input.chars_end(), [] __device__(auto chr) { + return is_utf8_continuation_char(chr); + }) > 0; + if (!multi_byte_chars) { + // optimization for ASCII-only case: copy the input column and inplace replace each character + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = + result->mutable_view().child(strings_column_view::chars_column_index).data(); + auto const chars_size = strings_column_view(result->view()).chars_size(); + thrust::transform( + rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); + result->set_null_count(input.null_count()); + return result; + } + + // This will use a warp-parallel algorithm to compute the output sizes for each string + // and then uses the normal string parallel functor to build the output. + auto offsets = make_numeric_column( + data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // first pass, compute output sizes + // note: tried to use segmented-reduce approach instead here and it was consistently slower + count_bytes_fn counter{ccfn, *d_strings, d_offsets}; + auto const count_itr = thrust::make_counting_iterator(0); + thrust::for_each_n( + rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter); + + // convert sizes to offsets + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), + "Size of output exceeds column size limit", + std::overflow_error); + + auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + // second pass, write output + converter.d_offsets = d_offsets; + converter.d_chars = chars->mutable_view().data(); + thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); + + return make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index a403061ba0e..b87fb80fcc2 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -31,59 +31,84 @@ #include #include -#include #include namespace cudf { namespace strings { namespace detail { -// -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +namespace { + +/** + * @brief Returns true for each string where all characters match the given types. + * + * Only the characters that match to `verify_types` are checked. + * Returns false if no characters are checked or one character does not match `types`. + * Returns true if at least one character is checked and all checked characters match `types`. + */ +struct char_types_fn { + column_device_view const d_column; + character_flags_table_type const* d_flags; + string_character_types const types; + string_character_types const verify_types; + + __device__ bool operator()(size_type idx) const + { + if (d_column.is_null(idx)) { return false; } + auto const d_str = d_column.element(idx); + auto const end = d_str.data() + d_str.size_bytes(); + + bool type_matched = !d_str.empty(); // require at least one character; + size_type check_count = 0; // count checked characters + for (auto itr = d_str.data(); type_matched && (itr < end); ++itr) { + uint8_t const chr = static_cast(*itr); + if (is_utf8_continuation_char(chr)) { continue; } + auto u8 = static_cast(chr); // holds UTF8 value + // using max(int8) here since max(char)=255 on ARM systems + if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } + + // lookup flags in table by codepoint + auto const code_point = utf8_to_codepoint(u8); + auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; + + if ((verify_types & flag) || // should flag be verified; + (flag == 0 && verify_types == ALL_TYPES)) // special edge case + { + type_matched = (types & flag) > 0; + ++check_count; + } + } + + return type_matched && (check_count > 0); + } +}; +} // namespace + +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); // get the static character types table auto d_flags = detail::get_character_flags_table(); + // set the output values by checking the character types for each string thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - [d_column, d_flags, types, verify_types, d_results] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - auto d_str = d_column.element(idx); - bool check = !d_str.empty(); // require at least one character - size_type check_count = 0; - for (auto itr = d_str.begin(); check && (itr != d_str.end()); ++itr) { - auto code_point = detail::utf8_to_codepoint(*itr); - // lookup flags in table by code-point - auto flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - if ((verify_types & flag) || // should flag be verified - (flag == 0 && verify_types == ALL_TYPES)) // special edge case - { - check = (types & flag) > 0; - ++check_count; - } - } - return check && (check_count > 0); - }); - // - results->set_null_count(strings.null_count()); + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + char_types_fn{*d_strings, d_flags, types, verify_types}); + + results->set_null_count(input.null_count()); return results; } diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index a4bbb213930..4693fc8e342 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -37,6 +37,7 @@ TEST_F(BitmaskUtilitiesTest, StateNullCount) EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::UNALLOCATED, 42)); EXPECT_EQ(42, cudf::state_null_count(cudf::mask_state::ALL_NULL, 42)); EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::ALL_VALID, 42)); + EXPECT_THROW(cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 42), std::invalid_argument); } TEST_F(BitmaskUtilitiesTest, BitmaskAllocationSize) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp index ea23a1c1069..a196f2ec980 100644 --- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -75,6 +75,7 @@ TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) // Set nullmask, post construction. cudf::detail::set_null_mask( input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream()); + input->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); @@ -158,6 +159,7 @@ TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) // Set nullmask, post construction. cudf::detail::set_null_mask( input->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); + input->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); @@ -213,6 +215,7 @@ TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) // Set nullmask, post construction. cudf::detail::set_null_mask( input->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); + input->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); @@ -267,6 +270,7 @@ TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) // Set nullmask, post construction. cudf::detail::set_null_mask( input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream()); + input->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); @@ -332,6 +336,7 @@ TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) // Set strings nullmask, post construction. cudf::set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + strings->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); @@ -358,6 +363,7 @@ TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) // Set lists nullmask, post construction. cudf::detail::set_null_mask( lists->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream()); + lists->set_null_count(1); EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9bf16ecca6b..cd067cce928 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1568,7 +1568,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) auto make_validity = [](std::vector const& validity) { return cudf::detail::make_counting_transform_iterator( - 0, [=](auto i) -> bool { return static_cast(validity[i]); }); + 0, [&](auto i) -> bool { return static_cast(validity[i]); }); }; constexpr int int_ignore{}; diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 9c70fc364c7..d2f2b5d6a2e 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -33,7 +33,7 @@ TYPED_TEST_SUITE(TypedScalarTestWithoutFixedPoint, cudf::test::FixedWidthTypesWi TYPED_TEST(TypedScalarTest, DefaultValidity) { using Type = cudf::device_storage_type_t; - Type value = cudf::test::make_type_param_scalar(7); + Type value = static_cast(cudf::test::make_type_param_scalar(7)); cudf::scalar_type_t s(value); EXPECT_TRUE(s.is_valid()); @@ -71,7 +71,7 @@ TYPED_TEST(TypedScalarTestWithoutFixedPoint, SetNull) TYPED_TEST(TypedScalarTest, CopyConstructor) { using Type = cudf::device_storage_type_t; - Type value = cudf::test::make_type_param_scalar(8); + Type value = static_cast(cudf::test::make_type_param_scalar(8)); cudf::scalar_type_t s(value); auto s2 = s; diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index 3852930dafe..31637a6ab9a 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -202,6 +202,64 @@ TEST_F(StringsCaseTest, MultiCharLower) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsCaseTest, Ascii) +{ + // triggering the ascii code path requires some long-ish strings + cudf::test::strings_column_wrapper input{ + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto view = cudf::strings_column_view(input); + auto expected = cudf::test::strings_column_wrapper{ + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto results = cudf::strings::to_lower(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::strings_column_wrapper{ + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"}; + results = cudf::strings::to_upper(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); +} + +TEST_F(StringsCaseTest, LongStrings) +{ + // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu + cudf::test::strings_column_wrapper input{ + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto view = cudf::strings_column_view(input); + auto expected = cudf::test::strings_column_wrapper{ + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto results = cudf::strings::to_lower(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::strings_column_wrapper{ + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"}; + results = cudf::strings::to_upper(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); +} + TEST_F(StringsCaseTest, EmptyStringsColumn) { cudf::column_view zero_size_strings_column( diff --git a/dependencies.yaml b/dependencies.yaml index 00ec63d31ef..70d7f8c1ec8 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -213,8 +213,8 @@ dependencies: - output_types: conda packages: - fmt>=9.1.0,<10 - - >est gtest>=1.13.0.* - - &gmock gmock>=1.13.0.* + - >est gtest>=1.13.0 + - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - libarrow==11.0.0.* diff --git a/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java b/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java index cdb7e9e4418..fa96e833b90 100644 --- a/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java +++ b/java/src/main/java/ai/rapids/cudf/BitVectorHelper.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,7 +71,7 @@ private static void shiftSrcLeftAndWriteToDst(HostMemoryBuffer src, HostMemoryBu /** * This method returns the length in bytes needed to represent X number of rows * e.g. getValidityLengthInBytes(5) => 1 byte - * getLengthInBytes(7) => 1 byte + * getValidityLengthInBytes(7) => 1 byte * getValidityLengthInBytes(14) => 2 bytes */ static long getValidityLengthInBytes(long rows) { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 4d43ffcb457..fecb13e1921 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -119,7 +119,10 @@ public ColumnVector(DType type, long rows, Optional nullCount, incRefCountInternal(true); } - private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + /** + * This method is internal and exposed purely for testing purposes + */ + static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), @@ -141,7 +144,7 @@ private static OffHeapState makeOffHeap(DType type, long rows, Optional nu * @param offsetBuffer a host buffer required for strings and string categories. The column * vector takes ownership of the buffer. Do not use the buffer after calling * this. - * @param toClose List of buffers to track adn close once done, usually in case of children + * @param toClose List of buffers to track and close once done, usually in case of children * @param childHandles array of longs for child column view handles. */ public ColumnVector(DType type, long rows, Optional nullCount, diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 67ad9166fe0..8b59ea68972 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -43,8 +43,10 @@ public class ColumnView implements AutoCloseable, BinaryOperable { protected final ColumnVector.OffHeapState offHeap; /** - * Constructs a Column View given a native view address + * Constructs a Column View given a native view address. This asserts that if the ColumnView is + * of nested-type it doesn't contain non-empty nulls * @param address the view handle + * @throws AssertionError if the address points to a nested-type view with non-empty nulls */ ColumnView(long address) { this.viewHandle = address; @@ -52,15 +54,24 @@ public class ColumnView implements AutoCloseable, BinaryOperable { this.rows = ColumnView.getNativeRowCount(viewHandle); this.nullCount = ColumnView.getNativeNullCount(viewHandle); this.offHeap = null; - AssertEmptyNulls.assertNullsAreEmpty(this); + try { + AssertEmptyNulls.assertNullsAreEmpty(this); + } catch (AssertionError ae) { + // offHeap state is null, so there is nothing to clean in offHeap + // delete ColumnView to avoid memory leak + deleteColumnView(viewHandle); + viewHandle = 0; + throw ae; + } } /** * Intended to be called from ColumnVector when it is being constructed. Because state creates a * cudf::column_view instance and will close it in all cases, we don't want to have to double - * close it. + * close it. This asserts that if the offHeapState is of nested-type it doesn't contain non-empty nulls * @param state the state this view is based off of. + * @throws AssertionError if offHeapState points to a nested-type view with non-empty nulls */ protected ColumnView(ColumnVector.OffHeapState state) { offHeap = state; @@ -68,7 +79,14 @@ protected ColumnView(ColumnVector.OffHeapState state) { type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); rows = ColumnView.getNativeRowCount(viewHandle); nullCount = ColumnView.getNativeNullCount(viewHandle); - AssertEmptyNulls.assertNullsAreEmpty(this); + try { + AssertEmptyNulls.assertNullsAreEmpty(this); + } catch (AssertionError ae) { + // cleanup offHeap + offHeap.clean(false); + viewHandle = 0; + throw ae; + } } /** @@ -649,8 +667,14 @@ public final ColumnVector ifElse(Scalar trueValue, Scalar falseValue) { public final ColumnVector[] slice(int... indices) { long[] nativeHandles = slice(this.getNativeView(), indices); ColumnVector[] columnVectors = new ColumnVector[nativeHandles.length]; - for (int i = 0; i < nativeHandles.length; i++) { - columnVectors[i] = new ColumnVector(nativeHandles[i]); + try { + for (int i = 0; i < nativeHandles.length; i++) { + columnVectors[i] = new ColumnVector(nativeHandles[i]); + nativeHandles[i] = 0; + } + } catch (Throwable t) { + cleanupColumnViews(nativeHandles, columnVectors); + throw t; } return columnVectors; } @@ -788,12 +812,31 @@ public final ColumnVector[] split(int... indices) { public ColumnView[] splitAsViews(int... indices) { long[] nativeHandles = split(this.getNativeView(), indices); ColumnView[] columnViews = new ColumnView[nativeHandles.length]; - for (int i = 0; i < nativeHandles.length; i++) { - columnViews[i] = new ColumnView(nativeHandles[i]); + try { + for (int i = 0; i < nativeHandles.length; i++) { + columnViews[i] = new ColumnView(nativeHandles[i]); + nativeHandles[i] = 0; + } + } catch (Throwable t) { + cleanupColumnViews(nativeHandles, columnViews); + throw t; } return columnViews; } + static void cleanupColumnViews(long[] nativeHandles, ColumnView[] columnViews) { + for (ColumnView columnView: columnViews) { + if (columnView != null) { + columnView.close(); + } + } + for (long nativeHandle: nativeHandles) { + if (nativeHandle != 0) { + deleteColumnView(nativeHandle); + } + } + } + /** * Create a new vector of "normalized" values, where: * 1. All representations of NaN (and -NaN) are replaced with the normalized NaN value diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 9abc2dbcd7c..93cb1acfae4 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -87,6 +87,7 @@ public Table(long[] cudfColumns) { try { for (int i = 0; i < cudfColumns.length; i++) { this.columns[i] = new ColumnVector(cudfColumns[i]); + cudfColumns[i] = 0; } long[] views = new long[columns.length]; for (int i = 0; i < columns.length; i++) { @@ -95,13 +96,7 @@ public Table(long[] cudfColumns) { nativeHandle = createCudfTableView(views); this.rows = columns[0].getRowCount(); } catch (Throwable t) { - for (int i = 0; i < cudfColumns.length; i++) { - if (this.columns[i] != null) { - this.columns[i].close(); - } else { - ColumnVector.deleteCudfColumn(cudfColumns[i]); - } - } + ColumnView.cleanupColumnViews(cudfColumns, this.columns); throw t; } } @@ -3396,8 +3391,14 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe public ColumnVector[] convertToRows() { long[] ptrs = convertToRows(nativeHandle); ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); + try { + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + ptrs[i] = 0; + } + } catch (Throwable t) { + ColumnView.cleanupColumnViews(ptrs, ret); + throw t; } return ret; } @@ -3479,8 +3480,14 @@ public ColumnVector[] convertToRows() { public ColumnVector[] convertToRowsFixedWidthOptimized() { long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); + try { + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + ptrs[i] = 0; + } + } catch (Throwable t) { + ColumnView.cleanupColumnViews(ptrs, ret); + throw t; } return ret; } @@ -3552,14 +3559,7 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data } result = new Table(columns); } catch (Throwable t) { - for (int i = 0; i < columns.length; i++) { - if (columns[i] != null) { - columns[i].close(); - } - if (columnViewAddresses[i] != 0) { - ColumnView.deleteColumnView(columnViewAddresses[i]); - } - } + ColumnView.cleanupColumnViews(columnViewAddresses, columns); throw t; } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 6e9498acdac..59b4c9f9f67 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -32,6 +32,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -6677,6 +6678,54 @@ void testApplyBooleanMaskFromListOfStructure() { } } + @Test + void testColumnViewWithNonEmptyNullsIsCleared() { + List list0 = Arrays.asList(1, 2, 3); + List list1 = Arrays.asList(4, 5, null); + List list2 = Arrays.asList(7, 8, 9); + List list3 = null; + try (ColumnVector input = ColumnVectorTest.makeListsColumn(DType.INT32, list0, list1, list2, list3); + BaseDeviceMemoryBuffer baseValidityBuffer = input.getDeviceBufferFor(BufferType.VALIDITY); + BaseDeviceMemoryBuffer baseOffsetBuffer = input.getDeviceBufferFor(BufferType.OFFSET); + HostMemoryBuffer newValidity = HostMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4))) { + + newValidity.copyFromDeviceBuffer(baseValidityBuffer); + // we are setting list1 with 3 elements to null. This will result in a non-empty null in the + // ColumnView at index 1 + BitVectorHelper.setNullAt(newValidity, 1); + // validityBuffer will be closed by offHeapState later + DeviceMemoryBuffer validityBuffer = DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4)); + try { + // offsetBuffer will be closed by offHeapState later + DeviceMemoryBuffer offsetBuffer = DeviceMemoryBuffer.allocate(baseOffsetBuffer.getLength()); + try { + validityBuffer.copyFromHostBuffer(newValidity); + offsetBuffer.copyFromMemoryBuffer(0, baseOffsetBuffer, 0, + baseOffsetBuffer.length, Cuda.DEFAULT_STREAM); + + // The new offHeapState will have 2 nulls, one null at index 4 from the original ColumnVector + // the other at index 1 which is non-empty + ColumnVector.OffHeapState offHeapState = ColumnVector.makeOffHeap(input.type, input.rows, Optional.of(2L), + null, validityBuffer, offsetBuffer, + null, Arrays.stream(input.getChildColumnViews()).mapToLong((c) -> c.viewHandle).toArray()); + try { + new ColumnView(offHeapState); + } catch (AssertionError ae) { + assert offHeapState.isClean(); + } + } catch (Exception e) { + if (!offsetBuffer.closed) { + offsetBuffer.close(); + } + } + } catch (Exception e) { + if (!validityBuffer.closed) { + validityBuffer.close(); + } + } + } + } + @Test public void testEventHandlerIsCalledForEachClose() { final AtomicInteger onClosedWasCalled = new AtomicInteger(0); @@ -6700,5 +6749,4 @@ public void testEventHandlerIsNotCalledIfNotSet() { } assertEquals(0, onClosedWasCalled.get()); } - } diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 013cba3ae03..93edef311db 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -4,12 +4,12 @@ from __future__ import annotations from typing import Dict, Optional, Tuple, TypeVar +from typing_extensions import Self + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase -T = TypeVar("T") - class Column: _data: Optional[Buffer] _mask: Optional[Buffer] @@ -29,8 +29,8 @@ class Column: size: int, dtype: Dtype, mask: Optional[Buffer] = None, - offset: int = None, - null_count: int = None, + offset: Optional[int] = None, + null_count: Optional[int] = None, children: Tuple[ColumnBase, ...] = (), ) -> None: ... @property @@ -56,7 +56,7 @@ class Column: @property def mask_ptr(self) -> int: ... def set_base_mask(self, value: Optional[Buffer]) -> None: ... - def set_mask(self: T, value: Optional[Buffer]) -> T: ... + def set_mask(self, value: Optional[Buffer]) -> Self: ... @property def null_count(self) -> int: ... @property @@ -68,7 +68,8 @@ class Column: def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False - ) -> Optional[ColumnBase]: ... + ) -> Optional[Self]: ... + # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 68c52a8b9e8..a2e3bc44f3a 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -5,9 +5,10 @@ import pickle import warnings from functools import cached_property -from typing import Any, Set, TypeVar +from typing import Any, Set import pandas as pd +from typing_extensions import Self import cudf from cudf._lib.copying import _gather_map_is_valid, gather @@ -62,8 +63,6 @@ Float64Index([1.0, 2.0, 3.0], dtype='float64') """ -BaseIndexT = TypeVar("BaseIndexT", bound="BaseIndex") - class BaseIndex(Serializable): """Base class for all cudf Index types.""" @@ -101,8 +100,8 @@ def __contains__(self, item): return item in self._values def _copy_type_metadata( - self: BaseIndexT, other: BaseIndexT, *, override_dtypes=None - ) -> BaseIndexT: + self, other: Self, *, override_dtypes=None + ) -> Self: raise NotImplementedError def get_level_values(self, level): @@ -1451,7 +1450,6 @@ def get_slice_bound(self, label, side, kind=None): raise NotImplementedError def __array_function__(self, func, types, args, kwargs): - # check if the function is implemented for the current type cudf_index_module = type(self) for submodule in func.__module__.split(".")[1:]: diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 2262730d8a1..97f3b16bec8 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -5,9 +5,10 @@ import math import pickle from types import SimpleNamespace -from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar +from typing import Any, Dict, Mapping, Optional, Sequence, Tuple import numpy +from typing_extensions import Self import rmm @@ -15,8 +16,6 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes -T = TypeVar("T", bound="Buffer") - def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy @@ -42,7 +41,7 @@ def host_memory_allocation(nbytes: int) -> memoryview: def cuda_array_interface_wrapper( ptr: int, size: int, - owner: object = None, + owner: Optional[object] = None, readonly=False, typestr="|u1", version=0, @@ -108,7 +107,7 @@ def __init__(self): ) @classmethod - def _from_device_memory(cls: Type[T], data: Any) -> T: + def _from_device_memory(cls, data: Any) -> Self: """Create a Buffer from an object exposing `__cuda_array_interface__`. No data is being copied. @@ -139,7 +138,7 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: return ret @classmethod - def _from_host_memory(cls: Type[T], data: Any) -> T: + def _from_host_memory(cls, data: Any) -> Self: """Create a Buffer from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or @@ -310,7 +309,7 @@ def serialize(self) -> Tuple[dict, list]: return header, frames @classmethod - def deserialize(cls: Type[T], header: dict, frames: list) -> T: + def deserialize(cls, header: dict, frames: list) -> Self: """Create an Buffer from a serialized representation. Parameters diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 16a1e3942e7..6243916b91b 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -4,15 +4,15 @@ import weakref from collections import defaultdict -from typing import Any, DefaultDict, Tuple, Type, TypeVar +from typing import Any, DefaultDict, Tuple from weakref import WeakSet +from typing_extensions import Self + import rmm from cudf.core.buffer.buffer import Buffer -T = TypeVar("T", bound="CopyOnWriteBuffer") - def _keys_cleanup(ptr): weak_set_values = CopyOnWriteBuffer._instances[ptr] @@ -55,9 +55,7 @@ def _finalize_init(self): weakref.finalize(self, _keys_cleanup, self._ptr) @classmethod - def _from_device_memory( - cls: Type[T], data: Any, *, exposed: bool = False - ) -> T: + def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: """Create a Buffer from an object exposing `__cuda_array_interface__`. No data is being copied. @@ -82,7 +80,7 @@ def _from_device_memory( return ret @classmethod - def _from_host_memory(cls: Type[T], data: Any) -> T: + def _from_host_memory(cls, data: Any) -> Self: ret = super()._from_host_memory(data) ret._finalize_init() return ret diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index d2a87af3869..7f8399ba522 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -225,7 +225,7 @@ def __init__( self, *, spill_on_demand: bool = False, - device_memory_limit: int = None, + device_memory_limit: Optional[int] = None, statistic_level: int = 0, ) -> None: self._lock = threading.Lock() @@ -358,7 +358,7 @@ def spill_device_memory(self, nbytes: int) -> int: buf.lock.release() return spilled - def spill_to_device_limit(self, device_limit: int = None) -> int: + def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: """Try to spill device memory until device limit Notice, by default this is a no-op. diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index c71841a5a26..c1bc49c7a9e 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -7,18 +7,10 @@ import time import weakref from threading import RLock -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Optional, - Tuple, - Type, - TypeVar, -) +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import numpy +from typing_extensions import Self import rmm @@ -34,9 +26,6 @@ from cudf.core.buffer.spill_manager import SpillManager -T = TypeVar("T", bound="SpillableBuffer") - - def get_spillable_owner(data) -> Optional[SpillableBuffer]: """Get the spillable owner of `data`, if any exist @@ -212,9 +201,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: self._manager.add(self) @classmethod - def _from_device_memory( - cls: Type[T], data: Any, *, exposed: bool = False - ) -> T: + def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: """Create a spillabe buffer from device memory. No data is being copied. @@ -236,7 +223,7 @@ def _from_device_memory( return ret @classmethod - def _from_host_memory(cls: Type[T], data: Any) -> T: + def _from_host_memory(cls, data: Any) -> Self: """Create a spillabe buffer from host memory. Data must implement `__array_interface__`, the buffer protocol, and/or @@ -448,7 +435,9 @@ def __cuda_array_interface__(self) -> dict: "version": 0, } - def memoryview(self, *, offset: int = 0, size: int = None) -> memoryview: + def memoryview( + self, *, offset: int = 0, size: Optional[int] = None + ) -> memoryview: size = self._size if size is None else size with self.lock: if self.spillable: @@ -573,7 +562,9 @@ def deserialize(cls, header: dict, frames: list): # copied. return SpillableBuffer.deserialize(header, frames) - def memoryview(self, *, offset: int = 0, size: int = None) -> memoryview: + def memoryview( + self, *, offset: int = 0, size: Optional[int] = None + ) -> memoryview: size = self._size if size is None else size return self._base.memoryview(offset=self._offset + offset, size=size) diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 2fe332a12fe..85e4762641e 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -16,8 +16,8 @@ def as_buffer( data: Union[int, Any], *, - size: int = None, - owner: object = None, + size: Optional[int] = None, + owner: Optional[object] = None, exposed: bool = False, ) -> Buffer: """Factory function to wrap `data` in a Buffer object. diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 52f7c0b957f..39332807139 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -11,6 +11,7 @@ import pandas as pd import pyarrow as pa from numba import cuda +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -710,13 +711,12 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: Buffer = None, - size: int = None, + mask: Optional[Buffer] = None, + size: Optional[int] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, children: Tuple["column.ColumnBase", ...] = (), ): - if size is None: for child in children: assert child.offset == 0 @@ -874,7 +874,7 @@ def _fill( begin: int, end: int, inplace: bool = False, - ) -> "column.ColumnBase": + ) -> Self: if end <= begin or begin >= self.size: return self if inplace else self.copy() @@ -889,18 +889,21 @@ def _fill( return result def slice( - self, start: int, stop: int, stride: int = None - ) -> "column.ColumnBase": + self, start: int, stop: int, stride: Optional[int] = None + ) -> Self: codes = self.codes.slice(start, stop, stride) - return cudf.core.column.build_categorical_column( - categories=self.categories, - codes=cudf.core.column.build_column( - codes.base_data, dtype=codes.dtype + return cast( + Self, + cudf.core.column.build_categorical_column( + categories=self.categories, + codes=cudf.core.column.build_column( + codes.base_data, dtype=codes.dtype + ), + mask=codes.base_mask, + ordered=self.ordered, + size=codes.size, + offset=codes.offset, ), - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @@ -962,7 +965,9 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: " if you need this functionality." ) - def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: + def to_pandas( + self, index: Optional[pd.Index] = None, **kwargs + ) -> pd.Series: if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( @@ -1219,7 +1224,10 @@ def notnull(self) -> ColumnBase: return result def fillna( - self, fill_value: Any = None, method: Any = None, dtype: Dtype = None + self, + fill_value: Any = None, + method: Any = None, + dtype: Optional[Dtype] = None, ) -> CategoricalColumn: """ Fill null values with *fill_value* @@ -1237,7 +1245,7 @@ def fillna( try: fill_value = self._encode(fill_value) fill_value = self.codes.dtype.type(fill_value) - except (ValueError) as err: + except ValueError as err: err_msg = "fill value must be in categories" raise ValueError(err_msg) from err else: @@ -1351,7 +1359,7 @@ def _get_decategorized_column(self) -> ColumnBase: out = out.set_mask(self.mask) return out - def copy(self, deep: bool = True) -> CategoricalColumn: + def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: result_col.categories = libcudf.copying.copy_column( @@ -1365,7 +1373,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False - ) -> Optional[ColumnBase]: + ) -> Optional[Self]: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes @@ -1641,7 +1649,7 @@ def _create_empty_categorical_column( def pandas_categorical_as_column( - categorical: ColumnLike, codes: ColumnLike = None + categorical: ColumnLike, codes: Optional[ColumnLike] = None ) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 042a1060fae..607bf83ff6c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import pickle import warnings from functools import cached_property @@ -15,7 +16,6 @@ Optional, Sequence, Tuple, - TypeVar, Union, cast, ) @@ -25,6 +25,7 @@ import pandas as pd import pyarrow as pa from numba import cuda +from typing_extensions import Self import rmm @@ -97,11 +98,6 @@ else: from pandas.core.arrays._arrow_utils import ArrowIntervalType -T = TypeVar("T", bound="ColumnBase") -# TODO: This workaround allows type hints for `slice`, since `slice` is a -# method in ColumnBase. -Slice = TypeVar("Slice", bound=slice) - class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _VALID_REDUCTIONS = { @@ -201,7 +197,9 @@ def __repr__(self): f"dtype: {self.dtype}" ) - def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + def to_pandas( + self, index: Optional[pd.Index] = None, **kwargs + ) -> pd.Series: """Convert object to pandas type. The default implementation falls back to PyArrow for the conversion. @@ -244,11 +242,11 @@ def values(self) -> "cupy.ndarray": return cupy.asarray(self.data_array_view(mode="write")) def find_and_replace( - self: T, + self, to_replace: ColumnLike, replacement: ColumnLike, all_nan: bool = False, - ) -> T: + ) -> Self: raise NotImplementedError def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: @@ -397,7 +395,7 @@ def _fill( begin: int, end: int, inplace: bool = False, - ) -> Optional[ColumnBase]: + ) -> Optional[Self]: if end <= begin or begin >= self.size: return self if inplace else self.copy() @@ -437,15 +435,15 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self: T) -> T: + def force_deep_copy(self) -> Self: """ A method to create deep copy irrespective of whether `copy-on-write` is enabled. """ result = libcudf.copying.copy_column(self) - return cast(T, result._with_type_metadata(self.dtype)) + return result._with_type_metadata(self.dtype) - def copy(self: T, deep: bool = True) -> T: + def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -467,7 +465,7 @@ def copy(self: T, deep: bool = True) -> T: return self.force_deep_copy() else: return cast( - T, + Self, build_column( data=self.base_data if self.base_data is None @@ -548,7 +546,9 @@ def element_indexing(self, index: int): return libcudf.copying.get_element(self, idx).value - def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: + def slice( + self, start: int, stop: int, stride: Optional[int] = None + ) -> ColumnBase: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -607,8 +607,10 @@ def _wrap_binop_normalization(self, other): return self.normalize_binop_value(other) def _scatter_by_slice( - self, key: Slice, value: Union[cudf.core.scalar.Scalar, ColumnBase] - ) -> Optional[ColumnBase]: + self, + key: builtins.slice, + value: Union[cudf.core.scalar.Scalar, ColumnBase], + ) -> Optional[Self]: """If this function returns None, it's either a no-op (slice is empty), or the inplace replacement is already performed (fill-in-place). """ @@ -646,7 +648,7 @@ def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, value: Union[cudf.core.scalar.Scalar, ColumnBase], - ) -> ColumnBase: + ) -> Self: if is_bool_dtype(key.dtype): # `key` is boolean mask if len(key) != len(self): @@ -697,11 +699,11 @@ def _check_scatter_key_length( raise ValueError(msg) def fillna( - self: T, + self, value: Any = None, - method: str = None, - dtype: Dtype = None, - ) -> T: + method: Optional[str] = None, + dtype: Optional[Dtype] = None, + ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. @@ -769,8 +771,8 @@ def quantile( raise TypeError(f"cannot perform quantile with type {self.dtype}") def take( - self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True - ) -> T: + self, indices: ColumnBase, nullify: bool = False, check_bounds=True + ) -> Self: """Return Column by taking values from the corresponding *indices*. Skip bounds checking if check_bounds is False. @@ -778,7 +780,7 @@ def take( """ # Handle zero size if indices.size == 0: - return cast(T, column_empty_like(self, newsize=0)) + return cast(Self, column_empty_like(self, newsize=0)) # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. @@ -1097,7 +1099,6 @@ def apply_boolean_mask(self, mask) -> ColumnBase: def argsort( self, ascending: bool = True, na_position: str = "last" ) -> "cudf.core.column.NumericalColumn": - return self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position ) @@ -1244,14 +1245,19 @@ def normalize_binop_value( ) -> Union[ColumnBase, ScalarLike]: raise NotImplementedError - def _minmax(self, skipna: bool = None): + def _minmax(self, skipna: Optional[bool] = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.minmax(result_col) return result_col def _reduce( - self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs + self, + op: str, + skipna: Optional[bool] = None, + min_count: int = 0, + *args, + **kwargs, ) -> ScalarLike: """Compute {op} of column values. @@ -1273,7 +1279,7 @@ def contains_na_entries(self) -> bool: return self.null_count != 0 def _process_for_reduction( - self, skipna: bool = None, min_count: int = 0 + self, skipna: Optional[bool] = None, min_count: int = 0 ) -> Union[ColumnBase, ScalarLike]: skipna = True if skipna is None else skipna @@ -1314,8 +1320,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def _label_encoding( self, cats: ColumnBase, - dtype: Dtype = None, - na_sentinel: ScalarLike = None, + dtype: Optional[Dtype] = None, + na_sentinel: Optional[ScalarLike] = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1389,9 +1395,9 @@ def _return_sentinel_column(): def column_empty_like( column: ColumnBase, - dtype: Dtype = None, + dtype: Optional[Dtype] = None, masked: bool = False, - newsize: int = None, + newsize: Optional[int] = None, ) -> ColumnBase: """Allocate a new column like the given *column*""" if dtype is None: @@ -1403,8 +1409,10 @@ def column_empty_like( and is_categorical_dtype(column.dtype) and dtype == column.dtype ): - column = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like(column.codes, masked=masked, newsize=newsize) + catcolumn = cast("cudf.core.column.CategoricalColumn", column) + codes = column_empty_like( + catcolumn.codes, masked=masked, newsize=newsize + ) return build_column( data=None, dtype=dtype, @@ -1494,10 +1502,10 @@ def build_column( data: Union[Buffer, None], dtype: Dtype, *, - size: int = None, - mask: Buffer = None, + size: Optional[int] = None, + mask: Optional[Buffer] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, children: Tuple[ColumnBase, ...] = (), ) -> ColumnBase: """ @@ -1666,10 +1674,10 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: Buffer = None, - size: int = None, + mask: Optional[Buffer] = None, + size: Optional[int] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ordered: bool = False, ) -> "cudf.core.column.CategoricalColumn": """ @@ -1757,10 +1765,10 @@ def build_interval_column( def build_list_column( indices: ColumnBase, elements: ColumnBase, - mask: Buffer = None, - size: int = None, + mask: Optional[Buffer] = None, + size: Optional[int] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ) -> "cudf.core.column.ListColumn": """ Build a ListColumn @@ -1803,10 +1811,10 @@ def build_struct_column( names: Sequence[str], children: Tuple[ColumnBase, ...], dtype: Optional[Dtype] = None, - mask: Buffer = None, - size: int = None, + mask: Optional[Buffer] = None, + size: Optional[int] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ) -> "cudf.core.column.StructColumn": """ Build a StructColumn @@ -1863,9 +1871,9 @@ def _make_copy_replacing_NaT_with_null(column): def as_column( arbitrary: Any, - nan_as_null: bool = None, - dtype: Dtype = None, - length: int = None, + nan_as_null: Optional[bool] = None, + dtype: Optional[Dtype] = None, + length: Optional[int] = None, ): """Create a Column from an arbitrary object @@ -2106,7 +2114,6 @@ def as_column( data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) elif arb_dtype.kind == "m": - time_unit = get_time_unit(arbitrary) cast_dtype = time_unit in ("D", "W", "M", "Y") @@ -2466,7 +2473,7 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: def arange( start: Union[int, float], - stop: Union[int, float] = None, + stop: Optional[Union[int, float]] = None, step: Union[int, float] = 1, dtype=None, ) -> cudf.core.column.NumericalColumn: @@ -2524,7 +2531,9 @@ def arange( ) -def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: +def full( + size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None +) -> ColumnBase: """ Returns a column of given size and dtype, filled with a given value. diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 4c65a631adc..c0a2a6ac546 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -6,7 +6,7 @@ import locale import re from locale import nl_langinfo -from typing import Any, Mapping, Sequence, cast +from typing import Any, Mapping, Optional, Sequence, cast import numpy as np import pandas as pd @@ -125,10 +125,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Buffer = None, - size: int = None, # TODO: make non-optional + mask: Optional[Buffer] = None, + size: Optional[int] = None, # TODO: make non-optional offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) @@ -202,7 +202,10 @@ def day_of_year(self) -> ColumnBase: return self.get_dt_field("day_of_year") def to_pandas( - self, index: pd.Index = None, nullable: bool = False, **kwargs + self, + index: Optional[pd.Index] = None, + nullable: bool = False, + **kwargs, ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -363,7 +366,7 @@ def mean( def std( self, - skipna: bool = None, + skipna: Optional[bool] = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, @@ -375,7 +378,7 @@ def std( * _unit_to_nanoseconds_conversion[self.time_unit], ) - def median(self, skipna: bool = None) -> pd.Timestamp: + def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) @@ -451,7 +454,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) def fillna( - self, fill_value: Any = None, method: str = None, dtype: Dtype = None + self, + fill_value: Any = None, + method: Optional[str] = None, + dtype: Optional[Dtype] = None, ) -> DatetimeColumn: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -495,7 +501,6 @@ def isin(self, values: Sequence) -> ColumnBase: def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): - to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) @@ -542,10 +547,10 @@ def __init__( self, data: Buffer, dtype: pd.DatetimeTZDtype, - mask: Buffer = None, - size: int = None, + mask: Optional[Buffer] = None, + size: Optional[int] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ): super().__init__( data=data, @@ -558,7 +563,10 @@ def __init__( self._dtype = dtype def to_pandas( - self, index: pd.Index = None, nullable: bool = False, **kwargs + self, + index: Optional[pd.Index] = None, + nullable: bool = False, + **kwargs, ) -> "cudf.Series": return self._local_time.to_pandas().dt.tz_localize( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9fc7663ffca..420637c1924 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -2,7 +2,7 @@ import warnings from decimal import Decimal -from typing import Any, Sequence, Union, cast +from typing import Any, Optional, Sequence, Union, cast import cupy as cp import numpy as np @@ -103,7 +103,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): return result def fillna( - self, value: Any = None, method: str = None, dtype: Dtype = None + self, + value: Any = None, + method: Optional[str] = None, + dtype: Optional[Dtype] = None, ): """Fill null values with ``value``. diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 657403a6082..38384d09126 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from typing import Optional + import pandas as pd import pyarrow as pa @@ -19,7 +21,6 @@ def __init__( children=(), closed="right", ): - super().__init__( data=None, dtype=dtype, @@ -124,7 +125,9 @@ def as_interval_column(self, dtype, **kwargs): else: raise ValueError("dtype must be IntervalDtype") - def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + def to_pandas( + self, index: Optional[pd.Index] = None, **kwargs + ) -> pd.Series: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of # self.to_arrow) is currently the best known way to convert interval diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 4eea64c00d3..0bb9f70f851 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -5,6 +5,7 @@ import numpy as np import pyarrow as pa +from typing_extensions import Self import cudf from cudf._lib.copying import segmented_gather @@ -261,7 +262,7 @@ def as_string_column( # Call libcudf to format the list column return format_list_column(lc, separators) - def _transform_leaves(self, func, *args, **kwargs): + def _transform_leaves(self, func, *args, **kwargs) -> Self: # return a new list column with the same nested structure # as ``self``, but with the leaf column transformed # by applying ``func`` to it diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 87e73d212ef..39798320f88 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -81,10 +81,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Buffer = None, - size: int = None, # TODO: make this non-optional + mask: Optional[Buffer] = None, + size: Optional[int] = None, # TODO: make this non-optional offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) @@ -428,11 +428,11 @@ def _process_values_for_isin( return lhs, rhs - def _can_return_nan(self, skipna: bool = None) -> bool: + def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: return not skipna and self.has_nulls(include_nan=True) def _process_for_reduction( - self, skipna: bool = None, min_count: int = 0 + self, skipna: Optional[bool] = None, min_count: int = 0 ) -> Union[NumericalColumn, ScalarLike]: skipna = True if skipna is None else skipna @@ -516,8 +516,8 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: str = None, - dtype: Dtype = None, + method: Optional[str] = None, + dtype: Optional[Dtype] = None, fill_nan: bool = True, ) -> NumericalColumn: """ @@ -684,7 +684,6 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: ): return True else: - filled = self.fillna(0) return ( cudf.Series(filled).astype(to_dtype).astype(filled.dtype) @@ -720,8 +719,11 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: return self def to_pandas( - self, index: pd.Index = None, nullable: bool = False, **kwargs - ) -> "pd.Series": + self, + index: Optional[pd.Index] = None, + nullable: bool = False, + **kwargs, + ) -> pd.Series: if nullable and self.dtype in np_dtypes_to_pandas_dtypes: pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index bb7711a3ead..08c2f7cc7b1 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,9 +1,9 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. """Define an interface for columns that can perform numerical operations.""" from __future__ import annotations -from typing import cast +from typing import Optional, cast import numpy as np @@ -40,10 +40,10 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } - def _can_return_nan(self, skipna: bool = None) -> bool: + def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: return not skipna and self.has_nulls() - def kurtosis(self, skipna: bool = None) -> float: + def kurtosis(self, skipna: Optional[bool] = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -68,7 +68,7 @@ def kurtosis(self, skipna: bool = None) -> float: kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna: bool = None) -> ScalarLike: + def skew(self, skipna: Optional[bool] = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -122,26 +122,39 @@ def quantile( ) return result - def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64): + def mean( + self, + skipna: Optional[bool] = None, + min_count: int = 0, + dtype=np.float64, + ): return self._reduce( "mean", skipna=skipna, min_count=min_count, dtype=dtype ) def var( - self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + self, + skipna: Optional[bool] = None, + min_count: int = 0, + dtype=np.float64, + ddof=1, ): return self._reduce( "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) def std( - self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + self, + skipna: Optional[bool] = None, + min_count: int = 0, + dtype=np.float64, + ddof=1, ): return self._reduce( "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) - def median(self, skipna: bool = None) -> NumericalBaseColumn: + def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fefa7beb562..a3163f1cebe 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -259,12 +259,14 @@ def byte_count(self) -> SeriesOrIndex: ) @overload - def cat(self, sep: str = None, na_rep: str = None) -> str: + def cat( + self, sep: Optional[str] = None, na_rep: Optional[str] = None + ) -> str: ... @overload def cat( - self, others, sep: str = None, na_rep: str = None + self, others, sep: Optional[str] = None, na_rep: Optional[str] = None ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... @@ -744,7 +746,7 @@ def contains( 4 False dtype: bool - The ``pat`` may also be a list of strings in which case + The ``pat`` may also be a sequence of strings in which case the individual strings are searched in corresponding rows. >>> s2 = cudf.Series(['house', 'dog', 'and', '', '']) @@ -756,8 +758,6 @@ def contains( 4 dtype: bool """ # noqa W605 - if case is not True: - raise NotImplementedError("`case` parameter is not yet supported") if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") if regex and isinstance(pat, re.Pattern): @@ -767,25 +767,34 @@ def contains( raise NotImplementedError( "unsupported value for `flags` parameter" ) - - if pat is None: - result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + if regex and not case: + raise NotImplementedError( + "`case=False` only supported when `regex=False`" ) - elif is_scalar(pat): + + if is_scalar(pat): if regex: result_col = libstrings.contains_re(self._column, pat, flags) else: - result_col = libstrings.contains( - self._column, cudf.Scalar(pat, "str") - ) + if case is False: + input_column = libstrings.to_lower(self._column) + pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore + else: + input_column = self._column + pat = cudf.Scalar(pat, dtype="str") # type: ignore + result_col = libstrings.contains(input_column, pat) else: - result_col = libstrings.contains_multiple( - self._column, column.as_column(pat, dtype="str") - ) + # TODO: we silently ignore the `regex=` flag here + if case is False: + input_column = libstrings.to_lower(self._column) + pat = libstrings.to_lower(column.as_column(pat, dtype="str")) + else: + input_column = self._column + pat = column.as_column(pat, dtype="str") + result_col = libstrings.contains_multiple(input_column, pat) return self._return_or_inplace(result_col) - def like(self, pat: str, esc: str = None) -> SeriesOrIndex: + def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: """ Test if a like pattern matches a string of a Series or Index. @@ -1065,7 +1074,10 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: ) def slice( - self, start: int = None, stop: int = None, step: int = None + self, + start: Optional[int] = None, + stop: Optional[int] = None, + step: Optional[int] = None, ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -2040,7 +2052,7 @@ def istitle(self) -> SeriesOrIndex: return self._return_or_inplace(libstrings.is_title(self._column)) def filter_alphanum( - self, repl: str = None, keep: bool = True + self, repl: Optional[str] = None, keep: bool = True ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. @@ -2126,7 +2138,10 @@ def slice_from( ) def slice_replace( - self, start: int = None, stop: int = None, repl: str = None + self, + start: Optional[int] = None, + stop: Optional[int] = None, + repl: Optional[str] = None, ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -2214,7 +2229,9 @@ def slice_replace( ), ) - def insert(self, start: int = 0, repl: str = None) -> SeriesOrIndex: + def insert( + self, start: int = 0, repl: Optional[str] = None + ) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2394,10 +2411,10 @@ def get_json_object( def split( self, - pat: str = None, + pat: Optional[str] = None, n: int = -1, expand: bool = False, - regex: bool = None, + regex: Optional[bool] = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2562,10 +2579,10 @@ def split( def rsplit( self, - pat: str = None, + pat: Optional[str] = None, n: int = -1, expand: bool = False, - regex: bool = None, + regex: Optional[bool] = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -3214,7 +3231,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: str = None) -> SeriesOrIndex: + def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3273,7 +3290,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) ) - def lstrip(self, to_strip: str = None) -> SeriesOrIndex: + def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3320,7 +3337,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex: libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) ) - def rstrip(self, to_strip: str = None) -> SeriesOrIndex: + def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3973,7 +3990,9 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: ) return self._return_or_inplace(result) - def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex: + def find( + self, sub: str, start: int = 0, end: Optional[int] = None + ) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -4029,7 +4048,7 @@ def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex: return self._return_or_inplace(result_col) def rfind( - self, sub: str, start: int = 0, end: int = None + self, sub: str, start: int = 0, end: Optional[int] = None ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index @@ -4090,7 +4109,7 @@ def rfind( return self._return_or_inplace(result_col) def index( - self, sub: str, start: int = 0, end: int = None + self, sub: str, start: int = 0, end: Optional[int] = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring @@ -4152,7 +4171,7 @@ def index( return result def rindex( - self, sub: str, start: int = 0, end: int = None + self, sub: str, start: int = 0, end: Optional[int] = None ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring @@ -4419,7 +4438,7 @@ def translate(self, table: dict) -> SeriesOrIndex: ) def filter_characters( - self, table: dict, keep: bool = True, repl: str = None + self, table: dict, keep: bool = True, repl: Optional[str] = None ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges @@ -4870,7 +4889,7 @@ def ngrams_tokenize( ) def replace_tokens( - self, targets, replacements, delimiter: str = None + self, targets, replacements, delimiter: Optional[str] = None ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series @@ -4955,8 +4974,8 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: str = None, - delimiter: str = None, + replacement: Optional[str] = None, + delimiter: Optional[str] = None, ) -> SeriesOrIndex: """ Remove tokens from within each string in the series that are @@ -5344,10 +5363,10 @@ class StringColumn(column.ColumnBase): def __init__( self, - mask: Buffer = None, - size: int = None, # TODO: make non-optional + mask: Optional[Buffer] = None, + size: Optional[int] = None, # TODO: make non-optional offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, children: Tuple["column.ColumnBase", ...] = (), ): dtype = cudf.api.types.dtype("object") @@ -5477,8 +5496,8 @@ def to_arrow(self) -> pa.Array: def sum( self, - skipna: bool = None, - dtype: Dtype = None, + skipna: Optional[bool] = None, + dtype: Optional[Dtype] = None, min_count: int = 0, ): result_col = self._process_for_reduction( @@ -5609,8 +5628,11 @@ def values(self) -> cupy.ndarray: raise TypeError("String Arrays is not yet implemented in cudf") def to_pandas( - self, index: pd.Index = None, nullable: bool = False, **kwargs - ) -> "pd.Series": + self, + index: Optional[pd.Index] = None, + nullable: bool = False, + **kwargs, + ) -> pd.Series: if nullable: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) pd_series = pd.Series(pandas_array, copy=False) @@ -5679,8 +5701,8 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: str = None, - dtype: Dtype = None, + method: Optional[str] = None, + dtype: Optional[Dtype] = None, ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): @@ -5828,7 +5850,6 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": def _get_cols_list(parent_obj, others): - parent_index = ( parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6838d711641..8558faa7f24 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,6 +2,7 @@ from __future__ import annotations from functools import cached_property +from typing import Optional import pandas as pd import pyarrow as pa @@ -57,7 +58,9 @@ def to_arrow(self): pa_type, len(self), buffers, children=children ) - def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + def to_pandas( + self, index: Optional[pd.Index] = None, **kwargs + ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e7979fa4d27..e1d913742ec 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -3,7 +3,7 @@ from __future__ import annotations import datetime -from typing import Any, Sequence, cast +from typing import Any, Optional, Sequence, cast import numpy as np import pandas as pd @@ -80,10 +80,10 @@ def __init__( self, data: Buffer, dtype: Dtype, - size: int = None, # TODO: make non-optional - mask: Buffer = None, + size: Optional[int] = None, # TODO: make non-optional + mask: Optional[Buffer] = None, offset: int = 0, - null_count: int = None, + null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) @@ -251,7 +251,10 @@ def time_unit(self) -> str: return self._time_unit def fillna( - self, fill_value: Any = None, method: str = None, dtype: Dtype = None + self, + fill_value: Any = None, + method: Optional[str] = None, + dtype: Optional[Dtype] = None, ) -> TimeDeltaColumn: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -313,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: unit=self.time_unit, ) - def median(self, skipna: bool = None) -> pd.Timedelta: + def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) @@ -340,9 +343,9 @@ def quantile( def sum( self, - skipna: bool = None, + skipna: Optional[bool] = None, min_count: int = 0, - dtype: Dtype = None, + dtype: Optional[Dtype] = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only @@ -356,7 +359,7 @@ def sum( def std( self, - skipna: bool = None, + skipna: Optional[bool] = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 707eda3f5e6..832d5acf2de 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -101,7 +101,7 @@ class ColumnAccessor(abc.MutableMapping): def __init__( self, - data: Union[abc.MutableMapping, ColumnAccessor] = None, + data: Union[abc.MutableMapping, ColumnAccessor, None] = None, multiindex: bool = False, level_names=None, ): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8c8f0119b3f..2aa370ac8e5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -22,7 +22,6 @@ Optional, Set, Tuple, - TypeVar, Union, ) @@ -37,6 +36,7 @@ from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing +from typing_extensions import Self import cudf import cudf.core.common @@ -105,9 +105,6 @@ _external_only_api, ) -T = TypeVar("T", bound="DataFrame") - - _cupy_nan_methods_map = { "min": "nanmin", "max": "nanmax", @@ -1306,7 +1303,7 @@ def __delitem__(self, name): self._drop_column(name) @_cudf_nvtx_annotate - def _slice(self: T, arg: slice) -> T: + def _slice(self, arg: slice) -> Self: """ _slice : slice the frame as per the arg @@ -1988,8 +1985,8 @@ def from_dict( cls, data: dict, orient: str = "columns", - dtype: Dtype = None, - columns: list = None, + dtype: Optional[Dtype] = None, + columns: Optional[list] = None, ) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -7139,13 +7136,14 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): "Keyword arguments other than `inplace` are not supported" ) - # Have to use a regex match to avoid capturing "==" - includes_assignment = re.search("[^=]=[^=]", expr) is not None + # Have to use a regex match to avoid capturing ==, >=, or <= + equals_sign_regex = "[^=><]=[^=]" + includes_assignment = re.search(equals_sign_regex, expr) is not None # Check if there were multiple statements. Filter out empty lines. statements = tuple(filter(None, expr.strip().split("\n"))) if len(statements) > 1 and any( - re.search("[^=]=[^=]", st) is None for st in statements + re.search(equals_sign_regex, st) is None for st in statements ): raise ValueError( "Multi-line expressions are only valid if all expressions " diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 230a5054a00..26b1ee971da 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -16,7 +16,6 @@ MutableMapping, Optional, Tuple, - TypeVar, Union, ) @@ -24,6 +23,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -45,8 +45,6 @@ from cudf.utils.dtypes import find_common_type from cudf.utils.utils import _array_ufunc, _cudf_nvtx_annotate -T = TypeVar("T", bound="Frame") - # TODO: It looks like Frame is missing a declaration of `copy`, need to add class Frame(BinaryOperand, Scannable): @@ -146,8 +144,8 @@ def _from_columns_like_self( return frame._copy_type_metadata(self, override_dtypes=override_dtypes) def _mimic_inplace( - self: T, result: T, inplace: bool = False - ) -> Optional[Frame]: + self, result: Self, inplace: bool = False + ) -> Optional[Self]: if inplace: for col in self._data: if col in result._data: @@ -1069,7 +1067,6 @@ def from_arrow(cls, data): # Handle dict arrays cudf_category_frame = {} if len(dict_indices): - dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) indices_columns = libcudf.interop.from_arrow(dict_indices_table) @@ -1191,11 +1188,11 @@ def _positions_from_column_names(self, column_names): ] def _copy_type_metadata( - self: T, - other: T, + self, + other: Self, *, override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, - ) -> T: + ) -> Self: """ Copy type metadata from each column of `other` to the corresponding column of `self`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0b6ea1fb646..f955c5dfbd6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1095,8 +1095,13 @@ def _normalize_aggs( columns = values._columns aggs_per_column = (aggs,) * len(columns) + # is_list_like performs type narrowing but type-checkers don't + # know it. One could add a TypeGuard annotation to + # is_list_like (see PEP647), but that is less useful than it + # seems because unlike the builtin narrowings it only performs + # narrowing in the positive case. normalized_aggs = [ - list(agg) if is_list_like(agg) else [agg] + list(agg) if is_list_like(agg) else [agg] # type: ignore for agg in aggs_per_column ] return column_names, columns, normalized_aggs diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 281290e1788..f5dc8298bd2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -15,7 +15,6 @@ Optional, Tuple, Type, - TypeVar, Union, ) @@ -65,8 +64,6 @@ ) from cudf.utils.utils import _cudf_nvtx_annotate, search_range -T = TypeVar("T", bound="Frame") - def _lexsorted_equal_range( idx: Union[GenericIndex, cudf.MultiIndex], @@ -1048,7 +1045,7 @@ def _from_data( def _binaryop( self, - other: T, + other: Frame, op: str, fill_value: Any = None, *args, @@ -2601,7 +2598,6 @@ def __init__( copy=False, name=None, ): - if freq is not None: raise NotImplementedError("freq is not yet supported") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 95931af038c..3c4d8d84c34 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -27,6 +27,7 @@ import cupy as cp import numpy as np import pandas as pd +from typing_extensions import Self import cudf import cudf._lib as libcudf @@ -224,8 +225,6 @@ def __init__(self, frame): _LocIndexerClass = TypeVar("_LocIndexerClass", bound="_FrameIndexer") _IlocIndexerClass = TypeVar("_IlocIndexerClass", bound="_FrameIndexer") -T = TypeVar("T", bound="IndexedFrame") - class IndexedFrame(Frame): """A frame containing an index. @@ -357,8 +356,8 @@ def _from_columns_like_self( ) def _mimic_inplace( - self: T, result: T, inplace: bool = False - ) -> Optional[Frame]: + self, result: Self, inplace: bool = False + ) -> Optional[Self]: if inplace: self._index = result._index return super()._mimic_inplace(result, inplace) @@ -441,7 +440,7 @@ def _scan(self, op, axis=None, skipna=True): results[name] = getattr(result_col, op)() return self._from_data(results, self._index) - def _check_data_index_length_match(self: T) -> None: + def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the # data is not empty. This is a helper for the constructor. if self._data.nrows > 0 and self._data.nrows != len(self._index): @@ -450,7 +449,7 @@ def _check_data_index_length_match(self: T) -> None: f"match length of index ({len(self._index)})" ) - def copy(self: T, deep: bool = True) -> T: + def copy(self, deep: bool = True) -> Self: """Make a copy of this object's indices and data. When ``deep=True`` (default), a new object will be created with a @@ -918,12 +917,12 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) def _copy_type_metadata( - self: T, - other: T, + self, + other: Self, include_index: bool = True, *, override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, - ) -> T: + ) -> Self: """ Copy type metadata from each column of `other` to the corresponding column of `self`. @@ -2322,12 +2321,12 @@ def _n_largest_or_smallest(self, largest, n, columns, keep): raise ValueError('keep must be either "first", "last"') def _align_to_index( - self: T, + self, index: ColumnLike, how: str = "outer", sort: bool = True, allow_non_unique: bool = False, - ) -> T: + ) -> Self: index = cudf.core.index.as_index(index) if self.index.equals(index): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4a9bc89fa34..edabdb34435 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -101,7 +101,6 @@ def __init__( name=None, **kwargs, ): - if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") if name is not None: @@ -811,7 +810,6 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): @_cudf_nvtx_annotate def _index_and_downcast(self, result, index, index_key): - if isinstance(index_key, (numbers.Number, slice)): index_key = [index_key] if ( @@ -1069,7 +1067,6 @@ def _is_interval(self): @classmethod @_cudf_nvtx_annotate def _concat(cls, objs): - source_data = [o.to_frame(index=False) for o in objs] # TODO: Verify if this is really necessary or if we can rely on diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index c4128621148..5f07c57fc80 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -4,7 +4,7 @@ from __future__ import annotations import warnings -from typing import Any, Dict, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, Optional, Tuple, Union import cupy import numpy as np @@ -20,8 +20,6 @@ from cudf.core.frame import Frame from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate -T = TypeVar("T", bound="Frame") - class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. @@ -140,7 +138,6 @@ def to_numpy( return super().to_numpy(dtype, copy, na_value).flatten() def tolist(self): # noqa: D102 - raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 92ef49e92d9..0ee9f511061 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,15 +1,16 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import math import re import warnings -from typing import Sequence, Type, TypeVar, Union +from typing import Sequence, Union import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset from pandas.core.tools.datetimes import _unit_map +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -373,9 +374,6 @@ def get_units(value): return value -_T = TypeVar("_T", bound="DateOffset") - - class DateOffset: """ An object used for binary ops where calendrical arithmetic @@ -647,7 +645,7 @@ def __repr__(self): return repr_str @classmethod - def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: + def _from_freqstr(cls, freqstr: str) -> Self: """ Parse a string and return a DateOffset object expects strings of the form 3D, 25W, 10ms, 42ns, etc. @@ -669,9 +667,9 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: @classmethod def _from_pandas_ticks_or_weeks( - cls: Type[_T], + cls, tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], - ) -> _T: + ) -> Self: return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) def _maybe_as_fast_pandas_offset(self): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b3c8468c119..918bd995ed1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9794,6 +9794,7 @@ def df_eval(request): ("a / b", float), ("a * b", int), ("a > b", int), + ("a >= b", int), ("a > b > c", int), ("a > b < c", int), ("a & b", int), @@ -9835,7 +9836,7 @@ def test_dataframe_eval(df_eval, expr, dtype): assert_eq(expect, got, check_names=False) # Test inplace - if re.search("[^=]=[^=]", expr) is not None: + if re.search("[^=><]=[^=]", expr) is not None: pdf_eval = df_eval.to_pandas() pdf_eval.eval(expr, inplace=True) df_eval.eval(expr, inplace=True) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 33095761fde..7fcad5df9f1 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import datetime import decimal @@ -1896,12 +1896,10 @@ def test_reader_empty_stripe(datadir, fname): assert_eq(expected, got) -@pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/11890", raises=RuntimeError -) -def test_reader_unsupported_offsets(): - # needs enough data for more than one row group - expected = cudf.DataFrame({"str": ["*"] * 10001}, dtype="string") +# needs enough data for multiple row groups +@pytest.mark.parametrize("data", [["*"] * 10001, ["**", None] * 5001]) +def test_reader_row_index_order(data): + expected = cudf.DataFrame({"str": data}, dtype="string") buffer = BytesIO() expected.to_pandas().to_orc(buffer) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index c866e064366..12e832ba23b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -820,6 +820,17 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): assert_eq(expect, got) +def test_string_contains_case(ps_gs): + ps, gs = ps_gs + with pytest.raises(NotImplementedError): + gs.str.contains("A", case=False) + expected = ps.str.contains("A", regex=False, case=False) + got = gs.str.contains("A", regex=False, case=False) + assert_eq(expected, got) + got = gs.str.contains("a", regex=False, case=False) + assert_eq(expected, got) + + @pytest.mark.parametrize( "pat,esc,expect", [ diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index f19c373150d..67fc7215fc7 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -30,28 +30,31 @@ class CudfEngine(ArrowDatasetEngine): - @staticmethod - def read_metadata(*args, **kwargs): - meta, stats, parts, index = ArrowDatasetEngine.read_metadata( - *args, **kwargs + @classmethod + def _create_dd_meta(cls, dataset_info, **kwargs): + # Start with pandas-version of meta + meta_pd = super()._create_dd_meta(dataset_info, **kwargs) + + # Convert to cudf + meta_cudf = cudf.from_pandas(meta_pd) + + # Re-set "object" dtypes to align with pa schema + kwargs = dataset_info.get("kwargs", {}) + set_object_dtypes_from_pa_schema( + meta_cudf, + kwargs.get("schema", None), ) - new_meta = cudf.from_pandas(meta) - if parts: - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema( - new_meta, - parts[0].get("common_kwargs", {}).get("schema", None), - ) # If `strings_to_categorical==True`, convert objects to int32 strings_to_cats = kwargs.get("strings_to_categorical", False) - for col in new_meta._data.names: + for col in meta_cudf._data.names: if ( - isinstance(new_meta._data[col], cudf.core.column.StringColumn) + isinstance(meta_cudf._data[col], cudf.core.column.StringColumn) and strings_to_cats ): - new_meta._data[col] = new_meta._data[col].astype("int32") - return (new_meta, stats, parts, index) + meta_cudf._data[col] = meta_cudf._data[col].astype("int32") + + return meta_cudf @classmethod def multi_support(cls):