From 789cbfdd69648fd7ec553922e64accb763ca3c57 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 30 May 2024 15:02:37 -0400 Subject: [PATCH 01/17] Use offsetalator in nvtext::tokenize_with_vocabulary (#15878) Updates the `token_counts_fn` kernel in the `nvtext::tokenize_with_vocabulary` to use the offsetalator instead of hardcoded `size_type` for accessing strings offsets. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15878 --- cpp/src/text/vocabulary_tokenize.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 8913ce22da8..f012f7ce09a 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -240,10 +240,10 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, return; } - auto const offsets = - d_strings.child(cudf::strings_column_view::offsets_column_index).data(); - auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const chars_begin = d_strings.data() + offsets[d_strings.offset()]; + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = offsets_itr[str_idx + d_strings.offset()] - offsets_itr[d_strings.offset()]; + auto const chars_begin = d_strings.data() + offsets_itr[d_strings.offset()]; auto const begin = d_str.data(); auto const end = begin + d_str.size_bytes(); From 476db9fbb4a9969ea7406b916cead38990097fb9 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 30 May 2024 23:42:51 -0500 Subject: [PATCH 02/17] Fix JSON parsing memory corruption - Fix Mixed types nested children removal (#15798) Fixes https://github.com/rapidsai/cudf/issues/15750 The references of deleted child columns are not removed, which caused segfault, and also memory errors (found with valgrind). This fix removes references of child columns and deletes them recursively. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15798 --- cpp/src/io/json/json_column.cu | 17 ++++++++++++++-- cpp/tests/io/json_test.cpp | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 631f8adbd6d..3e587768b11 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -594,8 +594,7 @@ void make_device_json_column(device_span input, col.validity = cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); col.type = json_col_t::StringColumn; - col.child_columns.clear(); // their references should be deleted too. 
- col.column_order.clear(); + // destroy references of all child columns after this step, by calling remove_child_columns }; path_from_tree tree_path{column_categories, @@ -628,6 +627,19 @@ void make_device_json_column(device_span input, std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + auto name_and_parent_index = [&is_array_of_arrays, &row_array_parent_col_id, &column_parent_ids, @@ -721,6 +733,7 @@ void make_device_json_column(device_span input, auto& col = columns.at(old_col_id).get(); if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); // all its children (which are already inserted) are ignored later. } col.forced_as_string_column = true; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9d766e80094..5d790e73246 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) } } +TEST_F(JsonReaderTest, JSONMixedTypeChildren) +{ + std::string const json_str = R"( +{ "Root": { "Key": [ { "EE": "A" } ] } } +{ "Root": { "Key": { } } } +{ "Root": { "Key": [{ "YY": 1}] } } +)"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); +} + CUDF_TEST_PROGRAM_MAIN() From dec0354b1ac2af981d4e8f13aceb45365838a1d8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 31 May 2024 08:38:57 -0400 Subject: [PATCH 03/17] Fix multi-replace target count logic for large strings (#15807) Replaces `thrust::count_if` with raw kernel counter to handle large strings (int64 offsets) and > 2GB strings columns. 
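For reference, the counting pattern used by the new kernel looks like the minimal standalone sketch below (not the cudf kernel itself): each thread scans a fixed range of bytes, `cub::BlockReduce` sums the per-thread counts in shared memory, and one thread per block adds the block total into a single `int64_t` result. The `count_matches` kernel, the byte buffer, and the single-byte match are illustrative stand-ins for the patch's `count_targets` kernel and its `fn.has_target()` check.

```
#include <cub/block/block_reduce.cuh>
#include <cuda/atomic>
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int block_size       = 512;  // threads per block, as in the patch
constexpr int bytes_per_thread = 4;    // bytes scanned per thread, as in the patch

// Stand-in predicate: count occurrences of a single byte in a large buffer
__global__ void count_matches(char const* d_chars, int64_t chars_bytes, char target, int64_t* d_output)
{
  auto const tid      = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto const byte_idx = tid * bytes_per_thread;

  using block_reduce = cub::BlockReduce<int64_t, block_size>;
  __shared__ typename block_reduce::TempStorage temp_storage;

  // each thread scans a small, fixed byte range (indices stay int64_t throughout)
  int64_t count = 0;
  for (auto i = byte_idx; (i < byte_idx + bytes_per_thread) && (i < chars_bytes); ++i) {
    count += (d_chars[i] == target);
  }
  // one shared-memory reduction per block, then a single atomic add per block
  auto const total = block_reduce(temp_storage).Sum(count);
  if (threadIdx.x == 0 && total > 0) {
    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
    ref.fetch_add(total, cuda::std::memory_order_relaxed);
  }
}

int main()
{
  std::vector<char> h_chars(1 << 20, 'a');
  h_chars[3] = h_chars[999999] = 'x';

  char* d_chars{};
  int64_t* d_count{};
  cudaMalloc(&d_chars, h_chars.size());
  cudaMalloc(&d_count, sizeof(int64_t));
  cudaMemcpy(d_chars, h_chars.data(), h_chars.size(), cudaMemcpyHostToDevice);
  cudaMemset(d_count, 0, sizeof(int64_t));

  auto const chars_bytes = static_cast<int64_t>(h_chars.size());
  auto const num_threads = (chars_bytes + bytes_per_thread - 1) / bytes_per_thread;
  auto const num_blocks  = static_cast<unsigned int>((num_threads + block_size - 1) / block_size);
  count_matches<<<num_blocks, block_size>>>(d_chars, chars_bytes, 'x', d_count);

  int64_t h_count = 0;
  cudaMemcpy(&h_count, d_count, sizeof(int64_t), cudaMemcpyDeviceToHost);
  std::printf("matches: %lld\n", static_cast<long long>(h_count));  // prints 2

  cudaFree(d_chars);
  cudaFree(d_count);
  return 0;
}
```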
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15807 --- cpp/src/strings/replace/multi.cu | 49 ++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 9025234aa52..f4110707c79 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -30,23 +30,17 @@ #include #include #include -#include #include #include #include #include -#include #include -#include #include #include #include #include -#include -#include -#include #include namespace cudf { @@ -262,6 +256,38 @@ struct replace_multi_parallel_fn { device_span d_replacements; }; +constexpr int64_t block_size = 512; // number of threads per block +constexpr size_type bytes_per_thread = 4; // bytes processed per thread + +/** + * @brief Count the number of targets in a strings column + * + * @param fn Functor containing has_target() function + * @param chars_bytes Number of bytes in the strings column + * @param d_output Result of the count + */ +CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes, int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += fn.has_target(i, chars_bytes); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + /** * @brief Used by the copy-if function to produce target_pair objects * @@ -308,12 +334,11 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - auto target_count = thrust::count_if( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); }); - + rmm::device_scalar d_count(0, stream); + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_targets<<>>(fn, chars_bytes, d_count.data()); + auto target_count = d_count.value(stream); // Create a vector of every target position in the chars column. // These may also include overlapping targets which will be resolved later. auto targets_positions = rmm::device_uvector(target_count, stream); From e7be142b2bfd4f08c18d0020a959e162f01d819e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 31 May 2024 08:14:55 -0700 Subject: [PATCH 04/17] Migrate round to pylibcudf (#15863) xref #15162 Migrate round.pxd to use pylibcudf APIs. 
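For reference, a minimal usage sketch of the new binding, mirroring the test added here (the `cudf._lib.pylibcudf` module path reflects the in-tree layout at the time of this PR; the column values are only illustrative):

```
import pyarrow as pa

import cudf._lib.pylibcudf as plc

col = plc.interop.from_arrow(pa.array([2.5, 2.49, 1.6, -1.5], type=pa.float64()))
# Round to 0 decimal places with banker's rounding: ties go to the even neighbor,
# so 2.5 -> 2.0 and -1.5 -> -2.0.
got = plc.round.round(col, 0, plc.round.RoundingMethod.HALF_EVEN)
print(plc.interop.to_arrow(got))
```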
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15863 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/round.rst | 6 +++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + .../_lib/pylibcudf/libcudf/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/libcudf/round.pxd | 6 +-- .../cudf/_lib/pylibcudf/libcudf/round.pyx | 0 python/cudf/cudf/_lib/pylibcudf/round.pxd | 13 +++++ python/cudf/cudf/_lib/pylibcudf/round.pyx | 54 +++++++++++++++++++ python/cudf/cudf/_lib/round.pyx | 36 +++++-------- .../cudf/cudf/pylibcudf_tests/test_round.py | 38 +++++++++++++ 12 files changed, 134 insertions(+), 27 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_round.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1c1b37e2c37..26875ce7d12 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -22,6 +22,7 @@ This page provides API documentation for pylibcudf. reduce reshape rolling + round scalar search stream_compaction diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst new file mode 100644 index 00000000000..c97fda12301 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst @@ -0,0 +1,6 @@ +===== +round +===== + +.. automodule:: cudf._lib.pylibcudf.round + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d01671e84f..eff14ad549b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -29,6 +29,7 @@ set(cython_sources replace.pyx reshape.pyx rolling.pyx + round.pyx scalar.pyx search.pyx stream_compaction.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 91c3fdf5602..4f77f8cbaef 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -15,6 +15,7 @@ from . 
cimport ( replace, reshape, rolling, + round, search, sorting, stream_compaction, @@ -48,6 +49,7 @@ __all__ = [ "reduce", "replace", "rolling", + "round", "search", "stream_compaction", "strings", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index fcdc4992f00..048b62b6013 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -15,6 +15,7 @@ replace, reshape, rolling, + round, search, sorting, stream_compaction, @@ -48,6 +49,7 @@ "reduce", "replace", "rolling", + "round", "search", "stream_compaction", "strings", diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 8a6ce6a5187..ac56d42dda8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx stream_compaction.pyx types.pyx unary.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd index 06ff42485ea..027c4634c9f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd @@ -9,9 +9,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/round.hpp" namespace "cudf" nogil: - ctypedef enum rounding_method "cudf::rounding_method": - HALF_UP "cudf::rounding_method::HALF_UP" - HALF_EVEN "cudf::rounding_method::HALF_EVEN" + cpdef enum class rounding_method(int32_t): + HALF_UP + HALF_EVEN cdef unique_ptr[column] round ( const column_view& input, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/round.pxd new file mode 100644 index 00000000000..ccb64fc2847 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/round.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t + +from cudf._lib.pylibcudf.libcudf.round cimport rounding_method + +from .column cimport Column + + +cpdef Column round( + Column source, + int32_t decimal_places = *, + rounding_method round_method = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/round.pyx new file mode 100644 index 00000000000..cfcc2aafbb8 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/round.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.round cimport ( + round as cpp_round, + rounding_method, +) + +from cudf._lib.pylibcudf.libcudf.round import \ + rounding_method as RoundingMethod # no-cython-lint + +from cudf._lib.pylibcudf.libcudf.column.column cimport column + +from .column cimport Column + + +cpdef Column round( + Column source, + int32_t decimal_places = 0, + rounding_method round_method = rounding_method.HALF_UP +): + """Rounds all the values in a column to the specified number of decimal places. + + For details, see :cpp:func:`round`. 
+ + Parameters + ---------- + source : Column + The Column for which to round values. + decimal_places: int32_t, optional + The number of decimal places to round to (default 0) + round_method: rounding_method, optional + The method by which to round each value. + Can be one of { RoundingMethod.HALF_UP, RoundingMethod.HALF_EVEN } + (default rounding_method.HALF_UP) + + Returns + ------- + pylibcudf.Column + A Column with values rounded + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_round( + source.view(), + decimal_places, + round_method + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx index c1c36dd8854..f8ad57947c8 100644 --- a/python/cudf/cudf/_lib/round.pyx +++ b/python/cudf/cudf/_lib/round.pyx @@ -2,16 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.round cimport ( - round as cpp_round, - rounding_method as cpp_rounding_method, -) + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.round import RoundingMethod @acquire_spill_lock() @@ -31,19 +25,15 @@ def round(Column input_col, int decimal_places=0, how="half_even"): if how not in {"half_even", "half_up"}: raise ValueError("'how' must be either 'half_even' or 'half_up'") - cdef column_view input_col_view = input_col.view() - cdef unique_ptr[column] c_result - cdef cpp_rounding_method c_how = ( - cpp_rounding_method.HALF_EVEN if how == "half_even" - else cpp_rounding_method.HALF_UP + how = ( + RoundingMethod.HALF_EVEN if how == "half_even" + else RoundingMethod.HALF_UP ) - with nogil: - c_result = move( - cpp_round( - input_col_view, - decimal_places, - c_how - ) - ) - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.round.round( + input_col.to_pylibcudf(mode="read"), + decimal_places, + how + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py new file mode 100644 index 00000000000..a234860477f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_round.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(params=[False, True]) +def nullable(request): + return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def column(request, nullable): + values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5] + typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param] + if nullable: + values[2] = None + return plc.interop.from_arrow(pa.array(values, type=typ)) + + +@pytest.mark.parametrize( + "round_mode", ["half_towards_infinity", "half_to_even"] +) +@pytest.mark.parametrize("decimals", [0, 1, 2, 5]) +def test_round(column, round_mode, decimals): + method = { + "half_towards_infinity": plc.round.RoundingMethod.HALF_UP, + "half_to_even": plc.round.RoundingMethod.HALF_EVEN, + }[round_mode] + got = plc.round.round(column, decimals, method) + expect = pa.compute.round( + plc.interop.to_arrow(column), decimals, round_mode + ) + + assert_column_eq(expect, got) From 7949a9cf6911066663e2245a4bb624e0f1847b06 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 31 May 2024 14:54:18 -0400 Subject: [PATCH 05/17] Use offsetalator in orc rowgroup_char_counts_kernel (#15891) Replaces hardcoded `size_type` for accessing strings offsets data with the offsetalator to compute the number of characters in a group in `cudf::io::orc::gpu::rowgroup_char_counts_kernel` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15891 --- cpp/src/io/orc/dict_enc.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5971482f80c..5181c4a1c0e 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -16,6 +16,7 @@ #include "orc_gpu.hpp" +#include #include #include #include @@ -43,11 +44,12 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset(); auto const num_rows = rowgroup_bounds[row_group_idx][col_idx].size(); - auto const& offsets = str_col.child(strings_column_view::offsets_column_index); + auto const& offsets = str_col.child(strings_column_view::offsets_column_index); + auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); char_counts[str_col_idx][row_group_idx] = (num_rows == 0) ? 0 - : offsets.element(start_row + num_rows) - offsets.element(start_row); + : static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]); } void rowgroup_char_counts(device_2dspan counts, From 1354abdb7a4f9eb58bfc6e359c49d0baabacb4e1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 31 May 2024 16:03:09 -0400 Subject: [PATCH 06/17] Fix url-decode benchmark to use offsetalator (#15871) Fixes the logic for generating URLs in the url-decoder benchmark to use the offsetalator instead of hardcoding `size_type`. This will allow benchmarking with large strings column in the future. 
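For readers new to the offsetalator: it hides whether the offsets child column is INT32 or INT64 behind one device iterator that always yields `int64_t`, so the same code serves regular and large strings columns. Below is a hedged sketch of the access pattern; `string_byte_lengths` is a made-up helper (not a cudf API), it assumes an un-sliced column, uses the test column wrapper for brevity, and relies on `cudf::detail` internals that can change between releases.

```
#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/functional.h>
#include <thrust/transform.h>

// Per-row byte lengths computed without assuming int32 offsets
rmm::device_uvector<int64_t> string_byte_lengths(cudf::strings_column_view const& scv,
                                                 rmm::cuda_stream_view stream)
{
  auto const itr = cudf::detail::offsetalator_factory::make_input_iterator(scv.offsets());
  rmm::device_uvector<int64_t> lengths(scv.size(), stream);
  thrust::transform(rmm::exec_policy(stream),
                    itr + 1,                // end offsets
                    itr + scv.size() + 1,
                    itr,                    // begin offsets
                    lengths.begin(),
                    thrust::minus<int64_t>{});
  return lengths;
}

int main()
{
  cudf::test::strings_column_wrapper input({"ab", "", "cdef"});
  auto const scv     = cudf::strings_column_view(input);
  auto const lengths = string_byte_lengths(scv, cudf::get_default_stream());
  // lengths now holds {2, 0, 4} on the device
  return 0;
}
```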
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15871 --- cpp/benchmarks/string/url_decode.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index b3aeb69e5ea..7720e585023 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -43,7 +44,7 @@ struct url_string_generator { { } - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(thrust::tuple str_begin_end) { auto begin = thrust::get<0>(str_begin_end); auto end = thrust::get<1>(str_begin_end); @@ -69,11 +70,11 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... auto chars_data = static_cast(result_col->mutable_view().head()); auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); + auto offset_itr = cudf::detail::offsetalator_factory::make_input_iterator(offset_col); auto engine = thrust::default_random_engine{}; thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offset_col.begin(), - offset_col.begin() + 1), + thrust::make_zip_iterator(offset_itr, offset_itr + 1), num_rows, url_string_generator{chars_data, esc_seq_chance, engine}); return result_col; From e66f4f50d045da87125430d13e6b862dc845845c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Jun 2024 10:14:58 -0700 Subject: [PATCH 07/17] Add an option to run cuIO benchmarks with pinned buffers as input (#15830) Adds `io_type::PINNED_BUFFER`, which allows cuIO benchmarks to use a pinned buffer as an input. The output is still a `std::vector` in this case, same as with `io_type::HOST_BUFFER`. Also stops the used of `cudf::io::io_type` in benchmarks, to allow benchmark-specific IO types, such as this one. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15830 --- cpp/benchmarks/io/csv/csv_reader_input.cpp | 16 +++++-------- cpp/benchmarks/io/csv/csv_writer.cpp | 8 +++---- cpp/benchmarks/io/cuio_common.cpp | 23 ++++++++++++------- cpp/benchmarks/io/cuio_common.hpp | 14 ++++++++--- cpp/benchmarks/io/json/json_reader_input.cpp | 14 +++++------ cpp/benchmarks/io/json/json_writer.cpp | 9 ++++---- cpp/benchmarks/io/nvbench_helpers.hpp | 11 +++++---- cpp/benchmarks/io/orc/orc_reader_input.cpp | 16 ++++++------- cpp/benchmarks/io/orc/orc_writer.cpp | 8 +++---- .../io/parquet/parquet_reader_multithread.cpp | 18 +++++++++++---- cpp/benchmarks/io/parquet/parquet_writer.cpp | 8 +++---- 11 files changed, 77 insertions(+), 68 deletions(-) diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 2ad3bc36f59..a93bc05ac58 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,9 +28,7 @@ constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; template -void csv_read_common(DataType const& data_types, - cudf::io::io_type const& source_type, - nvbench::state& state) +void csv_read_common(DataType const& data_types, io_type const& source_type, nvbench::state& state) { auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}); @@ -66,7 +64,7 @@ void csv_read_common(DataType const& data_types, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } -template +template void BM_csv_read_input(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -76,7 +74,7 @@ void BM_csv_read_input(nvbench::state& state, csv_read_common(d_type, source_type, state); } -template +template void BM_csv_read_io(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -97,12 +95,10 @@ using d_type_list = nvbench::enum_type_list; -using io_list = - nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_csv_read_input, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("csv_read_data_type") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 8ff07be1531..7ba43850cf2 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,7 +28,7 @@ constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; -template +template void BM_csv_write_dtype_io(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -112,9 +112,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_csv_write_dtype_io, NVBENCH_TYPE_AXES(d_type_list, io_list)) .set_name("csv_write_dtype_io") diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 3a61e5f1e7b..37ced8ea703 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -52,6 +52,11 @@ cudf::io::source_info cuio_source_sink_pair::make_source_info() switch (type) { case io_type::FILEPATH: return cudf::io::source_info(file_name); case io_type::HOST_BUFFER: return cudf::io::source_info(h_buffer.data(), h_buffer.size()); + case io_type::PINNED_BUFFER: { + pinned_buffer.resize(h_buffer.size()); + std::copy(h_buffer.begin(), h_buffer.end(), pinned_buffer.begin()); + return cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + } case io_type::DEVICE_BUFFER: { // TODO: make cuio_source_sink_pair stream-friendly and avoid implicit use of the default // stream @@ -71,7 +76,8 @@ cudf::io::sink_info cuio_source_sink_pair::make_sink_info() switch (type) { case io_type::VOID: return cudf::io::sink_info(void_sink.get()); case io_type::FILEPATH: return cudf::io::sink_info(file_name); - case io_type::HOST_BUFFER: [[fallthrough]]; + case io_type::HOST_BUFFER: + case io_type::PINNED_BUFFER: case io_type::DEVICE_BUFFER: return cudf::io::sink_info(&h_buffer); default: CUDF_FAIL("invalid output type"); } @@ -84,7 +90,8 @@ size_t cuio_source_sink_pair::size() case io_type::FILEPATH: return static_cast( std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg()); - case io_type::HOST_BUFFER: [[fallthrough]]; + case io_type::HOST_BUFFER: + case io_type::PINNED_BUFFER: case io_type::DEVICE_BUFFER: return h_buffer.size(); default: CUDF_FAIL("invalid output type"); } @@ -204,13 +211,13 @@ void try_drop_l3_cache() "Failed to execute the drop cache command"); } -cudf::io::io_type retrieve_io_type_enum(std::string_view io_string) +io_type retrieve_io_type_enum(std::string_view io_string) { - if (io_string == "FILEPATH") { return cudf::io::io_type::FILEPATH; } - if (io_string == "HOST_BUFFER") { return cudf::io::io_type::HOST_BUFFER; } - if (io_string == "DEVICE_BUFFER") { return cudf::io::io_type::DEVICE_BUFFER; } - if (io_string == "VOID") { return cudf::io::io_type::VOID; } - if (io_string == "USER_IMPLEMENTED") { return cudf::io::io_type::USER_IMPLEMENTED; } + if (io_string == "FILEPATH") { return io_type::FILEPATH; } + if (io_string == "HOST_BUFFER") { return io_type::HOST_BUFFER; } + if (io_string == "PINNED_BUFFER") { return io_type::PINNED_BUFFER; } + if (io_string == "DEVICE_BUFFER") { return io_type::DEVICE_BUFFER; } + if (io_string == "VOID") { return io_type::VOID; } CUDF_FAIL("Unsupported io_type."); } diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 6e0b32219ce..d4f39a5f243 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,13 +18,20 @@ #include +#include #include #include -#include #include -using cudf::io::io_type; +// IO types supported in the benchmarks +enum class io_type { + FILEPATH, // Input/output are both files + HOST_BUFFER, // Input/output are both host buffers (pageable) + 
PINNED_BUFFER, // Input is a pinned host buffer, output is a host buffer (pageable) + DEVICE_BUFFER, // Input is a device buffer, output is a host buffer (pageable) + VOID +}; std::string random_file_in_dir(std::string const& dir_path); @@ -72,6 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; + cudf::detail::pinned_host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; @@ -144,7 +152,7 @@ void try_drop_l3_cache(); * * @return The io_type enum value */ -cudf::io::io_type retrieve_io_type_enum(std::string_view io_string); +io_type retrieve_io_type_enum(std::string_view io_string); /** * @brief Convert a string to the corresponding compression_type enum value. diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index aa73dacdbc5..4366790f208 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,7 +70,7 @@ cudf::size_type json_write_bm_data(cudf::io::sink_info sink, return view.num_rows(); } -template +template void BM_json_read_io(nvbench::state& state, nvbench::type_list>) { cuio_source_sink_pair source_sink(IO); @@ -87,7 +87,7 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list +template void BM_json_read_data_type( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -107,16 +107,14 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_json_read_data_type, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("json_read_data_type") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/json/json_writer.cpp b/cpp/benchmarks/io/json/json_writer.cpp index ae6bb81ff93..444457bbf0d 100644 --- a/cpp/benchmarks/io/json/json_writer.cpp +++ b/cpp/benchmarks/io/json/json_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ void json_write_common(cudf::io::json_writer_options const& write_opts, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } -template +template void BM_json_write_io(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -114,9 +114,8 @@ void BM_json_writer_options(nvbench::state& state) json_write_common(write_opts, source_sink, data_size, state); } -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_json_write_io, NVBENCH_TYPE_AXES(io_list)) .set_name("json_write_io") diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index 8b79912c7ee..1e3ab2b7b4f 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -56,13 +56,14 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( [](auto) { return std::string{}; }) NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - cudf::io::io_type, + io_type, [](auto value) { switch (value) { - case cudf::io::io_type::FILEPATH: return "FILEPATH"; - case cudf::io::io_type::HOST_BUFFER: return "HOST_BUFFER"; - case cudf::io::io_type::DEVICE_BUFFER: return "DEVICE_BUFFER"; - case cudf::io::io_type::VOID: return "VOID"; + case io_type::FILEPATH: return "FILEPATH"; + case io_type::HOST_BUFFER: return "HOST_BUFFER"; + case io_type::PINNED_BUFFER: return "PINNED_BUFFER"; + case io_type::DEVICE_BUFFER: return "DEVICE_BUFFER"; + case io_type::VOID: return "VOID"; default: return "Unknown"; } }, diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index b7c214a8374..cafd3cc5c39 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -87,7 +87,7 @@ void orc_read_common(cudf::size_type num_rows_to_read, } // namespace -template +template void BM_orc_read_data(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -112,7 +112,7 @@ void BM_orc_read_data(nvbench::state& state, orc_read_common(num_rows_written, source_sink, state); } -template +template void orc_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), @@ -150,7 +150,7 @@ void orc_read_io_compression(nvbench::state& state) orc_read_common(num_rows_written, source_sink, state); } -template +template void BM_orc_read_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -163,7 +163,7 @@ void BM_orc_chunked_read_io_compression(nvbench::state& state, nvbench::type_list>) { // Only run benchmark using HOST_BUFFER IO. - return orc_read_io_compression(state); + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_data, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("orc_read_decode") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index bb373297222..b795f3e3164 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list +template void BM_orc_write_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -183,9 +183,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index bd80c4e0e88..a67d1932951 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -62,7 +62,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{cudf::io::io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -96,6 +96,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::detail::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); auto mem_stats_logger = cudf::memory_stats_logger(); @@ -104,9 +109,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, [&](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; - auto& source_sink = source_sink_vector[index]; cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + cudf::io::parquet_reader_options::builder(source_info_vector[index]); cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; @@ -174,6 +178,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); cudf::detail::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); auto mem_stats_logger = cudf::memory_stats_logger(); @@ -183,9 +192,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, [&](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; - auto& source_sink = source_sink_vector[index]; cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + cudf::io::parquet_reader_options::builder(source_info_vector[index]); // divide chunk limits by number of threads so the number of chunks produced is the // same for all cases. this seems better than the alternative, which is to keep the // limits the same. 
if we do that, as the number of threads goes up, the number of diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 13b396ea267..46d2927a92b 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list +template void BM_parq_write_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -188,9 +188,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; From ba1299dfc03e87f11cf021a67d01531ed6afd7f7 Mon Sep 17 00:00:00 2001 From: Brian Tepera Date: Mon, 3 Jun 2024 13:45:09 -0400 Subject: [PATCH 08/17] Implement day_name and month_name to match pandas (#15479) This PR implements the `month_name` and `day_name` datetime methods, matching the equivalent [month_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.month_name.html) and [day_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.day_name.html) methods from pandas. Currently this is implemented just for English locale, though it could be expanded to include additional languages in the future. Closes #12407 Authors: - Brian Tepera (https://github.com/btepera) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15479 --- python/cudf/cudf/core/column/datetime.py | 29 ++++++++++ python/cudf/cudf/core/index.py | 39 +++++++++++++ python/cudf/cudf/core/series.py | 72 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_datetime.py | 39 +++++++++++++ 4 files changed, 179 insertions(+) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d92a3a00641..27f31c8f500 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,6 +2,7 @@ from __future__ import annotations +import calendar import datetime import functools import locale @@ -339,6 +340,34 @@ def element_indexing(self, index: int): def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) + def _get_field_names( + self, + field: Literal["month", "weekday"], + labels: list[str], + locale: str | None = None, + ) -> ColumnBase: + if locale is not None: + raise NotImplementedError( + "Setting a locale is currently not supported. " + "Results will be returned in your current locale." 
+ ) + col_labels = as_column(labels) + indices = self.get_dt_field(field) + has_nulls = indices.has_nulls() + if has_nulls: + indices = indices.fillna(len(col_labels)) + return col_labels.take(indices, nullify=True, check_bounds=has_nulls) + + def get_day_names(self, locale: str | None = None) -> ColumnBase: + return self._get_field_names( + "weekday", list(calendar.day_name), locale=locale + ) + + def get_month_names(self, locale: str | None = None) -> ColumnBase: + return self._get_field_names( + "month", list(calendar.month_name), locale=locale + ) + def ceil(self, freq: str) -> ColumnBase: return libcudf.datetime.ceil_datetime(self, freq) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 49bfb150f60..2a75b374a1e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2120,6 +2120,45 @@ def quarter(self): res = extract_quarter(self._values) return Index(res, dtype="int8") + @_cudf_nvtx_annotate + def day_name(self, locale: str | None = None) -> Index: + """ + Return the day names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D") + >>> datetime_index + DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], + dtype='datetime64[ns]', freq='D') + >>> datetime_index.day_name() + Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday'], dtype='object') + """ + day_names = self._column.get_day_names(locale) + return Index._from_data({self.name: day_names}) + + @_cudf_nvtx_annotate + def month_name(self, locale: str | None = None) -> Index: + """ + Return the month names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_index = cudf.date_range("2017-12-30", periods=6, freq='W') + >>> datetime_index + DatetimeIndex(['2017-12-30', '2018-01-06', '2018-01-13', '2018-01-20', + '2018-01-27', '2018-02-03'], + dtype='datetime64[ns]', freq='7D') + >>> datetime_index.month_name() + Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') + """ + month_names = self._column.get_month_names(locale) + return Index._from_data({self.name: month_names}) + @_cudf_nvtx_annotate def isocalendar(self): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 908347e389b..a5b204ef346 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4201,6 +4201,78 @@ def quarter(self): name=self.series.name, ) + @_cudf_nvtx_annotate + def day_name(self, locale=None): + """ + Return the day names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_series = cudf.Series(cudf.date_range('2016-12-31', + ... '2017-01-08', freq='D')) + >>> datetime_series + 0 2016-12-31 + 1 2017-01-01 + 2 2017-01-02 + 3 2017-01-03 + 4 2017-01-04 + 5 2017-01-05 + 6 2017-01-06 + 7 2017-01-07 + 8 2017-01-08 + dtype: datetime64[ns] + >>> datetime_series.dt.day_name() + 0 Saturday + 1 Sunday + 2 Monday + 3 Tuesday + 4 Wednesday + 5 Thursday + 6 Friday + 7 Saturday + dtype: object + """ + day_names = self.series._column.get_day_names(locale) + return Series._from_data( + ColumnAccessor({None: day_names}), + index=self.series.index, + name=self.series.name, + ) + + @_cudf_nvtx_annotate + def month_name(self, locale: str | None = None) -> Series: + """ + Return the month names. 
Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_series = cudf.Series(cudf.date_range("2017-12-30", periods=6, freq='W')) + >>> datetime_series + 0 2017-12-30 + 1 2018-01-06 + 2 2018-01-13 + 3 2018-01-20 + 4 2018-01-27 + 5 2018-02-03 + dtype: datetime64[ns] + >>> datetime_series.dt.month_name() + 0 December + 1 January + 2 January + 3 January + 4 January + 5 February + dtype: object + """ + month_names = self.series._column.get_month_names(locale) + return Series._from_data( + ColumnAccessor({None: month_names}), + index=self.series.index, + name=self.series.name, + ) + @_cudf_nvtx_annotate def isocalendar(self): """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 46a0dcd315d..4186fff038a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2419,3 +2419,42 @@ def test_date_range_tz(): result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") assert_eq(result, expected) + + +@pytest.mark.parametrize("meth", ["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex]) +def test_day_month_name(meth, klass): + data = [ + "2020-05-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + + p_obj = klass(data, dtype="datetime64[s]") + g_obj = cudf.from_pandas(p_obj) + + if klass is pd.Series: + p_obj = p_obj.dt + g_obj = g_obj.dt + + expect = getattr(p_obj, meth)() + got = getattr(g_obj, meth)() + + assert_eq(expect, got) + + +@pytest.mark.parametrize("meth", ["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex]) +def test_day_month_name_locale_not_implemented(meth, klass): + obj = klass(cudf.date_range("2020-01-01", periods=7)) + if klass is cudf.Series: + obj = obj.dt + with pytest.raises(NotImplementedError): + getattr(obj, meth)(locale="pt_BR.utf8") From 7d5561a8c0aeb8531913d7767faca55a5ab31fa5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:29:39 -0400 Subject: [PATCH 09/17] Fix debug assert in rowgroup_char_counts_kernel (#15902) Fixes assert triggered by `OrcWriterTest.EmptyChildStringColumn` in a Debug build. ``` $ gtests/ORC_TEST --gtest_filter=OrcWriterTest.EmptyChildStringColumn Note: Google Test filter = OrcWriterTest.EmptyChildStringColumn [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from OrcWriterTest [ RUN ] OrcWriterTest.EmptyChildStringColumn /cudf/cpp/include/cudf/detail/offsets_iterator.cuh:79: cudf::detail::input_offsetalator::input_offsetalator(const void *, cudf::data_type, int): block: [0,0,0], thread: [0,0,0] Assertion `(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && "Unexpected offsets type"` failed. CUDA Error detected. cudaErrorAssert device-side assert triggered ORC_TEST: /conda/envs/rapids/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp:248: void rmm::mr::detail::stream_ordered_memory_resource::do_deallocate(void*, std::size_t, rmm::cuda_stream_view) [with PoolResource = rmm::mr::pool_memory_resource; FreeListType = rmm::mr::detail::coalescing_free_list; std::size_t = long unsigned int]: Assertion `status__ == cudaSuccess' failed. 
Aborted (core dumped) ``` Error introduced in #15891 where offsetalator wraps an offsets column in the `cudf::io::orc::gpu::rowgroup_char_counts_kernel`. But when `num_rows==0` the offsets column is `EMPTY` causing the assert to trigger. Checking the `num_rows` before accessing the offsets column fixes the issue. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15902 --- cpp/src/io/orc/dict_enc.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5181c4a1c0e..5be75350951 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -44,12 +44,13 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset(); auto const num_rows = rowgroup_bounds[row_group_idx][col_idx].size(); - auto const& offsets = str_col.child(strings_column_view::offsets_column_index); - auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); - char_counts[str_col_idx][row_group_idx] = - (num_rows == 0) - ? 0 - : static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]); + size_type char_count = 0; + if (num_rows > 0) { + auto const& offsets = str_col.child(strings_column_view::offsets_column_index); + auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + char_count = static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]); + } + char_counts[str_col_idx][row_group_idx] = char_count; } void rowgroup_char_counts(device_2dspan counts, From 4a17c451719a5d1e144b21703650bd323990e892 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:32:12 -0400 Subject: [PATCH 10/17] Rename strings multiple target replace API (#15898) Renames the multi-target overload of `cudf::strings::replace()` to `cudf::strings::replace_multiple()`. This helps with some Cython issues involving fused types and overloaded functions with the same number of arguments. Reference: https://github.com/rapidsai/cudf/issues/15855#issuecomment-2129980298 This change deprecates the old name to be removed in a future release. Also added some additional error unit tests. 
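For callers migrating, a small before/after sketch of the multi-target overload (the scalar-target `replace` overload is unchanged; the column contents are only illustrative and use the test column wrapper for brevity):

```
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/replace.hpp>
#include <cudf/strings/strings_column_view.hpp>

int main()
{
  cudf::test::strings_column_wrapper input({"the fat cat", "the thin dog"});
  cudf::test::strings_column_wrapper targets({"the ", "cat"});
  cudf::test::strings_column_wrapper repls({"", "feline"});

  auto const sv = cudf::strings_column_view(input);
  auto const tv = cudf::strings_column_view(targets);
  auto const rv = cudf::strings_column_view(repls);

  // Deprecated spelling (still compiles in 24.08, emits a deprecation warning):
  // auto result = cudf::strings::replace(sv, tv, rv);
  // New spelling of the same multi-target overload:
  auto result = cudf::strings::replace_multiple(sv, tv, rv);
  // result: ["fat feline", "thin dog"]
  return 0;
}
```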
Closes #15855 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15898 --- cpp/benchmarks/string/replace.cpp | 2 +- cpp/include/cudf/strings/detail/replace.hpp | 12 +++---- cpp/include/cudf/strings/replace.hpp | 14 +++++++- cpp/src/strings/replace/multi.cu | 23 +++++++++---- cpp/tests/json/json_tests.cpp | 2 +- cpp/tests/streams/strings/replace_test.cpp | 4 +-- cpp/tests/strings/replace_tests.cpp | 33 +++++++++++++++---- java/src/main/native/src/ColumnViewJni.cpp | 2 +- .../pylibcudf/libcudf/strings/replace.pxd | 2 +- python/cudf/cudf/_lib/strings/replace.pyx | 3 +- 10 files changed, 71 insertions(+), 26 deletions(-) diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index c8f26142193..3d9d51bfd6d 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -52,7 +52,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) case scalar: cudf::strings::replace(input, target, repl); break; case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; case multi: - cudf::strings::replace( + cudf::strings::replace_multiple( input, cudf::strings_column_view(targets), cudf::strings_column_view(repls)); break; } diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index aad89beb47e..481d00f1bce 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -39,14 +39,14 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, + * @copydoc cudf::strings::replace_multiple(strings_column_view const&, strings_column_view const&, * strings_column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ -std::unique_ptr replace(strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr replace_mutiple(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Replaces any null string entries with the given string. 
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 9525db44b69..a19aa9be0c0 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -153,7 +153,19 @@ std::unique_ptr replace_slice( * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ -std::unique_ptr replace( +std::unique_ptr replace_multiple( + strings_column_view const& input, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::strings::replace_multiple + * + * @deprecated since 24.08 + */ +[[deprecated]] std::unique_ptr replace( strings_column_view const& input, strings_column_view const& targets, strings_column_view const& repls, diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index f4110707c79..8e5c5cf60b8 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -499,11 +499,11 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input } // namespace -std::unique_ptr replace(strings_column_view const& input, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr replace_multiple(strings_column_view const& input, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), @@ -524,6 +524,17 @@ std::unique_ptr replace(strings_column_view const& input, // external API +std::unique_ptr replace_multiple(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::replace_multiple(strings, targets, repls, stream, mr); +} + +// deprecated in 24.08 std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, @@ -531,7 +542,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, stream, mr); + return detail::replace_multiple(strings, targets, repls, stream, mr); } } // namespace strings diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 6c9050becc1..e38ca6628f3 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -76,7 +76,7 @@ std::unique_ptr drop_whitespace(cudf::column_view const& col) cudf::strings_column_view strings(col); cudf::strings_column_view targets(whitespace); cudf::strings_column_view replacements(repl); - return cudf::strings::replace(strings, targets, replacements); + return cudf::strings::replace_multiple(strings, targets, replacements); } struct JsonPathTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp index fc87460b706..95c1209b5db 100644 --- a/cpp/tests/streams/strings/replace_test.cpp +++ b/cpp/tests/streams/strings/replace_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ TEST_F(StringsReplaceTest, Replace) auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream()); - cudf::strings::replace(view, view, view, cudf::test::get_default_stream()); + cudf::strings::replace_multiple(view, view, view, cudf::test::get_default_stream()); cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream()); auto const pattern = std::string("[a-z]"); diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 726d9f95c7d..ef4f3bc2b2a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -277,6 +277,23 @@ TEST_F(StringsReplaceTest, ReplaceErrors) EXPECT_THROW(cudf::strings::replace(sv, target, null_input), cudf::logic_error); EXPECT_THROW(cudf::strings::replace(sv, null_input, replacement), cudf::logic_error); EXPECT_THROW(cudf::strings::replace(sv, empty_input, replacement), cudf::logic_error); + + auto const empty = cudf::test::strings_column_wrapper(); + auto const ev = cudf::strings_column_view(empty); + auto const targets = cudf::test::strings_column_wrapper({"x"}); + auto const tv = cudf::strings_column_view(targets); + auto const target_null = cudf::test::strings_column_wrapper({""}, {0}); + auto const tv_null = cudf::strings_column_view(target_null); + auto const repls = cudf::test::strings_column_wrapper({"y", "z"}); + auto const rv = cudf::strings_column_view(repls); + auto const repl_null = cudf::test::strings_column_wrapper({""}, {0}); + auto const rv_null = cudf::strings_column_view(repl_null); + + EXPECT_THROW(cudf::strings::replace_multiple(sv, ev, rv), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv_null, rv), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, ev), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv_null), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv), cudf::logic_error); } TEST_F(StringsReplaceTest, ReplaceSlice) @@ -341,7 +358,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti) cudf::test::strings_column_wrapper repls({"_ ", "A ", "2 "}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); std::vector h_expected{"_ quick brown fox jumps over _ lazy dog", "_ fat cat lays next 2 _ other accénted cat", @@ -361,7 +378,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti) cudf::test::strings_column_wrapper repls({"* "}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); std::vector h_expected{"* quick brown fox jumps over * lazy dog", "* fat cat lays next * * other accénted cat", @@ -422,7 +439,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) cudf::test::strings_column_wrapper repls({"x", "PEAR", "avocado", "$$"}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, 
repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -454,7 +471,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) cudf::test::strings_column_wrapper repls({"*"}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -494,7 +511,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) auto repls = cudf::test::strings_column_wrapper({""}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -522,6 +539,10 @@ TEST_F(StringsReplaceTest, EmptyStringsColumn) auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::replace( strings_view, cudf::string_scalar("not"), cudf::string_scalar("pertinent")); - auto view = results->view(); + cudf::test::expect_column_empty(results->view()); + + auto const target = cudf::test::strings_column_wrapper({"x"}); + auto const target_view = cudf::strings_column_view(target); + results = cudf::strings::replace_multiple(strings_view, target_view, target_view); cudf::test::expect_column_empty(results->view()); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 086d4672788..8487fb6dc91 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1755,7 +1755,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti( cudf::strings_column_view scvtargets(*cvtargets); cudf::column_view* cvrepls = reinterpret_cast(repls_cv); cudf::strings_column_view scvrepls(*cvrepls); - return release_as_jlong(cudf::strings::replace(scv, scvtargets, scvrepls)); + return release_as_jlong(cudf::strings::replace_multiple(scv, scvtargets, scvrepls)); } CATCH_STD(env, 0); } diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd index 92e142b33fc..34e03eec638 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd @@ -23,7 +23,7 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil: string_scalar repl, int32_t maxrepl) except + - cdef unique_ptr[column] replace( + cdef unique_ptr[column] replace_multiple( column_view source_strings, column_view target_strings, column_view repl_strings) except + diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 880201e65a2..2d9330a8a24 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( replace as cpp_replace, + 
replace_multiple as cpp_replace_multiple, replace_slice as cpp_replace_slice, ) from cudf._lib.pylibcudf.libcudf.types cimport size_type @@ -126,7 +127,7 @@ def replace_multi(Column source_strings, cdef column_view repl_view = repl_strings.view() with nogil: - c_result = move(cpp_replace( + c_result = move(cpp_replace_multiple( source_view, target_view, repl_view From f30ea0a7d12625a755bb5726e7514dfdf12094d6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:37:56 -0400 Subject: [PATCH 11/17] Use offsetalator in strings shift functor (#15870) Replaces hardcoded `size_type` used for offset values in the `shift_chars_fn` functor with offsetalator. Follow on to #15630 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15870 --- cpp/src/strings/copying/shift.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 5bba4855390..b386c0860d1 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -67,9 +67,9 @@ struct shift_chars_fn { if (offset < 0) { auto const last_index = -offset; if (idx < last_index) { - auto const first_index = - offset + d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset() + d_column.size()); + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const first_index = offset + off_itr[d_column.offset() + d_column.size()]; return d_column.head()[idx + first_index]; } else { auto const char_index = idx - last_index; @@ -79,9 +79,9 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.head()[idx - offset + - d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset())]; + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + return d_column.head()[idx - offset + off_itr[d_column.offset()]]; } } } From 90b3094f8a5a12b029a156cf484b673b589d2fec Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:52:46 -0700 Subject: [PATCH 12/17] Clean up pylibcudf test assertations (#15892) Swap the order of result,expected to expected, result for assert_table_eq too Fix a few places where result,expected was swapped for assert_column_eq Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15892 --- python/cudf/cudf/pylibcudf_tests/common/utils.py | 2 +- python/cudf/cudf/pylibcudf_tests/test_copying.py | 14 +++++++------- python/cudf/cudf/pylibcudf_tests/test_reshape.py | 4 ++-- .../cudf/pylibcudf_tests/test_string_capitalize.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 0befb3bb3e8..e00053529a8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ 
b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -54,7 +54,7 @@ def assert_column_eq( assert lhs.equals(rhs) -def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: +def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: """Verify that a pylibcudf table and PyArrow table are equal.""" plc_shape = (plc_table.num_rows(), plc_table.num_columns()) assert plc_shape == pa_table.shape diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index ef70869a145..cd70ce4abf5 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -138,7 +138,7 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_gather_map_has_nulls(target_table): @@ -240,7 +240,7 @@ def test_scatter_table( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_table_num_col_mismatch( @@ -315,7 +315,7 @@ def test_scatter_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_scalars_num_scalars_mismatch( @@ -574,7 +574,7 @@ def test_slice_table(target_table, pa_target_table): lower_bounds = bounds[::2] result = plc.copying.slice(target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): - assert_table_eq(slice_, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], slice_) def test_split_column(target_column, pa_target_column): @@ -600,7 +600,7 @@ def test_split_table(target_table, pa_target_table): lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_table_eq(split, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], split) def test_copy_if_else_column_column( @@ -753,7 +753,7 @@ def test_boolean_mask_scatter_from_table( pa_source_table, pa_mask, pa_target_table ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): @@ -828,7 +828,7 @@ def test_boolean_mask_scatter_from_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_get_element(input_column, pa_input_column): diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index b8b914f3f09..32d79257f4f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -27,7 +27,7 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): expect = pa.concat_arrays(interleaved_data) - assert_column_eq(res, expect) + assert_column_eq(expect, res) @pytest.mark.parametrize("cnt", [0, 1, 3]) @@ -40,4 +40,4 @@ def test_tile(reshape_data, reshape_plc_tbl, cnt): tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema ) - assert_table_eq(res, expect) + assert_table_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index dd7e96e871b..818d6e6e72a 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -37,7 +37,7 @@ def 
plc_data(pa_data): def test_capitalize(plc_data, pa_data): got = plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_title(plc_data, pa_data): @@ -45,10 +45,10 @@ def test_title(plc_data, pa_data): plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) expected = pa.compute.utf8_title(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_is_title(plc_data, pa_data): got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) From 6176776e1f88718d802b317f506e2b56635fa31a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 3 Jun 2024 15:06:39 -0700 Subject: [PATCH 13/17] Improve options docs (#15888) Recently I have answered a few user questions about how to use cudf options for display. We were missing documentation that explained that display options are inherited from pandas. I also found a broken link in the docs. This PR fixes both of those doc-related issues. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15888 --- docs/cudf/source/cudf_pandas/how-it-works.md | 5 ++--- docs/cudf/source/user_guide/api_docs/options.rst | 13 +++++++++++++ docs/cudf/source/user_guide/options.md | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md index ee856c84b78..75f57742ac9 100644 --- a/docs/cudf/source/cudf_pandas/how-it-works.md +++ b/docs/cudf/source/cudf_pandas/how-it-works.md @@ -34,6 +34,5 @@ correct result. Data is automatically transferred from host to device transfers. When using `cudf.pandas`, cuDF's [pandas compatibility -mode](https://docs.rapids.ai/api/cudf/stable/api_docs/options/#available-options) -is automatically enabled, ensuring consistency with pandas-specific -semantics like default sort ordering. +mode](api.options) is automatically enabled, ensuring consistency with +pandas-specific semantics like default sort ordering. diff --git a/docs/cudf/source/user_guide/api_docs/options.rst b/docs/cudf/source/user_guide/api_docs/options.rst index b3a4004e2d9..4c0f6684b76 100644 --- a/docs/cudf/source/user_guide/api_docs/options.rst +++ b/docs/cudf/source/user_guide/api_docs/options.rst @@ -12,6 +12,19 @@ Options and settings cudf.describe_option cudf.option_context +Display options are controlled by pandas +---------------------------------------- + +Options for display are inherited from pandas. This includes commonly accessed options such as: + +- ``display.max_columns`` +- ``display.max_info_rows`` +- ``display.max_rows`` +- ``display.max_seq_items`` + +For example, to show all rows of a DataFrame or Series in a Jupyter notebook, call ``pandas.set_option("display.max_rows", None)``. + +See also the :ref:`full list of pandas display options `. Available options ----------------- diff --git a/docs/cudf/source/user_guide/options.md b/docs/cudf/source/user_guide/options.md index 245d3fd1974..997681212fb 100644 --- a/docs/cudf/source/user_guide/options.md +++ b/docs/cudf/source/user_guide/options.md @@ -11,4 +11,4 @@ When no argument is provided, all options are printed. To set value to a option, use {py:func}`cudf.set_option`. -See the [API reference](api.options) for more details. 
+See the [options API reference](api.options) for descriptions of the available options. From 4a0b59133ed56c043fc73d24785f24be0b4fbe69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:08:31 -0700 Subject: [PATCH 14/17] Update Python labels and remove unnecessary ones (#15893) This PR leverages some of the new labels we have for organizing our issues and removes labels that aren't really used at the moment. If reviewers feel strongly I can keep the ci label, but AFAICT that doesn't really get used for anything at the moment and we'll benefit more from leveraging future labels to help direct tasks to the build/infra team vs cudf devs. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15893 --- .github/labeler.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index d14344384d1..48967417af3 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,9 +1,19 @@ # Documentation for config - https://github.com/actions/labeler#common-examples -cuDF (Python): +Python: - 'python/**' - 'notebooks/**' +cudf.pandas: + - 'python/cudf/cudf/pandas/**' + - 'python/cudf/cudf_pandas_tests/**' + +cudf.polars: + - 'python/cudf_polars/**' + +pylibcudf: + - 'python/cudf/cudf/_lib/pylibcudf/**' + libcudf: - 'cpp/**' @@ -12,11 +22,5 @@ CMake: - '**/cmake/**' - '**/*.cmake' -cuDF (Java): +Java: - 'java/**' - -ci: - - 'ci/**' - -conda: - - 'conda/**' From 382de32e8137a3a59a0800f46ef8a1de62b1a6e5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:14:52 -0700 Subject: [PATCH 15/17] Add support for additional metaclasses of proxies and use for ExcelWriter (#15399) The ExcelWriter supports the abstract os.PathLike interface, but we would also like that support to be reflected in the class's MRO. Doing so is slightly complicated because os.PathLike is an ABC, and as such has a different metaclass. Therefore, in order to add os.PathLike as a base class, we must also generate a suitable combined metaclass for our ExcelWriter wrapper. This change ensures the `isinstance(pd.ExcelWriter(...), os.PathLike)` returns `True` when using cudf.pandas. Authors: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15399 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 11 +++++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 30 +++++++------------ .../cudf_pandas_tests/test_cudf_pandas.py | 5 ++++ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 2e3880e14f6..698dd946022 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import abc import copyreg import importlib +import os import pickle import sys @@ -857,7 +859,12 @@ def Index__new__(cls, *args, **kwargs): pd.ExcelWriter, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, + additional_attributes={ + "__hash__": _FastSlowAttribute("__hash__"), + "__fspath__": _FastSlowAttribute("__fspath__"), + }, + bases=(os.PathLike,), + metaclasses=(abc.ABCMeta,), ) try: @@ -1032,7 +1039,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, - meta_class=pd_HolidayCalendarMetaClass, + metaclasses=(pd_HolidayCalendarMetaClass,), ) Holiday = make_final_proxy_type( diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 94caec1ce6c..169dd80e132 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -106,19 +106,6 @@ def __call__(self): _DELETE = object() -def create_composite_metaclass(base_meta, additional_meta): - """ - Dynamically creates a composite metaclass that inherits from both provided metaclasses. - This ensures that the metaclass behaviors of both base_meta and additional_meta are preserved. - """ - - class CompositeMeta(base_meta, additional_meta): - def __new__(cls, name, bases, namespace): - return super().__new__(cls, name, bases, namespace) - - return CompositeMeta - - def make_final_proxy_type( name: str, fast_type: type, @@ -130,7 +117,7 @@ def make_final_proxy_type( additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, bases: Tuple = (), - meta_class=None, + metaclasses: Tuple = (), ) -> Type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow @@ -161,6 +148,8 @@ def make_final_proxy_type( construct said unwrapped object. See also `_maybe_wrap_result`. bases Optional tuple of base classes to insert into the mro. + metaclasses + Optional tuple of metaclasses to unify with the base proxy metaclass. 
Notes ----- @@ -241,15 +230,18 @@ def _fsproxy_state(self) -> _State: cls_dict[slow_name] = _FastSlowAttribute( slow_name, private=slow_name.startswith("_") ) - if meta_class is None: - meta_class = _FastSlowProxyMeta - else: - meta_class = create_composite_metaclass(_FastSlowProxyMeta, meta_class) + metaclass = _FastSlowProxyMeta + if metaclasses: + metaclass = types.new_class( # type: ignore + f"{name}_Meta", + metaclasses + (_FastSlowProxyMeta,), + {}, + ) cls = types.new_class( name, (*bases, _FinalProxy), - {"metaclass": meta_class}, + {"metaclass": metaclass}, lambda ns: ns.update(cls_dict), ) functools.update_wrapper( diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 75bceea3034..fef829b17fc 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -6,6 +6,7 @@ import copy import datetime import operator +import os import pathlib import pickle import tempfile @@ -1421,3 +1422,7 @@ def test_holidays_within_dates(holiday, start, expected): utc.localize(xpd.Timestamp(start)), ) ) == [utc.localize(dt) for dt in expected] + + +def test_excelwriter_pathlike(): + assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) From eb460169786665b1624cb6c4f9b502b800810b37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 4 Jun 2024 06:32:49 -0500 Subject: [PATCH 16/17] Migrate column factories to pylibcudf (#15257) This PR implements `column_factories.hpp` using `pylibcudf` and migrates the cuDF cython to use them cc @vyasr Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15257 --- cpp/src/column/column_factories.cpp | 17 +- cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 2 +- .../api_docs/pylibcudf/column_factories.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/column.pyx | 21 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 +- .../cudf/_lib/pylibcudf/column_factories.pxd | 52 ++++ .../cudf/_lib/pylibcudf/column_factories.pyx | 205 ++++++++++++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 82 ++++++ .../libcudf/column/column_factories.pxd | 73 ++++- python/cudf/cudf/_lib/pylibcudf/types.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +- .../pylibcudf_tests/test_column_factories.py | 253 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_interop.py | 69 +++++ 17 files changed, 767 insertions(+), 29 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_factories.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_interop.py diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index e40056fc8a1..0260068d4db 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type) std::unique_ptr 
make_empty_column(data_type type) { CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type), - "make_empty_column is invalid to call on nested types"); + "make_empty_column is invalid to call on nested types", + cudf::data_type_error); return std::make_unique(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } @@ -80,7 +81,9 @@ std::unique_ptr make_numeric_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type), + "Invalid, non-numeric type.", + cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -100,7 +103,7 @@ std::unique_ptr make_fixed_point_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); + CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -120,7 +123,7 @@ std::unique_ptr make_timestamp_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); + CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -140,7 +143,7 @@ std::unique_ptr make_duration_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); + CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -160,7 +163,9 @@ std::unique_ptr make_fixed_width_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type), + "Invalid, non-fixed-width type.", + cudf::data_type_error); // clang-format off if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr); diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afebc91dd73..dca36eaa4e7 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow) auto column = cudf::make_numeric_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonNumeric, @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow) auto column = cudf::make_fixed_width_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonFixedWidth, diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 73de1fbaa68..ab7984d4b03 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID) { 
EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0), - cudf::logic_error); + cudf::data_type_error); } TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst new file mode 100644 index 00000000000..c858135b6ce --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst @@ -0,0 +1,6 @@ +================ +column_factories +================ + +.. automodule:: cudf._lib.pylibcudf.column_factories + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 26875ce7d12..58fea77adaa 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf. aggregation binaryop column + column_factories concatenate copying filling diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f33e121241d..7155017b7af 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -39,14 +39,10 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column -# TODO: We currently need this for "casting" empty pylibcudf columns in -# from_pylibcudf by instead creating an empty numeric column. We will be able -# to remove this once column factories are exposed to pylibcudf. cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary -from cudf._lib.pylibcudf cimport Column as plc_Column from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, @@ -623,22 +619,17 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. """ - cdef libcudf_types.data_type new_dtype if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: col = pylibcudf.unary.cast( col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) ) elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - # TODO: This function call is what requires cimporting pylibcudf. - # We can remove the cimport once we can directly do - # pylibcudf.column_factories.make_numeric_column or equivalent. 
- col = plc_Column.from_libcudf( - move( - make_numeric_column( - new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL - ) - ) + new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) + + col = pylibcudf.column_factories.make_numeric_column( + new_dtype, + col.size(), + pylibcudf.column_factories.MaskState.ALL_NULL ) dtype = dtype_from_pylibcudf_column(col) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index eff14ad549b..7d0676f6def 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -16,6 +16,7 @@ set(cython_sources aggregation.pyx binaryop.pyx column.pyx + column_factories.pyx concatenate.pyx copying.pyx filling.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 4f77f8cbaef..b289d112a90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -40,6 +41,7 @@ __all__ = [ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 048b62b6013..2565332f3ed 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -3,6 +3,7 @@ from . import ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -27,7 +28,7 @@ from .gpumemoryview import gpumemoryview from .scalar import Scalar from .table import Table -from .types import DataType, TypeId +from .types import DataType, MaskState, TypeId __all__ = [ "Column", @@ -39,6 +40,7 @@ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd new file mode 100644 index 00000000000..9dbd74ab16c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .column cimport Column +from .types cimport DataType, size_type, type_id + +ctypedef fused MakeEmptyColumnOperand: + DataType + type_id + object + +ctypedef fused MaskArg: + mask_state + object + + +cpdef Column make_empty_column( + MakeEmptyColumnOperand type_or_id +) + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_width_column( + DataType type_, + size_type size, + MaskArg mask, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx new file mode 100644 index 00000000000..ef7f512f0e5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( + make_duration_column as cpp_make_duration_column, + make_empty_column as cpp_make_empty_column, + make_fixed_point_column as cpp_make_fixed_point_column, + make_fixed_width_column as cpp_make_fixed_width_column, + make_numeric_column as cpp_make_numeric_column, + make_timestamp_column as cpp_make_timestamp_column, +) +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .types cimport DataType, type_id + +from .types import MaskState, TypeId + + +cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + cdef unique_ptr[column] result + cdef type_id id + + if MakeEmptyColumnOperand is object: + if isinstance(type_or_id, TypeId): + id = type_or_id + with nogil: + result = move( + cpp_make_empty_column( + id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + elif MakeEmptyColumnOperand is DataType: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id.c_obj + ) + ) + elif MakeEmptyColumnOperand is type_id: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + return Column.from_libcudf(move(result)) + + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_numeric_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_point_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_timestamp_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_duration_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_fixed_width_column( + DataType 
type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_width_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f172080cece..1e4102e4b64 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -33,6 +33,33 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +ARROW_TO_PYLIBCUDF_TYPES = { + pa.int8(): type_id.INT8, + pa.int16(): type_id.INT16, + pa.int32(): type_id.INT32, + pa.int64(): type_id.INT64, + pa.uint8(): type_id.UINT8, + pa.uint16(): type_id.UINT16, + pa.uint32(): type_id.UINT32, + pa.uint64(): type_id.UINT64, + pa.float32(): type_id.FLOAT32, + pa.float64(): type_id.FLOAT64, + pa.bool_(): type_id.BOOL8, + pa.string(): type_id.STRING, + pa.duration('s'): type_id.DURATION_SECONDS, + pa.duration('ms'): type_id.DURATION_MILLISECONDS, + pa.duration('us'): type_id.DURATION_MICROSECONDS, + pa.duration('ns'): type_id.DURATION_NANOSECONDS, + pa.timestamp('s'): type_id.TIMESTAMP_SECONDS, + pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS, + pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, + pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, + pa.date32(): type_id.TIMESTAMP_DAYS, +} + +LIBCUDF_TO_ARROW_TYPES = { + v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items() +} cdef column_metadata _metadata_to_libcudf(metadata): """Convert a ColumnMetadata object to C++ column_metadata. @@ -77,6 +104,21 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): raise TypeError("from_arrow only accepts Table and Scalar objects") +@from_arrow.register(pa.DataType) +def _from_arrow_datatype(pyarrow_object): + if isinstance(pyarrow_object, pa.Decimal128Type): + return DataType(type_id.DECIMAL128, scale=-pyarrow_object.scale) + elif isinstance(pyarrow_object, pa.StructType): + return DataType(type_id.STRUCT) + elif isinstance(pyarrow_object, pa.ListType): + return DataType(type_id.LIST) + else: + try: + return DataType(ARROW_TO_PYLIBCUDF_TYPES[pyarrow_object]) + except KeyError: + raise TypeError(f"Unable to convert {pyarrow_object} to cudf datatype") + + @from_arrow.register(pa.Table) def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: @@ -170,6 +212,46 @@ def to_arrow(cudf_object, metadata=None): raise TypeError("to_arrow only accepts Table and Scalar objects") +@to_arrow.register(DataType) +def _to_arrow_datatype(cudf_object, **kwargs): + """ + Convert a datatype to arrow. + + Translation of some types requires extra information as a keyword + argument. 
Specifically: + + - When translating a decimal type, provide ``precision`` + - When translating a struct type, provide ``fields`` + - When translating a list type, provide the wrapped ``value_type`` + """ + if cudf_object.id() in {type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128}: + if not (precision := kwargs.get("precision")): + raise ValueError( + "Precision must be provided for decimal types" + ) + # no pa.decimal32 or pa.decimal64 + return pa.decimal128(precision, -cudf_object.scale()) + elif cudf_object.id() == type_id.STRUCT: + if not (fields := kwargs.get("fields")): + raise ValueError( + "Fields must be provided for struct types" + ) + return pa.struct(fields) + elif cudf_object.id() == type_id.LIST: + if not (value_type := kwargs.get("value_type")): + raise ValueError( + "Value type must be provided for list types" + ) + return pa.list_(value_type) + else: + try: + return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + except KeyError: + raise TypeError( + f"Unable to convert {cudf_object.id()} to arrow datatype" + ) + + @to_arrow.register(Table) def _to_arrow_table(cudf_object, metadata=None): if metadata is None: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd index fd22d92cb30..2faff21a77b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd @@ -2,9 +2,17 @@ from libcpp.memory cimport unique_ptr +from rmm._lib.device_buffer cimport device_buffer + from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + bitmask_type, + data_type, + mask_state, + size_type, + type_id, +) cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: @@ -12,5 +20,64 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, mask_state state) except + - cdef unique_ptr[column] make_column_from_scalar (const scalar & s, - size_type size) except + + cdef unique_ptr[column] make_numeric_column(data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_column_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] 
make_dictionary_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] make_empty_column(type_id id) except + + cdef unique_ptr[column] make_empty_column(data_type type_) except + + + cdef unique_ptr[column] make_dictionary_column( + unique_ptr[column] keys_column, + unique_ptr[column] indices_column) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e54a259819e..7d3ddca14a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -13,6 +13,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( null_order, null_policy, order, + size_type, sorted, type_id, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index a5248ad0a1f..6dbb287f3c4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -8,6 +8,7 @@ from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lin from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip +from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip @@ -22,7 +23,7 @@ cdef class DataType: Parameters ---------- - id : TypeId + id : type_id The type's identifier scale : int The scale associated with the data. Only used for decimal data types. diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py new file mode 100644 index 00000000000..4c05770a41f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq + +from cudf._lib import pylibcudf as plc + +EMPTY_COL_SIZE = 3 + +NUMERIC_TYPES = [ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.float32(), + pa.float64(), + pa.bool_(), +] + +TIMESTAMP_TYPES = [ + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + pa.timestamp("ns"), +] + +DURATION_TYPES = [ + pa.duration("s"), + pa.duration("ms"), + pa.duration("us"), + pa.duration("ns"), +] + +DECIMAL_TYPES = [pa.decimal128(38, 2)] + +STRING_TYPES = [pa.string()] +STRUCT_TYPES = [DEFAULT_STRUCT_TESTING_TYPE] +LIST_TYPES = [pa.list_(pa.int64())] + +ALL_TYPES = ( + NUMERIC_TYPES + + TIMESTAMP_TYPES + + DURATION_TYPES + + STRING_TYPES + + DECIMAL_TYPES + + STRUCT_TYPES + + LIST_TYPES +) + + +@pytest.fixture(scope="module", params=NUMERIC_TYPES, ids=repr) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DECIMAL_TYPES, + ids=repr, +) +def fixed_point_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=TIMESTAMP_TYPES, + ids=repr, +) +def timestamp_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DURATION_TYPES, + ids=repr, +) +def duration_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + plc.MaskState.UNALLOCATED, + plc.MaskState.ALL_VALID, + plc.MaskState.ALL_NULL, + plc.MaskState.UNINITIALIZED, + ], + ids=["unallocated", "all_valid", "all_null", "uninitialized"], +) +def mask_state(request): + return request.param + + +def test_make_empty_column_dtype(pa_type): + pa_col = pa.array([], type=pa_type) + + plc_type = plc.interop.from_arrow(pa_col).type() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(plc_type) + return + + cudf_col = plc.column_factories.make_empty_column(plc_type) + assert_column_eq(cudf_col, pa_col) + + +def test_make_empty_column_typeid(pa_type): + pa_col = pa.array([], type=pa_type) + + tid = plc.interop.from_arrow(pa_col).type().id() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(tid) + return + + cudf_col = plc.column_factories.make_empty_column(tid) + assert_column_eq(cudf_col, pa_col) + + +def validate_empty_column(col, mask_state, dtype): + assert col.size() == EMPTY_COL_SIZE + + if mask_state == plc.types.MaskState.UNALLOCATED: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_VALID: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_NULL: + assert col.null_count() == EMPTY_COL_SIZE + + assert plc.interop.to_arrow(col).type == dtype + + +def test_make_numeric_column(numeric_pa_type, mask_state): + plc_type = plc.interop.from_arrow(numeric_pa_type) + + got = plc.column_factories.make_numeric_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, numeric_pa_type) + + +@pytest.mark.parametrize( + "non_numeric_pa_type", [t for t in ALL_TYPES if t not in NUMERIC_TYPES] +) +def test_make_numeric_column_dtype_err(non_numeric_pa_type): + plc_type = plc.interop.from_arrow(non_numeric_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_numeric_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def 
test_make_numeric_column_negative_size_err(numeric_pa_type): + plc_type = plc.interop.from_arrow(numeric_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_numeric_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column(fixed_point_pa_type, mask_state): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + + got = plc.column_factories.make_fixed_point_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + + validate_empty_column(got, mask_state, fixed_point_pa_type) + + +@pytest.mark.parametrize( + "non_fixed_point_pa_type", [t for t in ALL_TYPES if t not in DECIMAL_TYPES] +) +def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type): + plc_type = plc.interop.from_arrow(non_fixed_point_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_fixed_point_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column_negative_size_err(fixed_point_pa_type): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_fixed_point_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column(timestamp_pa_type, mask_state): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + + got = plc.column_factories.make_timestamp_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, timestamp_pa_type) + + +@pytest.mark.parametrize( + "non_timestamp_pa_type", [t for t in ALL_TYPES if t not in TIMESTAMP_TYPES] +) +def test_make_timestamp_column_dtype_err(non_timestamp_pa_type): + plc_type = plc.interop.from_arrow(non_timestamp_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_timestamp_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column_negative_size_err(timestamp_pa_type): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_timestamp_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column(duration_pa_type, mask_state): + plc_type = plc.interop.from_arrow(duration_pa_type) + + got = plc.column_factories.make_duration_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, duration_pa_type) + + +@pytest.mark.parametrize( + "non_duration_pa_type", [t for t in ALL_TYPES if t not in DURATION_TYPES] +) +def test_make_duration_column_dtype_err(non_duration_pa_type): + plc_type = plc.interop.from_arrow(non_duration_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_duration_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column_negative_size_err(duration_pa_type): + plc_type = plc.interop.from_arrow(duration_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_duration_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/cudf/cudf/pylibcudf_tests/test_interop.py new file mode 100644 index 00000000000..5c05f460e28 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_interop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + + +def test_list_dtype_roundtrip(): + list_type = pa.list_(pa.int32()) + plc_type = plc.interop.from_arrow(list_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.LIST) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, value_type=list_type.value_type + ) + assert arrow_type == list_type + + +def test_struct_dtype_roundtrip(): + struct_type = pa.struct([("a", pa.int32()), ("b", pa.string())]) + plc_type = plc.interop.from_arrow(struct_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.STRUCT) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, + fields=[struct_type.field(i) for i in range(struct_type.num_fields)], + ) + assert arrow_type == struct_type + + +def test_decimal128_roundtrip(): + decimal_type = pa.decimal128(10, 2) + plc_type = plc.interop.from_arrow(decimal_type) + + assert plc_type.id() == plc.types.TypeId.DECIMAL128 + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, precision=decimal_type.precision + ) + assert arrow_type == decimal_type + + +@pytest.mark.parametrize( + "data_type", + [ + plc.types.DataType(plc.types.TypeId.DECIMAL32), + plc.types.DataType(plc.types.TypeId.DECIMAL64), + ], +) +def test_decimal_other(data_type): + precision = 3 + + with pytest.raises(ValueError): + plc.interop.to_arrow(data_type) + + arrow_type = plc.interop.to_arrow(data_type, precision=precision) + assert arrow_type == pa.decimal128(precision, 0) From fc31aa3c4f99d6348e7c32a3e3c52c68b26ca700 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:19:30 -0400 Subject: [PATCH 17/17] Add overflow check when converting large strings to lists columns (#15887) Fixes a couple places where strings columns are converted to lists column as binary -- chars are represented as INT8. Since lists columns only support `size_type` offsets type, this change will throw an error if the size of the chars exceeds max `size_type` values. 
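The guard being described boils down to a short check; a minimal sketch follows (illustrative only; the concrete checks appear in column_buffer.cpp and byte_cast.cu in the diff below, and the helper name here is hypothetical). A lists column stores its offsets as cudf::size_type (32-bit), so character data of 2^31 bytes or more cannot be represented, and the conversion must throw instead of silently overflowing.

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <cstdint>
#include <limits>
#include <stdexcept>

// Hypothetical helper: reject strings-to-lists conversions whose character data
// cannot be indexed by the 32-bit size_type offsets of a lists column.
void check_chars_fit_in_size_type(cudf::strings_column_view const& input,
                                  rmm::cuda_stream_view stream)
{
  auto const num_chars = input.chars_size(stream);  // total bytes of character data (64-bit count)
  CUDF_EXPECTS(num_chars < static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
               "Cannot convert strings column to lists column due to size_type limit",
               std::overflow_error);
}
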
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15887 --- cpp/src/io/utilities/column_buffer.cpp | 4 ++++ cpp/src/reshape/byte_cast.cu | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e5d4e1a360f..27fc53fbc9e 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -191,6 +191,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, auto data = col_content.data.release(); auto char_size = data->size(); + CUDF_EXPECTS(char_size < static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + auto uint8_col = std::make_unique( data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0); diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 1b05a9744fa..3dfa0b65814 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -135,9 +135,14 @@ struct byte_list_conversion_fn(input, stream, mr)->release(); - auto const num_chars = col_content.data->size(); - auto uint8_col = std::make_unique( + auto const num_chars = strings_column_view(input).chars_size(stream); + CUDF_EXPECTS(num_chars < static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + + auto col_content = std::make_unique(input, stream, mr)->release(); + + auto uint8_col = std::make_unique( output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0); auto result = make_lists_column(