From 695d71f2c79c4214e9ce42f7a490cbafade91c57 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 11 Dec 2023 15:02:32 -0500 Subject: [PATCH 1/4] Create strings-specific make_offsets_child_column for handling int64 offsets --- .../cudf/strings/detail/strings_children.cuh | 54 +++++++++++++++++++ cpp/include/cudf/strings/detail/utilities.hpp | 12 +++++ cpp/src/strings/utilities.cu | 7 +++ 3 files changed, 73 insertions(+) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 5f8a2a34606..e7f1d3f4334 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -122,6 +122,60 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } +/** + * @brief Create an offsets column to be a child of a compound column + * + * This function sets the offsets values by executing scan over the sizes in the provided + * Iterator. + * + * The return also includes the total number of elements -- the last element value from the + * scan. 
+ * + * @throw std::overflow_error if the total size of the scan (last element) greater than maximum + * value of `size_type` + * + * @tparam InputIterator Used as input to scan to set the offset values + * @param begin The beginning of the input sequence + * @param end The end of the input sequence + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Offsets column and total elements + */ +template <typename InputIterator> +std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column( + InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto count = static_cast<size_type>(std::distance(begin, end)); + auto offsets_column = + make_numeric_column(data_type{type_id::INT32}, count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template data<int32_t>(); + + // The number of offsets is count+1 so to build the offsets from the sizes + // using exclusive-scan technically requires count+1 input values even though + // the final input value is never used. + // The input iterator is wrapped here to allow the last value to be safely read. + auto map_fn = + cuda::proclaim_return_type<size_type>([begin, count] __device__(size_type idx) -> size_type { + return idx < count ? 
static_cast<size_type>(begin[idx]) : size_type{0}; + }); + auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + // Use the sizes-to-offsets iterator to compute the total number of elements + auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); + if (total_elements >= get_offset64_threshold()) { + // recompute as int64 offsets when above the threshold + offsets_column = make_numeric_column( + data_type{type_id::INT64}, count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>(); + sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets64, stream); + } + + return std::pair(std::move(offsets_column), static_cast<size_type>(total_elements)); +} + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 41a2654dce3..8b03bd9ac58 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -53,6 +53,18 @@ rmm::device_uvector<string_view> create_string_vector_from_column( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Return the threshold size for a strings column to use int64 offsets + * + * A computed size above this threshold should using int64 offsets, otherwise + * int32 offsets. By default this function will return std::numeric_limits<int32_t>::max(). + * This value can be overriden at runtime using the environment variable + * LIBCUDF_LARGE_STRINGS_THRESHOLD. 
+ * + * @return size in bytes + */ +int64_t get_offset64_threshold(); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index c8c68d19ce6..18eac649f69 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -128,6 +128,13 @@ special_case_mapping const* get_special_case_mapping_table() }); } +int64_t get_offset64_threshold() +{ + auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0; + return rtn > 0 ? rtn : std::numeric_limits<int32_t>::max(); +} + } // namespace detail } // namespace strings } // namespace cudf From 37c26b980a6e33c452a73a48dcde6aa1d024c07b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 11 Dec 2023 17:08:28 -0500 Subject: [PATCH 2/4] fix style violation --- cpp/include/cudf/strings/detail/utilities.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 8b03bd9ac58..186a1351fc9 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ rmm::device_uvector<string_view> create_string_vector_from_column( * * A computed size above this threshold should using int64 offsets, otherwise * int32 offsets. By default this function will return std::numeric_limits<int32_t>::max(). - * This value can be overriden at runtime using the environment variable + * This value can be overridden at runtime using the environment variable * LIBCUDF_LARGE_STRINGS_THRESHOLD. 
* * @return size in bytes From a880e38d0f9b715b2fbc84dbdce0ac99eaf24b4a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Dec 2023 17:51:46 -0500 Subject: [PATCH 3/4] fix doxygen comment --- cpp/include/cudf/strings/detail/strings_children.cuh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index e7f1d3f4334..1a7043a6da5 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -131,9 +131,6 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, * The return also includes the total number of elements -- the last element value from the * scan. * - * @throw std::overflow_error if the total size of the scan (last element) greater than maximum - * value of `size_type` - * * @tparam InputIterator Used as input to scan to set the offset values * @param begin The beginning of the input sequence * @param end The end of the input sequence From c385218f5da7a278209588676cb71542cff6cdae Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 18 Jan 2024 19:56:13 -0500 Subject: [PATCH 4/4] add limit doc to utility fn --- .../cudf/strings/detail/strings_children.cuh | 36 ++++++++++--------- cpp/include/cudf/strings/detail/utilities.hpp | 2 ++ cpp/src/strings/utilities.cu | 4 ++- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 3c474ef2a90..42a180c27c1 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -145,34 +145,38 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const count = static_cast<size_type>(std::distance(begin, end)); - auto offsets_column = - make_numeric_column(data_type{type_id::INT32}, count + 1, 
mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data<int32_t>(); - - // The number of offsets is count+1 so to build the offsets from the sizes - // using exclusive-scan technically requires count+1 input values even though + auto constexpr size_type_max = static_cast<int64_t>(std::numeric_limits<size_type>::max()); + auto const lcount = static_cast<int64_t>(std::distance(begin, end)); + CUDF_EXPECTS( + lcount <= size_type_max, "Size of output exceeds the column size limit", std::overflow_error); + auto const strings_count = static_cast<size_type>(lcount); + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets_column->mutable_view().template data<int32_t>(); + + // The number of offsets is strings_count+1 so to build the offsets from the sizes + // using exclusive-scan technically requires strings_count+1 input values even though // the final input value is never used. // The input iterator is wrapped here to allow the 'last value' to be safely read. - auto map_fn = - cuda::proclaim_return_type<size_type>([begin, count] __device__(size_type idx) -> size_type { - return idx < count ? static_cast<size_type>(begin[idx]) : size_type{0}; + auto map_fn = cuda::proclaim_return_type<size_type>( + [begin, strings_count] __device__(size_type idx) -> size_type { + return idx < strings_count ? 
static_cast<size_type>(begin[idx]) : size_type{0}; }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); + auto const total_elements = + sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); // TODO: replace exception with if-statement when enabling creating INT64 offsets - CUDF_EXPECTS(total_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()), - "Size of output exceeds the column size limit", + CUDF_EXPECTS(total_elements <= size_type_max, + "Size of output exceeds the character size limit", std::overflow_error); // if (total_elements >= get_offset64_threshold()) { // // recompute as int64 offsets when above the threshold // offsets_column = make_numeric_column( - // data_type{type_id::INT64}, count + 1, mask_state::UNALLOCATED, stream, mr); + // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); // auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>(); - // sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets64, stream); + // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); // } return std::pair(std::move(offsets_column), total_elements); diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 7f0ec54af4a..3cf2850548d 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -68,6 +68,8 @@ int64_t get_offset64_threshold(); /** * @brief Return a normalized offset value from a strings offsets column * + * The maximum value returned is `std::numeric_limits<int32_t>::max()`. 
+ * + * @throw std::invalid_argument if `offsets` is neither INT32 nor INT64 * * @param offsets Input column of type INT32 or INT64 diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index e2ac772c621..782d9767fb5 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -133,7 +133,9 @@ int64_t get_offset64_threshold() { auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0; - return rtn > 0 ? rtn : std::numeric_limits<int32_t>::max(); + return (rtn > 0 && rtn < std::numeric_limits<int32_t>::max()) + ? rtn + : std::numeric_limits<int32_t>::max(); } int64_t get_offset_value(cudf::column_view const& offsets,