Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create strings-specific make_offsets_child_column for multiple offset types #14612

Merged
merged 24 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
695d71f
Create strings-specific make_offsets_child_column for handling int64 …
davidwendt Dec 11, 2023
57ad03e
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 11, 2023
37c26b9
fix style violation
davidwendt Dec 11, 2023
a8d1ec6
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 12, 2023
a880e38
fix doxygen comment
davidwendt Dec 12, 2023
f52665b
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 12, 2023
b6ed281
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 13, 2023
529decb
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 13, 2023
a200a64
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 15, 2023
8d4c428
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Dec 19, 2023
900b786
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 2, 2024
b799e80
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 2, 2024
f6a7425
fix merge conflict
davidwendt Jan 10, 2024
64b8aa0
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 10, 2024
66b1b90
Merge branch 'chars-size-threshold' of github.com:davidwendt/cudf int…
davidwendt Jan 10, 2024
0bb0651
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 10, 2024
058aed7
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 12, 2024
89d0cda
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 17, 2024
befbcc7
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 17, 2024
c385218
add limit doc to utility fn
davidwendt Jan 19, 2024
0a4efb6
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 19, 2024
871cf6a
Merge branch 'chars-size-threshold' of github.com:davidwendt/cudf int…
davidwendt Jan 19, 2024
8af2fb1
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 19, 2024
114ad99
Merge branch 'branch-24.02' into chars-size-threshold
davidwendt Jan 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -122,6 +122,66 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
}

/**
 * @brief Create an offsets column to be a child of a compound column
 *
 * This function sets the offsets values by executing scan over the sizes in the provided
 * Iterator.
 *
 * The return also includes the total number of elements -- the last element value from the
 * scan.
 *
 * The offsets column is currently always created with INT32 offsets.
 *
 * @throw std::overflow_error if the number of offsets (input count + 1) exceeds the maximum
 *        value of `size_type`
 * @throw std::overflow_error if the total number of elements exceeds the maximum value of
 *        `size_type`
 *
 * @tparam InputIterator Used as input to scan to set the offset values
 * @param begin The beginning of the input sequence
 * @param end The end of the input sequence
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Offsets column and total elements
 */
template <typename InputIterator>
std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
  InputIterator begin,
  InputIterator end,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  auto constexpr size_type_max = static_cast<int64_t>(std::numeric_limits<size_type>::max());
  auto const lcount            = static_cast<int64_t>(std::distance(begin, end));
  // The offsets column holds lcount+1 rows, so lcount must be strictly less than the
  // size_type maximum; otherwise `strings_count + 1` below would overflow size_type.
  CUDF_EXPECTS(
    lcount < size_type_max, "Size of output exceeds the column size limit", std::overflow_error);
  auto const strings_count = static_cast<size_type>(lcount);
  auto offsets_column      = make_numeric_column(
    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
  auto d_offsets = offsets_column->mutable_view().template data<int32_t>();

  // The number of offsets is strings_count+1 so to build the offsets from the sizes
  // using exclusive-scan technically requires strings_count+1 input values even though
  // the final input value is never used.
  // The input iterator is wrapped here to allow the 'last value' to be safely read.
  auto map_fn = cuda::proclaim_return_type<size_type>(
    [begin, strings_count] __device__(size_type idx) -> size_type {
      return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
    });
  auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
  // Use the sizes-to-offsets iterator to compute the total number of elements
  auto const total_elements =
    sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);

  // TODO: replace exception with if-statement when enabling creating INT64 offsets
  CUDF_EXPECTS(total_elements <= size_type_max,
               "Size of output exceeds the character size limit",
               std::overflow_error);
  // if (total_elements >= get_offset64_threshold()) {
  //   // recompute as int64 offsets when above the threshold
  //   offsets_column = make_numeric_column(
  //     data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
  //   auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
  //   sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
  // }

  return std::pair(std::move(offsets_column), total_elements);
}

} // namespace detail
} // namespace strings
} // namespace cudf
14 changes: 14 additions & 0 deletions cpp/include/cudf/strings/detail/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,23 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
 * @brief Return the threshold size for a strings column to use int64 offsets
 *
 * A computed size above this threshold should use int64 offsets, otherwise
 * int32 offsets. By default this function will return std::numeric_limits<int32_t>::max().
 * This value can be overridden at runtime using the environment variable
 * LIBCUDF_LARGE_STRINGS_THRESHOLD. The override is only honored when it parses to a
 * positive value smaller than std::numeric_limits<int32_t>::max(); any other setting
 * results in the default being returned.
 *
 * @return size in bytes
 */
int64_t get_offset64_threshold();

/**
* @brief Return a normalized offset value from a strings offsets column
*
* The maximum value returned is `std::numeric_limits<int32_t>::max()`.
*
* @throw std::invalid_argument if `offsets` is neither INT32 nor INT64
*
* @param offsets Input column of type INT32 or INT64
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,15 @@ special_case_mapping const* get_special_case_mapping_table()
});
}

/**
 * Return the size threshold (in bytes) used to decide on int64 offsets.
 *
 * The default is std::numeric_limits<int32_t>::max(). The environment variable
 * LIBCUDF_LARGE_STRINGS_THRESHOLD may lower it: the variable is honored only when it
 * parses to a value in the open range (0, int32 max). A missing, non-numeric,
 * non-positive or too-large setting yields the default.
 */
int64_t get_offset64_threshold()
{
  auto constexpr default_threshold = static_cast<int64_t>(std::numeric_limits<int32_t>::max());

  auto const env_value = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
  if (env_value == nullptr) { return default_threshold; }

  // std::strtoll yields 0 for unparsable text and saturates on out-of-range input,
  // whereas std::atol has undefined behavior when the value does not fit in a long.
  // Keeping the result signed also lets negative settings be rejected directly
  // instead of relying on unsigned wrap-around.
  auto const parsed = static_cast<int64_t>(std::strtoll(env_value, nullptr, 10));
  return (parsed > 0 && parsed < default_threshold) ? parsed : default_threshold;
}

int64_t get_offset_value(cudf::column_view const& offsets,
size_type index,
rmm::cuda_stream_view stream)
Expand Down
Loading