Large strings support in cudf::merge #15374

Merged: 26 commits, Apr 22, 2024
Changes from 11 commits
Commits (26)
f71aabd  Large strings support in cudf::merge (davidwendt, Mar 22, 2024)
31ed7c2  ensure empty!=null in indices output (davidwendt, Mar 22, 2024)
09c98a5  Merge branch 'branch-24.06' into ls-merge (davidwendt, Mar 22, 2024)
32199eb  Merge branch 'branch-24.06' into ls-merge (davidwendt, Mar 29, 2024)
2aa149f  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 3, 2024)
998208d  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 4, 2024)
de4f0a2  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 8, 2024)
a8d51e2  enable large-strings test; fix factory (davidwendt, Apr 8, 2024)
6f43aba  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 8, 2024)
890d8d1  fix doxygen (davidwendt, Apr 8, 2024)
a2faab1  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 8, 2024)
8ee448a  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 10, 2024)
0d648b4  remove unneeded headers (davidwendt, Apr 10, 2024)
540ee80  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 10, 2024)
b16a099  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 11, 2024)
5554e03  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 15, 2024)
1a78415  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 16, 2024)
23509b3  fix merge conflict (davidwendt, Apr 17, 2024)
1f4c22f  fix mr parameter type (davidwendt, Apr 17, 2024)
34dd22f  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 18, 2024)
b2f5c3e  add 32-bit offsets test too (davidwendt, Apr 18, 2024)
ace98cd  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 18, 2024)
8eafd8f  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 19, 2024)
45853b9  roll if-stmt into CUDF_EXPECTS (davidwendt, Apr 19, 2024)
c194c1b  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 19, 2024)
5684ba9  Merge branch 'branch-24.06' into ls-merge (davidwendt, Apr 19, 2024)
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -584,6 +584,7 @@ add_library(
src/strings/filling/fill.cu
src/strings/filter_chars.cu
src/strings/like.cu
src/strings/merge/merge.cu
src/strings/padding.cu
src/strings/regex/regcomp.cpp
src/strings/regex/regexec.cpp
4 changes: 3 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -236,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
# ##################################################################################################
# * merge benchmark -------------------------------------------------------------------------------
ConfigureBench(MERGE_BENCH merge/merge.cpp)
ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
ConfigureNVBench(
MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp
)

# ##################################################################################################
# * null_mask benchmark ---------------------------------------------------------------------------
64 changes: 64 additions & 0 deletions cpp/benchmarks/merge/merge_strings.cpp
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/merge.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

void nvbench_merge_strings(nvbench::state& state)
{
auto stream = cudf::get_default_stream();

auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
if (static_cast<std::size_t>(2 * num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
.no_validity();
auto const source_tables = create_random_table(
{cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);

auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)}));
auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)}));
auto const lhs = sorted_lhs->view().column(0);
auto const rhs = sorted_rhs->view().column(0);

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) +
cudf::strings_column_view(rhs).chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto result = cudf::merge(
{cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING});
});
}

NVBENCH_BENCH(nvbench_merge_strings)
.set_name("merge_strings")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
110 changes: 0 additions & 110 deletions cpp/include/cudf/strings/detail/merge.cuh

This file was deleted.

45 changes: 45 additions & 0 deletions cpp/include/cudf/strings/detail/merge.hpp
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/detail/merge.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace strings {
namespace detail {
/**
* @brief Merges two strings columns
*
* @param lhs First column
* @param rhs Second column
* @param row_order Indices for each column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
std::unique_ptr<column> merge(strings_column_view const& lhs,
strings_column_view const& rhs,
cudf::detail::index_vector const& row_order,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace strings
} // namespace cudf
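
The row_order argument is the interesting part of this interface: for every output row it records which input (left or right) supplies the string and at which row. The host-side sketch below models only that contract, assuming each entry is a (side, row index) pair as in cudf's merge detail header; it is an illustration, not the device implementation added in this PR.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical host-side model of the merge contract: LEFT selects from lhs,
// RIGHT selects from rhs, in the order prescribed by row_order.
enum class side { LEFT, RIGHT };

std::vector<std::string> merge_strings(std::vector<std::string> const& lhs,
                                       std::vector<std::string> const& rhs,
                                       std::vector<std::pair<side, int>> const& row_order)
{
  std::vector<std::string> out;
  out.reserve(row_order.size());
  for (auto const& [which, row] : row_order) {
    out.push_back(which == side::LEFT ? lhs[row] : rhs[row]);
  }
  return out;
}

int main()
{
  std::vector<std::string> const lhs{"apple", "melon"};
  std::vector<std::string> const rhs{"banana", "cherry"};
  // Order produced by comparing the two sorted inputs: apple, banana, cherry, melon
  std::vector<std::pair<side, int>> const order{
    {side::LEFT, 0}, {side::RIGHT, 0}, {side::RIGHT, 1}, {side::LEFT, 1}};
  for (auto const& s : merge_strings(lhs, rhs, order)) { std::cout << s << "\n"; }
  return 0;
}

The device implementation applies the same mapping to gather offsets and character bytes, and, per the merge.cu change later in this diff, now also materializes the merged null mask itself.
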
29 changes: 15 additions & 14 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -163,22 +163,23 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
});
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
auto const total_elements =
auto const total_bytes =
cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);

// TODO: replace exception with if-statement when enabling creating INT64 offsets
CUDF_EXPECTS(total_elements <= size_type_max,
"Size of output exceeds the character size limit",
std::overflow_error);
// if (total_elements >= get_offset64_threshold()) {
// // recompute as int64 offsets when above the threshold
// offsets_column = make_numeric_column(
// data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
// auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
// sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
// }

return std::pair(std::move(offsets_column), total_elements);
auto const threshold = get_offset64_threshold();
if (!is_large_strings_enabled()) {
CUDF_EXPECTS(
total_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error);
}
if (total_bytes >= get_offset64_threshold()) {
// recompute as int64 offsets when above the threshold
Review comment (Contributor):

Once large strings are the default behavior, I wonder if we should do a first pass in int64 and then cast the offsets down if it fits in int32, rather than recomputing as int64 on overflow. I don't know how expensive the downcast would be compared to recomputing. Of course this also depends on whether we expect large strings to be common. I don't expect any action here, but would be open to your thoughts.

Reply (davidwendt, Contributor Author):

I don't really expect large strings to be very common. At least not in the near to mid future.
I'd rather force any performance hit into the int64 path right now if possible.

offsets_column = make_numeric_column(
data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
}

return std::pair(std::move(offsets_column), total_bytes);
}

} // namespace detail
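
To make the trade-off discussed in the review thread above concrete, here is a minimal host-side sketch of the suggested alternative: accumulate offsets in int64 in a single pass, then narrow to int32 when the final offset (the total byte count) fits. The names make_offsets, offsets32, and offsets64 are illustrative only, not libcudf APIs, and the sketch leaves out get_offset64_threshold and the column factories used in the diff.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <limits>
#include <numeric>
#include <variant>
#include <vector>

using offsets32 = std::vector<int32_t>;
using offsets64 = std::vector<int64_t>;

// Single int64 scan of the row sizes, followed by an optional narrowing pass.
std::variant<offsets32, offsets64> make_offsets(std::vector<int32_t> const& sizes)
{
  offsets64 wide(sizes.size() + 1, 0);
  std::inclusive_scan(
    sizes.begin(), sizes.end(), wide.begin() + 1, std::plus<int64_t>{}, int64_t{0});

  auto const total_bytes = wide.back();
  if (total_bytes > std::numeric_limits<int32_t>::max()) {
    return wide;  // keep 64-bit offsets: the large-strings case
  }
  // Narrow in a second pass over the offsets; whether this beats recomputing
  // the scan on overflow is exactly the open question in the thread above.
  offsets32 narrow(wide.size());
  std::transform(wide.begin(), wide.end(), narrow.begin(), [](int64_t o) {
    return static_cast<int32_t>(o);  // safe: total_bytes fits in int32
  });
  return narrow;
}

The merged change keeps the int32 first pass and recomputes in int64 only on overflow, so the extra cost lands on the large-strings path, which the author expects to be rare.
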
29 changes: 11 additions & 18 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -85,9 +85,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
return (item.first != nullptr ? static_cast<size_type>(item.second) : size_type{0});
});
auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto offsets_view = offsets_column->view();
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// create null mask
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
@@ -97,11 +98,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
(null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};

// build chars column
auto chars_data = [offsets_view, bytes = bytes, begin, strings_count, null_count, stream, mr] {
auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
// use a character-parallel kernel for long string lengths
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
auto const str_begin = thrust::make_transform_iterator(
begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
return string_view{ip.first, ip.second};
Expand All @@ -120,12 +120,11 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto d_chars = chars_data.data();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair const str = thrust::get<0>(item);
size_type const offset = thrust::get<1>(item);
int64_t const offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(
thrust::make_tuple(begin, offsets_view.template begin<size_type>())),
thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
strings_count,
copy_chars);
return chars_data;
@@ -167,21 +166,15 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
{
CUDF_FUNC_RANGE();
size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1;
size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char);
if (strings_count == 0) return make_empty_column(type_id::STRING);
if (strings_count == 0) { return make_empty_column(type_id::STRING); }

int64_t const bytes = std::distance(chars_begin, chars_end) * sizeof(char);
CUDF_EXPECTS(bytes >= 0, "invalid offsets data");

// build offsets column -- this is the number of strings + 1
auto offsets_column = make_numeric_column(
data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
auto offsets_view = offsets_column->mutable_view();
thrust::transform(rmm::exec_policy(stream),
offsets_begin,
offsets_end,
offsets_view.data<int32_t>(),
cuda::proclaim_return_type<int32_t>(
[] __device__(auto offset) { return static_cast<int32_t>(offset); }));
auto [offsets_column, computed_bytes] =
cudf::strings::detail::make_offsets_child_column(offsets_begin, offsets_end, stream, mr);
CUDF_EXPECTS(bytes == computed_bytes, "unexpected byte count");

// build chars column
rmm::device_uvector<char> chars_data(bytes, stream, mr);
16 changes: 3 additions & 13 deletions cpp/src/merge/merge.cu
@@ -27,7 +27,7 @@
#include <cudf/dictionary/detail/update_keys.hpp>
#include <cudf/lists/detail/concatenate.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/strings/detail/merge.cuh>
#include <cudf/strings/detail/merge.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table.hpp>
@@ -433,18 +433,8 @@ std::unique_ptr<column> column_merger::operator()<cudf::string_view>(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
auto column = strings::detail::merge<index_type>(strings_column_view(lcol),
strings_column_view(rcol),
row_order_.begin(),
row_order_.end(),
stream,
mr);
if (lcol.has_nulls() || rcol.has_nulls()) {
auto merged_view = column->mutable_view();
materialize_bitmask(
lcol, rcol, merged_view.null_mask(), merged_view.size(), row_order_.data(), stream);
}
return column;
return strings::detail::merge(
strings_column_view(lcol), strings_column_view(rcol), row_order_, stream, mr);
}

// specialization for dictionary