Skip to content

Commit

Permalink
Deprecate cudf::make_strings_column accepting typed offsets (#14461)
Browse files Browse the repository at this point in the history
Deprecates the following factory functions:
```
std::unique_ptr<column> make_strings_column(
  cudf::device_span<char const> strings,
  cudf::device_span<size_type const> offsets,
  cudf::device_span<bitmask_type const> null_mask,
  size_type null_count,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr);

std::unique_ptr<column> make_strings_column(size_type num_strings,
  rmm::device_uvector<size_type>&& offsets,
  rmm::device_uvector<char>&& chars,
  rmm::device_buffer&& null_mask,
  size_type null_count);
```
The first one is not needed, rarely used, and inefficient. The only callers found can use other, more efficient functions which do not require copying the device data. The 2nd one is only used in 8/150 places and can be easily recoded in place to use type-erased factory function instead.
Removing these APIs helps reduce the number of functions that require type-specific offsets to make it easier to support offsets of larger types in the future.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14461
  • Loading branch information
davidwendt authored Dec 5, 2023
1 parent 1c46d7d commit 434df44
Show file tree
Hide file tree
Showing 12 changed files with 138 additions and 61 deletions.
8 changes: 4 additions & 4 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -536,10 +536,10 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
rmm::mr::get_current_device_resource());
return cudf::make_strings_column(
num_rows,
std::move(offsets),
std::move(chars),
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
null_count);
std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
std::make_unique<cudf::column>(std::move(chars), rmm::device_buffer{}, 0),
null_count,
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
}

/**
Expand Down
16 changes: 10 additions & 6 deletions cpp/include/cudf/column/column_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,8 @@ std::unique_ptr<column> make_strings_column(
* span of byte offsets identifying individual strings within the char vector, and an optional
* null bitmask.
*
* @deprecated Since 24.02
*
* `offsets.front()` must always be zero.
*
* The total number of char bytes must not exceed the maximum size of size_type. Use the
Expand All @@ -435,7 +437,7 @@ std::unique_ptr<column> make_strings_column(
* columns' device memory
* @return Constructed strings column
*/
std::unique_ptr<column> make_strings_column(
[[deprecated]] std::unique_ptr<column> make_strings_column(
cudf::device_span<char const> strings,
cudf::device_span<size_type const> offsets,
cudf::device_span<bitmask_type const> null_mask,
Expand Down Expand Up @@ -470,6 +472,8 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
* @brief Construct a STRING type column given offsets, columns, and optional null count and null
* mask.
*
* @deprecated Since 24.02
*
* @param[in] num_strings The number of strings the column represents.
* @param[in] offsets The offset values for this column. The number of elements is one more than the
* total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the
Expand All @@ -481,11 +485,11 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
* @param[in] null_count The number of null string entries.
* @return Constructed strings column
*/
std::unique_ptr<column> make_strings_column(size_type num_strings,
rmm::device_uvector<size_type>&& offsets,
rmm::device_uvector<char>&& chars,
rmm::device_buffer&& null_mask,
size_type null_count);
[[deprecated]] std::unique_ptr<column> make_strings_column(size_type num_strings,
rmm::device_uvector<size_type>&& offsets,
rmm::device_uvector<char>&& chars,
rmm::device_buffer&& null_mask,
size_type null_count);

/**
* @brief Construct a LIST type column given offsets column, child column, null mask and null
Expand Down
14 changes: 14 additions & 0 deletions cpp/include/cudf/utilities/type_dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,20 @@ CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64)
CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128)
CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT)

/**
* @brief Specialization to map 'char' type to type_id::INT8
*
* Required when passing device_uvector<char> to column constructor.
* Possibly can be removed when PR 14202 is merged.
*
* @return id for 'char' type
*/
template <> // CUDF_TYPE_MAPPING(char,INT8) causes duplicate id_to_type_impl definition
constexpr inline type_id type_to_id<char>()
{
return type_id::INT8;
}

/**
* @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the
* underlying stored type.
Expand Down
33 changes: 23 additions & 10 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -756,14 +756,21 @@ class strings_column_wrapper : public detail::column_wrapper {
template <typename StringsIterator>
strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
{
size_type num_strings = std::distance(begin, end);
auto all_valid = thrust::make_constant_iterator(true);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);
auto d_chars = cudf::detail::make_device_uvector_sync(
chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_chars = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
wrapped =
cudf::make_strings_column(d_chars, d_offsets, {}, 0, cudf::test::get_default_stream());
cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {});
}

/**
Expand Down Expand Up @@ -801,14 +808,20 @@ class strings_column_wrapper : public detail::column_wrapper {
size_type num_strings = std::distance(begin, end);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
auto d_chars = cudf::detail::make_device_uvector_sync(
chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_chars = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_bitmask = cudf::detail::make_device_uvector_sync(
null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
wrapped = cudf::make_strings_column(
d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream());
num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release());
}

/**
Expand Down
40 changes: 25 additions & 15 deletions cpp/src/io/json/legacy/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -530,21 +530,31 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
auto repl_chars = std::vector<char>{'"', '\\', '\t', '\r', '\b'};
auto repl_offsets = std::vector<size_type>{0, 1, 2, 3, 4, 5};

auto target =
make_strings_column(cudf::detail::make_device_uvector_async(
target_chars, stream, rmm::mr::get_current_device_resource()),
cudf::detail::make_device_uvector_async(
target_offsets, stream, rmm::mr::get_current_device_resource()),
{},
0,
stream);
auto repl = make_strings_column(cudf::detail::make_device_uvector_async(
repl_chars, stream, rmm::mr::get_current_device_resource()),
cudf::detail::make_device_uvector_async(
repl_offsets, stream, rmm::mr::get_current_device_resource()),
{},
0,
stream);
auto target = make_strings_column(
static_cast<size_type>(target_offsets.size() - 1),
std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_async(
target_offsets, stream, rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0),
std::make_unique<cudf::column>(cudf::detail::make_device_uvector_async(
target_chars, stream, rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0),
0,
{});
auto repl = make_strings_column(
static_cast<size_type>(repl_offsets.size() - 1),
std::make_unique<cudf::column>(cudf::detail::make_device_uvector_async(
repl_offsets, stream, rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0),
std::make_unique<cudf::column>(cudf::detail::make_device_uvector_async(
repl_chars, stream, rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0),
0,
{});

auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
std::vector<std::unique_ptr<column>> out_columns;
Expand Down
12 changes: 9 additions & 3 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -763,10 +763,16 @@ std::unique_ptr<column> make_strings_column_from_host(host_span<std::string cons
offsets.begin() + 1,
std::plus<cudf::size_type>{},
[](auto& str) { return str.size(); });
auto d_offsets =
cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource());
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
return cudf::make_strings_column(
host_strings.size(), std::move(d_offsets), std::move(d_chars), {}, 0);
host_strings.size(),
std::move(d_offsets),
std::make_unique<cudf::column>(std::move(d_chars), rmm::device_buffer{}, 0),
0,
{});
}

std::unique_ptr<column> make_column_names_column(host_span<column_name_info const> column_names,
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/predicate_pushdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ struct stats_caster {
auto [d_chars, d_offsets] = make_strings_children(val, stream, mr);
return cudf::make_strings_column(
val.size(),
std::move(d_offsets),
std::move(d_chars),
std::make_unique<column>(std::move(d_offsets), rmm::device_buffer{}, 0),
std::make_unique<column>(std::move(d_chars), rmm::device_buffer{}, 0),
null_count,
rmm::device_buffer{
null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr},
null_count);
null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr});
}
return std::make_unique<column>(
dtype,
Expand Down
7 changes: 6 additions & 1 deletion cpp/src/io/text/multibyte_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,12 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
});
return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
} else {
return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars), {}, 0);
return cudf::make_strings_column(
string_count,
std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
std::make_unique<cudf::column>(std::move(chars), rmm::device_buffer{}, 0),
0,
{});
}
}

Expand Down
4 changes: 4 additions & 0 deletions cpp/src/strings/strings_column_factories.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
{
CUDF_FUNC_RANGE();

if (num_strings == 0) { return make_empty_column(type_id::STRING); }

if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable.");
CUDF_EXPECTS(num_strings == offsets_column->size() - 1,
"Invalid offsets column size for strings column.");
Expand All @@ -146,6 +148,8 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
{
CUDF_FUNC_RANGE();

if (num_strings == 0) { return make_empty_column(type_id::STRING); }

auto const offsets_size = static_cast<size_type>(offsets.size());
auto const chars_size = static_cast<size_type>(chars.size());

Expand Down
16 changes: 11 additions & 5 deletions cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,11 +297,17 @@ TEST_F(StringsContainsTests, HexTest)
std::vector<cudf::size_type> offsets(
{thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(0) + count + 1});
auto d_chars = cudf::detail::make_device_uvector_sync(
ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto input = cudf::make_strings_column(d_chars, d_offsets, {}, 0);
auto d_chars = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {});

auto strings_view = cudf::strings_column_view(input->view());
for (auto ch : ascii_chars) {
Expand Down
30 changes: 21 additions & 9 deletions cpp/tests/strings/factories_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,20 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets)
}

std::vector<cudf::bitmask_type> h_nulls{h_null_mask};
auto d_buffer = cudf::detail::make_device_uvector_sync(
h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_buffer = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_device_uvector_sync(
h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
auto d_nulls = cudf::detail::make_device_uvector_sync(
h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count);
auto column = cudf::make_strings_column(
count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release());
EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
EXPECT_EQ(column->null_count(), null_count);
EXPECT_EQ(2, column->num_children());
Expand Down Expand Up @@ -186,12 +193,17 @@ TEST_F(StringsFactoriesTest, CreateScalar)

TEST_F(StringsFactoriesTest, EmptyStringsColumn)
{
rmm::device_uvector<char> d_chars{0, cudf::get_default_stream()};
auto d_offsets = cudf::detail::make_zeroed_device_uvector_sync<cudf::size_type>(
1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_chars = std::make_unique<cudf::column>(
rmm::device_uvector<char>{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0);
auto d_offsets = std::make_unique<cudf::column>(
cudf::detail::make_zeroed_device_uvector_sync<cudf::size_type>(
1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
rmm::device_buffer{},
0);
rmm::device_uvector<cudf::bitmask_type> d_nulls{0, cudf::get_default_stream()};

auto results = cudf::make_strings_column(d_chars, d_offsets, d_nulls, 0);
auto results =
cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release());
cudf::test::expect_column_empty(results->view());

rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> d_strings{
Expand Down
11 changes: 7 additions & 4 deletions java/src/main/native/src/row_conversion.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2260,10 +2260,13 @@ std::unique_ptr<table> convert_from_rows(lists_column_view const &input,
// stuff real string column
auto const null_count = string_row_offset_columns[string_idx]->null_count();
auto string_data = string_row_offset_columns[string_idx].release()->release();
output_columns[i] =
make_strings_column(num_rows, std::move(string_col_offsets[string_idx]),
std::move(string_data_cols[string_idx]),
std::move(*string_data.null_mask.release()), null_count);
output_columns[i] = make_strings_column(
num_rows,
std::make_unique<cudf::column>(std::move(string_col_offsets[string_idx]),
rmm::device_buffer{}, 0),
std::make_unique<cudf::column>(std::move(string_data_cols[string_idx]),
rmm::device_buffer{}, 0),
null_count, std::move(*string_data.null_mask.release()));
string_idx++;
}
}
Expand Down

0 comments on commit 434df44

Please sign in to comment.