Skip to content

Commit

Permalink
Remove calls to strings_column_view::offsets_begin() (#15112)
Browse files Browse the repository at this point in the history
Removes calls to `cudf::strings_column_view::offsets_begin()` since the result cannot have a hardcoded integer type.
The goal is to deprecate this member function in this release. Follow on changes may be required to further enable large strings support to these functions.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #15112
  • Loading branch information
davidwendt authored Feb 27, 2024
1 parent c32725d commit 1719cda
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 35 deletions.
2 changes: 1 addition & 1 deletion cpp/examples/strings/custom_prealloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
nvtxRangePushA("redact_strings");

auto const scv = cudf::strings_column_view(names);
auto const offsets = scv.offsets_begin();
auto const offsets = scv.offsets().begin<cudf::size_type>();

// create working memory to hold the output of each string
auto working_memory = rmm::device_uvector<char>(scv.chars_size(stream), stream);
Expand Down
33 changes: 17 additions & 16 deletions cpp/src/transform/row_conversion.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sequence.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
Expand Down Expand Up @@ -212,7 +213,7 @@ struct batch_data {
* @return pair of device vector of size_types of the row sizes of the table and a device vector of
* offsets into the string column
*/
std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<strings_column_view::offset_iterator>>
std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<cudf::detail::input_offsetalator>>
build_string_row_offsets(table_view const& tbl,
size_type fixed_width_and_validity_size,
rmm::cuda_stream_view stream)
Expand All @@ -222,20 +223,20 @@ build_string_row_offsets(table_view const& tbl,
thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0);

auto d_offsets_iterators = [&]() {
std::vector<strings_column_view::offset_iterator> offsets_iterators;
auto offsets_iter = thrust::make_transform_iterator(
tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator {
if (!is_fixed_width(col.type())) {
CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!");
return strings_column_view(col).offsets_begin();
} else {
return nullptr;
}
std::vector<cudf::detail::input_offsetalator> offsets_iterators;
auto itr = thrust::make_transform_iterator(
tbl.begin(), [](auto const& col) -> cudf::detail::input_offsetalator {
return cudf::detail::offsetalator_factory::make_input_iterator(
strings_column_view(col).offsets(), col.offset());
});
std::copy_if(offsets_iter,
offsets_iter + tbl.num_columns(),
std::back_inserter(offsets_iterators),
[](auto const& offset_ptr) { return offset_ptr != nullptr; });
auto stencil = thrust::make_transform_iterator(
tbl.begin(), [](auto const& col) -> bool { return !is_fixed_width(col.type()); });
thrust::copy_if(thrust::host,
itr,
itr + tbl.num_columns(),
stencil,
std::back_inserter(offsets_iterators),
thrust::identity<bool>{});
return make_device_uvector_sync(
offsets_iterators, stream, rmm::mr::get_current_device_resource());
}();
Expand Down Expand Up @@ -858,7 +859,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows,
size_type const num_variable_columns,
int8_t const** variable_input_data,
size_type const* variable_col_output_offsets,
size_type const** variable_col_offsets,
cudf::detail::input_offsetalator* variable_col_offsets,
size_type fixed_width_row_size,
RowOffsetFunctor row_offsets,
size_type const batch_row_offset,
Expand Down Expand Up @@ -1844,7 +1845,7 @@ std::vector<std::unique_ptr<column>> convert_to_rows(
batch_data& batch_info,
offsetFunctor offset_functor,
column_info_s const& column_info,
std::optional<rmm::device_uvector<strings_column_view::offset_iterator>> variable_width_offsets,
std::optional<rmm::device_uvector<cudf::detail::input_offsetalator>> variable_width_offsets,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand Down
33 changes: 15 additions & 18 deletions cpp/tests/io/json_type_cast_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf_test/type_lists.hpp>

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/datasource.hpp>
#include <cudf/io/json.hpp>
Expand All @@ -34,6 +35,8 @@

#include <rmm/exec_policy.hpp>

#include <thrust/adjacent_difference.h>

#include <algorithm>
#include <iterator>
#include <type_traits>
Expand All @@ -43,25 +46,15 @@ using namespace cudf::test::iterators;
struct JSONTypeCastTest : public cudf::test::BaseFixture {};

namespace {
struct offsets_to_length {
__device__ cudf::size_type operator()(thrust::tuple<cudf::size_type, cudf::size_type> const& p)
{
return thrust::get<1>(p) - thrust::get<0>(p);
}
};

/// Returns length of each string in the column
auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream)
{
auto offsets_begin = column.offsets_begin();
auto offsets_pair =
thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
offsets_pair,
offsets_pair + column.size(),
svs_length.begin(),
offsets_to_length{});
auto itr =
cudf::detail::offsetalator_factory::make_input_iterator(column.offsets(), column.offset());
thrust::adjacent_difference(
rmm::exec_policy(stream), itr + 1, itr + column.size() + 1, svs_length.begin());
return svs_length;
}
} // namespace
Expand Down Expand Up @@ -96,7 +89,8 @@ TEST_F(JSONTypeCastTest, String)

auto str_col = cudf::io::json::detail::parse_data(
column.chars_begin(stream),
thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
thrust::make_zip_iterator(
thrust::make_tuple(column.offsets().begin<cudf::size_type>(), svs_length.begin())),
column.size(),
type,
std::move(null_mask),
Expand Down Expand Up @@ -129,7 +123,8 @@ TEST_F(JSONTypeCastTest, Int)

auto col = cudf::io::json::detail::parse_data(
column.chars_begin(stream),
thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
thrust::make_zip_iterator(
thrust::make_tuple(column.offsets().begin<cudf::size_type>(), svs_length.begin())),
column.size(),
type,
std::move(null_mask),
Expand Down Expand Up @@ -169,7 +164,8 @@ TEST_F(JSONTypeCastTest, StringEscapes)

auto col = cudf::io::json::detail::parse_data(
column.chars_begin(stream),
thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
thrust::make_zip_iterator(
thrust::make_tuple(column.offsets().begin<cudf::size_type>(), svs_length.begin())),
column.size(),
type,
std::move(null_mask),
Expand Down Expand Up @@ -238,7 +234,8 @@ TEST_F(JSONTypeCastTest, ErrorNulls)

auto str_col = cudf::io::json::detail::parse_data(
column.chars_begin(stream),
thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
thrust::make_zip_iterator(
thrust::make_tuple(column.offsets().begin<cudf::size_type>(), svs_length.begin())),
column.size(),
type,
std::move(null_mask),
Expand Down

0 comments on commit 1719cda

Please sign in to comment.