Skip to content

Commit

Permalink
Fix split-record result list column offset type (#15707)
Browse files Browse the repository at this point in the history
Fixes offsets type for list column returned by `cudf::strings::split_record` and `cudf::strings::split_record_re` when large-strings enabled. The list column offsets type must be INT32. The code was changed to use the appropriate `make_offsets_child_column` utility function.
Also added some `is_large_strings_enabled()` checks to check-overflow gtests.
This allows all current gtests to pass when the large-strings support environment variable is set.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15707
  • Loading branch information
davidwendt authored May 16, 2024
1 parent 1e92f3f commit bf255cb
Show file tree
Hide file tree
Showing 7 changed files with 24 additions and 19 deletions.
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,8 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
});

// create offsets from the counts for return to the caller
auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
token_counts.begin(), token_counts.end(), stream, mr);
auto [offsets, total_tokens] =
cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr);
auto const d_tokens_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ std::pair<rmm::device_uvector<string_index_pair>, std::unique_ptr<column>> gener
auto const begin = cudf::detail::make_counting_transform_iterator(0, map_fn);
auto const end = begin + strings_count;

auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
auto [offsets, total_tokens] = cudf::detail::make_offsets_child_column(
begin, end, stream, rmm::mr::get_current_device_resource());
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

Expand Down
4 changes: 3 additions & 1 deletion cpp/tests/column/factories_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,7 @@
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
Expand Down Expand Up @@ -761,6 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }

TEST_F(ColumnFactoryTest, FromScalarErrors)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }
cudf::string_scalar ss("hello world");
EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);

Expand Down
7 changes: 5 additions & 2 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/dictionary/encode.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -188,6 +189,8 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)

TEST_F(StringColumnTest, ConcatenateTooLarge)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }

std::string big_str(1000000, 'a'); // 1 million bytes x 5 = 5 million bytes
cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
std::vector<cudf::column_view> input_cols;
Expand Down Expand Up @@ -374,7 +377,7 @@ TEST_F(OverflowTest, OverflowTest)
}

// string column, overflow on chars
{
if (!cudf::strings::detail::is_large_strings_enabled()) {
constexpr auto size = static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns of with 1 billion chars in each
Expand Down Expand Up @@ -497,7 +500,7 @@ TEST_F(OverflowTest, Presliced)
}

// strings, overflow on chars
{
if (!cudf::strings::detail::is_large_strings_enabled()) {
constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
constexpr cudf::size_type string_size = 64;
constexpr cudf::size_type num_rows = total_chars_size / string_size;
Expand Down
3 changes: 3 additions & 0 deletions cpp/tests/strings/array_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/copying.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -152,6 +153,8 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)

TEST_F(StringsColumnTest, GatherTooBig)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }

std::vector<int8_t> h_chars(3000000);
cudf::test::fixed_width_column_wrapper<int8_t> chars(h_chars.begin(), h_chars.end());
cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets({0, 3000000});
Expand Down
18 changes: 6 additions & 12 deletions cpp/tests/strings/factories_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
Expand Down Expand Up @@ -96,18 +97,11 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize);

// check string data
auto h_chars_data = cudf::detail::make_std_vector_sync(
cudf::device_span<char const>(strings_view.chars_begin(cudf::get_default_stream()),
strings_view.chars_size(cudf::get_default_stream())),
cudf::get_default_stream());
auto h_offsets_data = cudf::detail::make_std_vector_sync(
cudf::device_span<cudf::size_type const>(
strings_view.offsets().data<cudf::size_type>() + strings_view.offset(),
strings_view.size() + 1),
cudf::get_default_stream());
EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0);
EXPECT_EQ(
memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size() * sizeof(cudf::size_type)), 0);
cudf::test::strings_column_wrapper expected(
h_test_strings.begin(),
h_test_strings.end(),
cudf::test::iterators::nulls_from_nullptrs(h_test_strings));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(column->view(), expected);
}

TEST_F(StringsFactoriesTest, CreateColumnFromOffsets)
Expand Down
5 changes: 4 additions & 1 deletion cpp/tests/strings/repeat_strings_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
#include <cudf_test/type_lists.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>

Expand Down Expand Up @@ -220,6 +221,8 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)

TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }

auto const strs = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
auto const strs_cv = cudf::strings_column_view(strs);

Expand Down

0 comments on commit bf255cb

Please sign in to comment.