diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 1844e93bc53..fe015b27f13 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,42 +79,6 @@ static void BM_repeat_strings_column_times(benchmark::State& state) (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); } -static void BM_compute_output_strings_sizes(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(2, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - auto const repeat_times_col = table->view().column(1); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); - } - - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); -} - -static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(2, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - auto const repeat_times_col = table->view().column(1); - [[maybe_unused]] auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings(strings_col, repeat_times_col, *sizes); - } - - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); -} - static void generate_bench_args(benchmark::internal::Benchmark* b) { int const min_rows = 1 << 8; @@ -145,23 +109,5 @@ class RepeatStrings : public cudf::benchmark { ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -#define COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_compute_output_strings_sizes(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -#define REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_repeat_strings_column_times_precomputed_sizes(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times) REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times) -COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(compute_output_strings_sizes) -REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(precomputed_sizes) diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 0e6ee2126d3..26fe5f95983 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,15 +32,15 @@ namespace strings { */ /** - * @brief Repeat the given string scalar by a given number of times. + * @brief Repeat the given string scalar a given number of times * * An output string scalar is generated by repeating the input string by a number of times given by - * the @p `repeat_times` parameter. + * the `repeat_times` parameter. * * In special cases: - * - If @p `repeat_times` is not a positive value, an empty (valid) string scalar will be returned. + * - If `repeat_times` is not a positive value, an empty (valid) string scalar will be returned. * - An invalid input scalar will always result in an invalid output scalar regardless of the - * value of @p `repeat_times` parameter. + * value of `repeat_times` parameter. * * @code{.pseudo} * Example: @@ -50,13 +50,13 @@ namespace strings { * @endcode * * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that - * can be stored by the index type - * (i.e., @code input.size() * repeat_times > numeric_limits::max() @endcode). + * can be stored by the index type: + * `input.size() * repeat_times > max of size_type` * - * @param input The scalar containing the string to repeat. - * @param repeat_times The number of times the input string is repeated. - * @param mr Device memory resource used to allocate the returned string scalar. - * @return New string scalar in which the input string is repeated. + * @param input The scalar containing the string to repeat + * @param repeat_times The number of times the input string is repeated + * @param mr Device memory resource used to allocate the returned string scalar + * @return New string scalar in which the input string is repeated */ std::unique_ptr repeat_string( string_scalar const& input, @@ -64,19 +64,16 @@ std::unique_ptr repeat_string( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Repeat each string in the given strings column by a given number of times. + * @brief Repeat each string in the given strings column a given number of times * - * An output strings column is generated by repeating each string from the input strings column by a - * number of times given by the @p `repeat_times` parameter. + * An output strings column is generated by repeating each string from the input strings column by + * the number of times given by the `repeat_times` parameter. * * In special cases: - * - If @p `repeat_times` is not a positive number, a non-null input string will always result in + * - If `repeat_times` is not a positive number, a non-null input string will always result in * an empty output string. * - A null input string will always result in a null output string regardless of the value of the - * @p `repeat_times` parameter. - * - * The caller is responsible for checking the output column size will not exceed the maximum size of - * a strings column (number of total characters is less than the max size_type value). + * `repeat_times` parameter. * * @code{.pseudo} * Example: @@ -85,10 +82,10 @@ std::unique_ptr repeat_string( * out is ['aaaaaa', null, '', 'bbcbbcbbc'] * @endcode * - * @param input The column containing strings to repeat. - * @param repeat_times The number of times each input string is repeated. - * @param mr Device memory resource used to allocate the returned strings column. - * @return New column containing the repeated strings. + * @param input The column containing strings to repeat + * @param repeat_times The number of times each input string is repeated + * @param mr Device memory resource used to allocate the returned strings column + * @return New column containing the repeated strings */ std::unique_ptr repeat_strings( strings_column_view const& input, @@ -97,11 +94,10 @@ std::unique_ptr repeat_strings( /** * @brief Repeat each string in the given strings column by the numbers of times given in another - * numeric column. + * numeric column * * An output strings column is generated by repeating each of the input string by a number of times - * given by the corresponding row in a @p `repeat_times` numeric column. The computational time can - * be reduced if sizes of the output strings are known and provided. + * given by the corresponding row in a `repeat_times` numeric column. * * In special cases: * - Any null row (from either the input strings column or the `repeat_times` column) will always @@ -109,9 +105,6 @@ std::unique_ptr repeat_strings( * - If any value in the `repeat_times` column is not a positive number and its corresponding input * string is not null, the output string will be an empty string. * - * The caller is responsible for checking the output column size will not exceed the maximum size of - * a strings column (number of total characters is less than the max size_type value). - * * @code{.pseudo} * Example: * strs = ['aa', null, '', 'bbc-'] @@ -120,51 +113,16 @@ std::unique_ptr repeat_strings( * out is ['aa', null, '', 'bbc-bbc-bbc-bbc-'] * @endcode * - * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. + * @throw cudf::logic_error if the input `repeat_times` is not an integer type * @throw cudf::logic_error if the input columns have different sizes. * - * @param input The column containing strings to repeat. + * @param input The column containing strings to repeat * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated. - * @param output_strings_sizes The optional column containing pre-computed sizes of the output - * strings. - * @param mr Device memory resource used to allocate the returned strings column. + * are repeated + * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings. */ std::unique_ptr repeat_strings( - strings_column_view const& input, - column_view const& repeat_times, - std::optional output_strings_sizes = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Compute sizes of the output strings if each string in the input strings column - * is repeated by the numbers of times given in another numeric column. - * - * The output column storing string output sizes is not nullable. These string sizes are - * also summed up and returned (in an `int64_t` value), which can be used to detect if the input - * strings column can be safely repeated without data corruption due to overflow in string indexing. - * - * @code{.pseudo} - * Example: - * strs = ['aa', null, '', 'bbc-'] - * repeat_times = [ 1, 2, 3, 4 ] - * [output_sizes, total_size] = repeat_strings_output_sizes(strs, repeat_times) - * out is [2, 0, 0, 16], and total_size = 18 - * @endcode - * - * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. - * @throw cudf::logic_error if the input columns have different sizes. - * - * @param input The column containing strings to repeat. - * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated. - * @param mr Device memory resource used to allocate the returned strings column. - * @return A pair with the first item is an int32_t column containing sizes of the output strings, - * and the second item is an int64_t number containing the total sizes (in bytes) of the - * output strings column. - */ -std::pair, int64_t> repeat_strings_output_sizes( strings_column_view const& input, column_view const& repeat_times, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index cc283fbcee2..3784b535a5b 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -176,7 +176,7 @@ namespace { * separate number of times. */ template -struct compute_size_and_repeat_separately_fn { +struct compute_sizes_and_repeat_fn { column_device_view const strings_dv; column_device_view const repeat_times_dv; Iterator const repeat_times_iter; @@ -189,146 +189,63 @@ struct compute_size_and_repeat_separately_fn { // If d_chars != nullptr: only repeat strings. char* d_chars{nullptr}; - __device__ int64_t operator()(size_type const idx) const noexcept + __device__ void operator()(size_type const idx) const noexcept { auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); // Any null input (either string or repeat_times value) will result in a null output. auto const is_valid = string_is_valid && rtimes_is_valid; + if (!is_valid) { + if (!d_chars) { d_offsets[idx] = 0; } + return; + } - // When the input string is null, `repeat_times` and `string_size` are also set to 0. - // This makes sure that if `repeat_times > 0` then we will always have a valid input string, - // and if `repeat_times <= 0` we will never copy anything to the output. - auto const repeat_times = is_valid ? repeat_times_iter[idx] : size_type{0}; - auto const string_size = - is_valid ? strings_dv.element(idx).size_bytes() : size_type{0}; - - // The output_size is returned, and it needs to be an int64_t number to prevent overflow. - auto const output_size = - repeat_times > 0 ? static_cast(repeat_times) * static_cast(string_size) - : int64_t{0}; + auto repeat_times = repeat_times_iter[idx]; + auto const d_str = strings_dv.element(idx); if (!d_chars) { - // If overflow happen, the stored value of output string size will be incorrect due to - // downcasting. In such cases, the entire output string size array should be discarded. - d_offsets[idx] = static_cast(output_size); - } else if (repeat_times > 0 && string_size > 0) { - auto const d_str = strings_dv.element(idx); - auto const input_ptr = d_str.data(); - auto output_ptr = d_chars + d_offsets[idx]; - for (size_type repeat_idx = 0; repeat_idx < repeat_times; ++repeat_idx) { - output_ptr = copy_and_increment(output_ptr, input_ptr, string_size); + // repeat_times could be negative + d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; + } else { + auto output_ptr = d_chars + d_offsets[idx]; + while (repeat_times-- > 0) { + output_ptr = copy_and_increment(output_ptr, d_str.data(), d_str.size_bytes()); } } - - // The output_size value may be used to sum up to detect overflow at the caller site. - // The caller can detect overflow easily by checking `SUM(output_size) > INT_MAX`. - return output_size; } }; -/** - * @brief Creates child offsets and chars columns by applying the template function that - * can be used for computing the output size of each string as well as create the output. - * - * This function is similar to `strings::detail::make_strings_children`, except that it accepts an - * optional input `std::optional` that can contain the precomputed sizes of the output - * strings. - * - * @deprecated This will be removed with issue 12542 - */ -template -auto make_strings_children(Func fn, - size_type exec_size, - size_type strings_count, - std::optional output_strings_sizes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); - fn.d_offsets = d_offsets; - - // This may be called twice -- once for offsets and once for chars. - auto for_each_fn = [exec_size, stream](Func& fn) { - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), exec_size, fn); - }; - - if (!output_strings_sizes.has_value()) { - // Compute the output sizes only if they are not given. - for_each_fn(fn); - - // Compute the offsets values. - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); - } else { - // Compute the offsets values from the provided output string sizes. - auto const string_sizes = output_strings_sizes.value(); - CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value())); - thrust::inclusive_scan(rmm::exec_policy(stream), - string_sizes.template begin(), - string_sizes.template end(), - d_offsets + 1); - } - - // Now build the chars column - auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. - if (bytes > 0) { - fn.d_chars = chars_column->mutable_view().template data(); - for_each_fn(fn); - } - - return std::pair(std::move(offsets_column), std::move(chars_column)); -} - } // namespace std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, - std::optional output_strings_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); CUDF_EXPECTS(cudf::is_index_type(repeat_times.type()), "repeat_strings expects an integer type for the `repeat_times` input column."); - if (output_strings_sizes.has_value()) { - auto const output_sizes = output_strings_sizes.value(); - CUDF_EXPECTS(input.size() == output_sizes.size() && - (!output_sizes.nullable() || !output_sizes.has_nulls()), - "The given column of output string sizes is invalid."); - } auto const strings_count = input.size(); if (strings_count == 0) { return make_empty_column(type_id::STRING); } auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); - auto const strings_has_nulls = input.has_nulls(); - auto const rtimes_has_nulls = repeat_times.has_nulls(); auto const repeat_times_iter = cudf::detail::indexalator_factory::make_input_iterator(repeat_times); - auto const fn = compute_size_and_repeat_separately_fn{ - *strings_dv_ptr, *repeat_times_dv_ptr, repeat_times_iter, strings_has_nulls, rtimes_has_nulls}; - - auto [offsets_column, chars_column] = - make_strings_children(fn, strings_count, strings_count, output_strings_sizes, stream, mr); - - // We generate new bitmask by AND of the input columns' bitmasks. - // Note that if the input columns are nullable, the output column will also be nullable (which may - // not have nulls). + auto const fn = + compute_sizes_and_repeat_fn{*strings_dv_ptr, + *repeat_times_dv_ptr, + repeat_times_iter, + input.has_nulls(), + repeat_times.has_nulls()}; + + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); + + // We generate new bitmask by AND of the two input columns' bitmasks. + // Note that if either of the input columns are nullable, the output column will also be nullable + // but may not have nulls. auto [null_mask, null_count] = cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); @@ -338,52 +255,6 @@ std::unique_ptr repeat_strings(strings_column_view const& input, null_count, std::move(null_mask)); } - -std::pair, int64_t> repeat_strings_output_sizes( - strings_column_view const& input, - column_view const& repeat_times, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); - CUDF_EXPECTS( - cudf::is_index_type(repeat_times.type()), - "repeat_strings_output_sizes expects an integer type for the `repeat_times` input column."); - - auto const strings_count = input.size(); - if (strings_count == 0) { - return std::pair(make_empty_column(type_to_id()), int64_t{0}); - } - - auto output_sizes = make_numeric_column( - data_type{type_to_id()}, strings_count, mask_state::UNALLOCATED, stream, mr); - - auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); - auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); - auto const strings_has_nulls = input.has_nulls(); - auto const rtimes_has_nulls = repeat_times.has_nulls(); - auto const repeat_times_iter = - cudf::detail::indexalator_factory::make_input_iterator(repeat_times); - - auto const fn = compute_size_and_repeat_separately_fn{ - *strings_dv_ptr, - *repeat_times_dv_ptr, - repeat_times_iter, - strings_has_nulls, - rtimes_has_nulls, - output_sizes->mutable_view().template begin()}; - - auto const total_bytes = - thrust::transform_reduce(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - fn, - int64_t{0}, - thrust::plus{}); - - return std::pair(std::move(output_sizes), total_bytes); -} - } // namespace detail std::unique_ptr repeat_string(string_scalar const& input, @@ -404,21 +275,10 @@ std::unique_ptr repeat_strings(strings_column_view const& input, std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, - std::optional output_strings_sizes, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings( - input, repeat_times, output_strings_sizes, cudf::get_default_stream(), mr); -} - -std::pair, int64_t> repeat_strings_output_sizes( - strings_column_view const& input, - column_view const& repeat_times, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::repeat_strings_output_sizes(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp index 69d0494c253..e75409d9f39 100644 --- a/cpp/tests/strings/repeat_strings_tests.cpp +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -207,20 +207,6 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput) EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); } - // Sizes mismatched between strings column and output_strings_sizes column. - { - auto const repeat_times = int32s_col{1, 2}; - auto const sizes = int32s_col{1, 2, 3, 4, 5}; - EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); - } - - // output_strings_sizes column has nulls. - { - auto const repeat_times = int32s_col{1, 2}; - auto const sizes = int32s_col{{null, 2}, null_at(0)}; - EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); - } - // Invalid data type for repeat_times column. { auto const repeat_times = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6}; @@ -243,11 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput) auto const repeat_times = int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - (void)sizes; - auto const expected_bytes = static_cast(half_max) * int64_t{1 + 2 + 3 + 4 + 5 + 6 + 7}; - EXPECT_EQ(expected_bytes, total_bytes); + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); } TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes) @@ -301,15 +283,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 12, 27, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // repeat_times column has nulls. @@ -320,15 +293,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 27, 12, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -377,15 +341,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 12, 27}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. @@ -397,15 +352,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{12, 27}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(39, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column. @@ -417,15 +363,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{27, 12, 12}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(51, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -520,15 +457,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 18, 0, 0, 0, 12, 12, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(48, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // repeat_times column has nulls. @@ -549,15 +477,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 0, 0, 0, 0, 12, 0, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(18, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -631,15 +550,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(6, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. @@ -652,15 +562,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{0, 0, 0, 0, 12}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(12, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column, output has nulls. @@ -672,15 +573,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{12, 0, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(12, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column, output does not have null. @@ -693,14 +585,5 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(0, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); } }