Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf::strings::repeat_strings_output_sizes and optional parameter from cudf::strings::repeat_strings #12609

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
c831395
Remove cudf::strings::repeat_strings_output_sizes and optional parameter
davidwendt Jan 25, 2023
15a0748
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 25, 2023
d9616d0
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 26, 2023
e7b695a
remove unneeded var
davidwendt Jan 27, 2023
e93105b
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 27, 2023
36d1336
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 30, 2023
9ee2dff
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 30, 2023
93cc98f
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 31, 2023
e068c1f
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 31, 2023
31c7c07
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Jan 31, 2023
6caedd1
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 1, 2023
ed3441b
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 3, 2023
597cb7b
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 6, 2023
7794954
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 7, 2023
33fa717
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 8, 2023
a6ee53f
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 8, 2023
699eded
Merge branch 'branch-23.04' into remove-repeat-output-sizes
davidwendt Feb 8, 2023
7c6578b
fix some doxygen wording
davidwendt Feb 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 1 addition & 55 deletions cpp/benchmarks/string/repeat_strings.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -79,42 +79,6 @@ static void BM_repeat_strings_column_times(benchmark::State& state)
(strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
}

/**
 * @brief Benchmark body that times cudf::strings::repeat_strings_output_sizes.
 *
 * Builds a two-column table from the benchmark arguments (range(0) = row count,
 * range(1) = maximum string length), treats column 0 as the strings input and column 1 as the
 * per-row repeat counts, and times one size computation per benchmark iteration.
 *
 * @param state Benchmark state supplying the arguments and receiving the timing/byte counters
 */
static void BM_compute_output_strings_sizes(benchmark::State& state)
{
  auto const num_rows       = static_cast<cudf::size_type>(state.range(0));
  auto const max_length     = static_cast<cudf::size_type>(state.range(1));
  auto const input_table    = create_data_table(2, num_rows, max_length);
  auto const strings_input  = cudf::strings_column_view(input_table->view().column(0));
  auto const repeat_counts  = input_table->view().column(1);

  for ([[maybe_unused]] auto _ : state) {
    // The timer object records the elapsed time of this iteration for the benchmark state.
    [[maybe_unused]] cuda_event_timer timer(state, true, cudf::get_default_stream());
    cudf::strings::repeat_strings_output_sizes(strings_input, repeat_counts);
  }

  // Bytes read per iteration: all input characters plus one int32_t repeat count per row.
  auto const bytes_per_iteration =
    strings_input.chars_size() + repeat_counts.size() * sizeof(int32_t);
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
}

/**
 * @brief Benchmark body that times cudf::strings::repeat_strings when the output string sizes
 * have been computed ahead of time.
 *
 * Builds a two-column table from the benchmark arguments (range(0) = row count,
 * range(1) = maximum string length). The output sizes are computed once, outside the timed loop,
 * and passed to every timed repeat_strings call.
 *
 * @param state Benchmark state supplying the arguments and receiving the timing/byte counters
 */
static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& state)
{
  auto const num_rows      = static_cast<cudf::size_type>(state.range(0));
  auto const max_length    = static_cast<cudf::size_type>(state.range(1));
  auto const input_table   = create_data_table(2, num_rows, max_length);
  auto const strings_input = cudf::strings_column_view(input_table->view().column(0));
  auto const repeat_counts = input_table->view().column(1);

  // Pre-compute the output sizes once; only the sizes column is used below.
  [[maybe_unused]] auto const [output_sizes, total_size] =
    cudf::strings::repeat_strings_output_sizes(strings_input, repeat_counts);

  for ([[maybe_unused]] auto _ : state) {
    // The timer object records the elapsed time of this iteration for the benchmark state.
    [[maybe_unused]] cuda_event_timer timer(state, true, cudf::get_default_stream());
    cudf::strings::repeat_strings(strings_input, repeat_counts, *output_sizes);
  }

  // Bytes read per iteration: all input characters plus one int32_t repeat count per row.
  auto const bytes_per_iteration =
    strings_input.chars_size() + repeat_counts.size() * sizeof(int32_t);
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 8;
Expand Down Expand Up @@ -145,23 +109,5 @@ class RepeatStrings : public cudf::benchmark {
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Defines a RepeatStrings fixture benchmark called `name` whose body forwards to
// BM_compute_output_strings_sizes, then registers it with the argument sets produced by
// generate_bench_args, manual timing, and results reported in milliseconds.
#define COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(RepeatStrings, name) \
(::benchmark::State & st) { BM_compute_output_strings_sizes(st); } \
BENCHMARK_REGISTER_F(RepeatStrings, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Defines a RepeatStrings fixture benchmark called `name` whose body forwards to
// BM_repeat_strings_column_times_precomputed_sizes, then registers it with the argument sets
// produced by generate_bench_args, manual timing, and results reported in milliseconds.
#define REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(RepeatStrings, name) \
(::benchmark::State & st) { BM_repeat_strings_column_times_precomputed_sizes(st); } \
BENCHMARK_REGISTER_F(RepeatStrings, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Instantiate one RepeatStrings benchmark per macro, named by the macro argument.
// NOTE(review): the first two macros are not fully visible here; presumably they follow the same
// define-and-register pattern as the two *_SIZES macros — confirm.
REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times)
REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times)
COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(compute_output_strings_sizes)
REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(precomputed_sizes)
94 changes: 26 additions & 68 deletions cpp/include/cudf/strings/repeat_strings.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,15 +32,15 @@ namespace strings {
*/

/**
* @brief Repeat the given string scalar by a given number of times.
* @brief Repeat the given string scalar a given number of times
*
* An output string scalar is generated by repeating the input string by a number of times given by
* the @p `repeat_times` parameter.
* the `repeat_times` parameter.
*
* In special cases:
* - If @p `repeat_times` is not a positive value, an empty (valid) string scalar will be returned.
* - If `repeat_times` is not a positive value, an empty (valid) string scalar will be returned.
* - An invalid input scalar will always result in an invalid output scalar regardless of the
* value of @p `repeat_times` parameter.
* value of `repeat_times` parameter.
*
* @code{.pseudo}
* Example:
Expand All @@ -50,33 +50,30 @@ namespace strings {
* @endcode
*
* @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that
* can be stored by the index type
* (i.e., @code input.size() * repeat_times > numeric_limits<size_type>::max() @endcode).
* can be stored by the index type:
* `input.size() * repeat_times > max of size_type`
*
* @param input The scalar containing the string to repeat.
* @param repeat_times The number of times the input string is repeated.
* @param mr Device memory resource used to allocate the returned string scalar.
* @return New string scalar in which the input string is repeated.
* @param input The scalar containing the string to repeat
* @param repeat_times The number of times the input string is repeated
* @param mr Device memory resource used to allocate the returned string scalar
* @return New string scalar in which the input string is repeated
*/
std::unique_ptr<string_scalar> repeat_string(
string_scalar const& input,
size_type repeat_times,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Repeat each string in the given strings column by a given number of times.
* @brief Repeat each string in the given strings column a given number of times
*
* An output strings column is generated by repeating each string from the input strings column by a
* number of times given by the @p `repeat_times` parameter.
* An output strings column is generated by repeating each string from the input strings column by
* the number of times given by the `repeat_times` parameter.
*
* In special cases:
* - If @p `repeat_times` is not a positive number, a non-null input string will always result in
* - If `repeat_times` is not a positive number, a non-null input string will always result in
* an empty output string.
* - A null input string will always result in a null output string regardless of the value of the
* @p `repeat_times` parameter.
*
* The caller is responsible for checking the output column size will not exceed the maximum size of
* a strings column (number of total characters is less than the max size_type value).
* `repeat_times` parameter.
*
* @code{.pseudo}
* Example:
Expand All @@ -85,10 +82,10 @@ std::unique_ptr<string_scalar> repeat_string(
* out is ['aaaaaa', null, '', 'bbcbbcbbc']
* @endcode
*
* @param input The column containing strings to repeat.
* @param repeat_times The number of times each input string is repeated.
* @param mr Device memory resource used to allocate the returned strings column.
* @return New column containing the repeated strings.
* @param input The column containing strings to repeat
* @param repeat_times The number of times each input string is repeated
* @param mr Device memory resource used to allocate the returned strings column
* @return New column containing the repeated strings
*/
std::unique_ptr<column> repeat_strings(
strings_column_view const& input,
Expand All @@ -97,21 +94,17 @@ std::unique_ptr<column> repeat_strings(

/**
* @brief Repeat each string in the given strings column by the numbers of times given in another
* numeric column.
* numeric column
*
* An output strings column is generated by repeating each input string by a number of times
* given by the corresponding row in a @p `repeat_times` numeric column. The computational time can
* be reduced if sizes of the output strings are known and provided.
* given by the corresponding row in a `repeat_times` numeric column.
*
* In special cases:
* - Any null row (from either the input strings column or the `repeat_times` column) will always
* result in a null output string.
* - If any value in the `repeat_times` column is not a positive number and its corresponding input
* string is not null, the output string will be an empty string.
*
* The caller is responsible for checking the output column size will not exceed the maximum size of
* a strings column (number of total characters is less than the max size_type value).
*
* @code{.pseudo}
* Example:
* strs = ['aa', null, '', 'bbc-']
Expand All @@ -120,51 +113,16 @@ std::unique_ptr<column> repeat_strings(
* out is ['aa', null, '', 'bbc-bbc-bbc-bbc-']
* @endcode
*
* @throw cudf::logic_error if the input `repeat_times` column has data type other than integer.
* @throw cudf::logic_error if the input `repeat_times` is not an integer type
* @throw cudf::logic_error if the input columns have different sizes.
*
* @param input The column containing strings to repeat.
* @param input The column containing strings to repeat
* @param repeat_times The column containing numbers of times that the corresponding input strings
* are repeated.
* @param output_strings_sizes The optional column containing pre-computed sizes of the output
* strings.
* @param mr Device memory resource used to allocate the returned strings column.
* are repeated
* @param mr Device memory resource used to allocate the returned strings column
* @return New column containing the repeated strings.
*/
std::unique_ptr<column> repeat_strings(
strings_column_view const& input,
column_view const& repeat_times,
std::optional<column_view> output_strings_sizes = std::nullopt,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Compute the sizes of the output strings that would result if each string in the input
* strings column is repeated by the number of times given in another numeric column.
*
* The output column storing the string output sizes is not nullable. These string sizes are
* also summed up and returned (in an `int64_t` value), which can be used to detect if the input
* strings column can be safely repeated without data corruption due to overflow in string indexing.
*
* @code{.pseudo}
* Example:
* strs = ['aa', null, '', 'bbc-']
* repeat_times = [ 1, 2, 3, 4 ]
* [output_sizes, total_size] = repeat_strings_output_sizes(strs, repeat_times)
* output_sizes is [2, 0, 0, 16], and total_size = 18
* @endcode
*
* @throw cudf::logic_error if the input `repeat_times` column has data type other than integer.
* @throw cudf::logic_error if the input columns have different sizes.
*
* @param input The column containing strings to repeat.
* @param repeat_times The column containing numbers of times that the corresponding input strings
* are repeated.
* @param mr Device memory resource used to allocate the returned column.
* @return A pair whose first item is an int32_t column containing the sizes of the output strings,
* and whose second item is an int64_t number containing the total size (in bytes) of the
* output strings column.
*/
std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes(
strings_column_view const& input,
column_view const& repeat_times,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
Expand Down
Loading