Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add delimiter parameter to cudf::strings::capitalize() #8620

Merged
20 changes: 15 additions & 5 deletions cpp/include/cudf/strings/capitalize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,31 @@ namespace strings {
/**
* @brief Returns a column of capitalized strings.
*
* Any null string entries return corresponding null output column entries.
* If `delimiters` is an empty string, then only the first character of each
* row is capitalized. Otherwise, the first non-delimiter character that follows
* any delimiter character is capitalized as well.
*
* @code{.pseudo}
* Example:
* input = ["tesT1", "a Test", "Another Test"];
* input = ["tesT1", "a Test", "Another Test", "a\tb"];
* output = capitalize(input)
* output is ["Test1", "A test", "Another test"]
* output is ["Test1", "A test", "Another test", "A\tb"]
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
* output = capitalize(input, " ")
* output is ["Test1", "A Test", "Another Test", "A\tb"]
* @endcode
*
* @param[in] input String column.
* @param[in] mr Device memory resource used to allocate the returned column's device memory
* Any null string entries return corresponding null output column entries.
*
* @throw cudf::logic_error if `delimiters.is_valid()` is `false`.
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
*
* @param input String column.
* @param delimiters Characters used to identify the words to capitalize.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Column of strings capitalized from the input column.
*/
std::unique_ptr<column> capitalize(
strings_column_view const& input,
string_scalar const& delimiters = string_scalar(""),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
113 changes: 61 additions & 52 deletions cpp/src/strings/capitalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,24 @@ namespace {
* @brief Base class for capitalize and title functors.
*
* Utility functions here manage access to the character case and flags tables.
* Any derived class must supply a `capitalize_next` member function.
*
* @tparam Derived class uses the CRTP pattern to reuse code logic.
*/
template <typename Derived>
struct base_fn {
character_flags_table_type const* d_flags;
character_cases_table_type const* d_case_table;
column_device_view const d_column;
offset_type* d_offsets{};
char* d_chars{};

base_fn() : d_flags(get_character_flags_table()), d_case_table(get_character_cases_table()) {}
base_fn(column_device_view const& d_column)
: d_flags(get_character_flags_table()),
d_case_table(get_character_cases_table()),
d_column(d_column)
{
}

using char_info = thrust::pair<uint32_t, detail::character_flags_table_type>;

Expand All @@ -58,35 +70,31 @@ struct base_fn {
{
return codepoint_to_utf8(d_case_table[info.first]);
}
};

/**
* @brief Capitalize functor.
*
* This capitalizes the first letter of the string.
* Also lower-case any characters after the first letter.
*/
struct capitalize_fn : base_fn {
column_device_view const d_column;
offset_type* d_offsets{};
char* d_chars{};

capitalize_fn(column_device_view const& d_column) : base_fn(), d_column(d_column) {}

/**
* @brief Operator called for each row in `d_column`.
*
* This logic is shared by capitalize() and title() functions.
* The derived class must supply a `capitalize_next` member function.
*/
__device__ void operator()(size_type idx)
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
}

Derived& derived = static_cast<Derived&>(*this);
auto const d_str = d_column.element<string_view>(idx);
offset_type bytes = 0;
auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
bool capitalize = true;
for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
auto const info = get_char_info(*itr);
auto const flag = info.second;
auto const change_case = (itr == d_str.begin()) ? IS_LOWER(flag) : IS_UPPER(flag);
auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag);
auto const new_char = change_case ? convert_char(info) : *itr;
// capitalize the next char if this one is a delimiter
capitalize = derived.capitalize_next(*itr, flag);

if (d_buffer)
d_buffer += detail::from_char_utf8(new_char, d_buffer);
Expand All @@ -97,51 +105,48 @@ struct capitalize_fn : base_fn {
}
};

/**
 * @brief Functor that supplies the capitalize() rule to the shared base logic.
 *
 * The first character of each string is upper-cased and the characters that
 * follow are lower-cased.
 * When `d_delimiters` is not empty, the character after any delimiter
 * character is treated as the start of a new word and is capitalized too.
 */
struct capitalize_fn : base_fn<capitalize_fn> {
  string_view const d_delimiters;  ///< characters that mark a word boundary

  capitalize_fn(column_device_view const& d_column, string_view const& d_delimiters)
    : base_fn(d_column), d_delimiters(d_delimiters)
  {
  }

  /**
   * @brief Reports whether the character following `chr` should be capitalized.
   *
   * @param chr Character that was just processed.
   * @return true only if delimiters were provided and `chr` is one of them.
   */
  __device__ bool capitalize_next(char_utf8 const chr, character_flags_table_type const)
  {
    // with no delimiters, only the leading character of the row is capitalized
    if (d_delimiters.empty()) { return false; }
    return d_delimiters.find(chr) >= 0;
  }
};

/**
* @brief Title functor.
*
* This capitalizes the first letter of each word.
* The beginning of a word is identified as the first alphabetic
* character after a non-alphabetic character.
* Also, lower-case all other alpabetic characters.
* The beginning of a word is identified as the first sequence_type
* character after a non-sequence_type character.
* Also, lower-case all other alphabetic characters.
*/
struct title_fn : base_fn {
column_device_view const d_column;
struct title_fn : base_fn<title_fn> {
string_character_types sequence_type;
offset_type* d_offsets{};
char* d_chars{};

title_fn(column_device_view const& d_column, string_character_types sequence_type)
: base_fn(), d_column(d_column), sequence_type(sequence_type)
: base_fn(d_column), sequence_type(sequence_type)
{
}

__device__ void operator()(size_type idx)
__device__ bool capitalize_next(char_utf8 const, character_flags_table_type const flag)
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
}

auto const d_str = d_column.element<string_view>(idx);
offset_type bytes = 0;
auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
bool capitalize = true;
for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
auto const info = get_char_info(*itr);
auto const flag = info.second;
auto const change_case =
(flag & sequence_type) && (capitalize ? IS_LOWER(flag) : IS_UPPER(flag));
auto const new_char = change_case ? convert_char(info) : *itr;
// capitalize the next char if this one is not a sequence_type
capitalize = (flag & sequence_type) == 0;

if (d_buffer)
d_buffer += detail::from_char_utf8(new_char, d_buffer);
else
bytes += detail::bytes_in_char_utf8(new_char);
}
if (!d_chars) d_offsets[idx] = bytes;
}
return (flag & sequence_type) == 0;
};
};

/**
Expand Down Expand Up @@ -173,12 +178,15 @@ std::unique_ptr<column> capitalize_utility(CapitalFn cfn,
} // namespace

std::unique_ptr<column> capitalize(strings_column_view const& input,
string_scalar const& delimiters,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(delimiters.is_valid(stream), "Delimiter must be a valid string");
if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});
auto d_column = column_device_view::create(input.parent(), stream);
return capitalize_utility(capitalize_fn{*d_column}, input, stream, mr);
auto const d_column = column_device_view::create(input.parent(), stream);
auto const d_delimiters = delimiters.value(stream);
return capitalize_utility(capitalize_fn{*d_column, d_delimiters}, input, stream, mr);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
}

std::unique_ptr<column> title(strings_column_view const& input,
Expand All @@ -194,10 +202,11 @@ std::unique_ptr<column> title(strings_column_view const& input,
} // namespace detail

std::unique_ptr<column> capitalize(strings_column_view const& input,
string_scalar const& delimiter,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::capitalize(input, rmm::cuda_stream_default, mr);
return detail::capitalize(input, delimiter, rmm::cuda_stream_default, mr);
}

std::unique_ptr<column> title(strings_column_view const& input,
Expand Down
78 changes: 53 additions & 25 deletions cpp/tests/strings/case_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,34 @@ TEST_F(StringsCaseTest, Swapcase)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringsCaseTest, EmptyStringsColumn)
{
cudf::column_view zero_size_strings_column(
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
auto strings_view = cudf::strings_column_view(zero_size_strings_column);
auto results = cudf::strings::to_lower(strings_view);
auto view = results->view();
cudf::test::expect_strings_empty(results->view());
}

TEST_F(StringsCaseTest, Capitalize)
{
std::vector<const char*> h_strings{
"SȺȺnich xyZ", "Examples aBc", "thesé", nullptr, "ARE THE", "tést strings", ""};
std::vector<const char*> h_expected{
"Sⱥⱥnich xyz", "Examples abc", "Thesé", nullptr, "Are the", "Tést strings", ""};

cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
{"SȺȺnich xyZ", "Examples aBc", "thesé", "", "ARE\tTHE", "tést\tstrings", ""},
{1, 1, 1, 0, 1, 1, 1});
auto strings_view = cudf::strings_column_view(strings);

auto results = cudf::strings::capitalize(strings_view);

cudf::test::strings_column_wrapper expected(
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
{
auto results = cudf::strings::capitalize(strings_view);
cudf::test::strings_column_wrapper expected(
{"Sⱥⱥnich xyz", "Examples abc", "Thesé", "", "Are\tthe", "Tést\tstrings", ""},
{1, 1, 1, 0, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
auto results = cudf::strings::capitalize(strings_view, std::string(" "));
cudf::test::strings_column_wrapper expected(
{"Sⱥⱥnich Xyz", "Examples Abc", "Thesé", "", "Are\tthe", "Tést\tstrings", ""},
{1, 1, 1, 0, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
auto results = cudf::strings::capitalize(strings_view, std::string(" \t"));
cudf::test::strings_column_wrapper expected(
{"Sⱥⱥnich Xyz", "Examples Abc", "Thesé", "", "Are\tThe", "Tést\tStrings", ""},
{1, 1, 1, 0, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}

TEST_F(StringsCaseTest, Title)
Expand Down Expand Up @@ -174,3 +172,33 @@ TEST_F(StringsCaseTest, MultiCharLower)

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Every case-conversion API must return an empty strings column when given
// a strings column with zero rows.
TEST_F(StringsCaseTest, EmptyStringsColumn)
{
  cudf::column_view empty_column(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
  cudf::strings_column_view empty_view(empty_column);

  // each returned column is checked immediately; the temporary lives until
  // the end of its full expression
  cudf::test::expect_strings_empty(cudf::strings::to_lower(empty_view)->view());
  cudf::test::expect_strings_empty(cudf::strings::to_upper(empty_view)->view());
  cudf::test::expect_strings_empty(cudf::strings::swapcase(empty_view)->view());
  cudf::test::expect_strings_empty(cudf::strings::capitalize(empty_view)->view());
  cudf::test::expect_strings_empty(cudf::strings::title(empty_view)->view());
}

// capitalize() must reject a delimiter scalar that is flagged as invalid.
TEST_F(StringsCaseTest, ErrorTest)
{
  cudf::test::strings_column_wrapper input{"the column intentionally left blank"};
  cudf::strings_column_view strings_view(input);

  // second constructor argument `false` marks the scalar as not valid
  cudf::string_scalar const invalid_delimiter("", false);
  EXPECT_THROW(cudf::strings::capitalize(strings_view, invalid_delimiter), cudf::logic_error);
}