Skip to content

Commit

Permalink
address review
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 committed Sep 25, 2024
1 parent 649c3a8 commit bf2d6ed
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 34 deletions.
5 changes: 3 additions & 2 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace strings {
*/

/**
* @brief Returns a boolean column identifying strings entries in which all
* @brief Returns a boolean column identifying string entries where all
* characters are of the type specified.
*
* The output row entry will be set to false if the corresponding string element
Expand Down Expand Up @@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
* `types_to_remove` will be filtered.
* @param mr Device memory resource used to allocate the returned column's device memory
* @param stream CUDA stream used for device memory operations and kernel launches
* @return New column of boolean results for each string
* @return New strings column with the specified characters filtered out and replaced with the
* specified replacement string.
*/
std::unique_ptr<column> filter_characters_of_type(
strings_column_view const& input,
Expand Down
60 changes: 30 additions & 30 deletions python/cudf/cudf/_lib/strings/char_types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,21 @@ from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column

import pylibcudf as plc
from pylibcudf.strings import char_types


@acquire_spill_lock()
def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
"""
Returns a Column of strings keeping only alphanumeric character types.
"""
plc_column = plc.strings.char_types.filter_characters_of_type(
plc_column = char_types.filter_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep
else plc.strings.char_types.StringCharacterTypes.ALPHANUM,
char_types.StringCharacterTypes.ALL_TYPES if keep
else char_types.StringCharacterTypes.ALPHANUM,
py_repl.device_value.c_value,
plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep
else plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.ALPHANUM if keep
else char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -32,10 +32,10 @@ def is_decimal(Column source_strings):
that contain only decimal characters -- those that can be used
to extract base10 numbers.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.DECIMAL,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.DECIMAL,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -48,10 +48,10 @@ def is_alnum(Column source_strings):
Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.ALPHANUM,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.ALPHANUM,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -62,10 +62,10 @@ def is_alpha(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only alphabetic characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.ALPHA,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.ALPHA,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -76,10 +76,10 @@ def is_digit(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only decimal and digit characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.DIGIT,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.DIGIT,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -91,10 +91,10 @@ def is_numeric(Column source_strings):
that contain only numeric characters. These include digit and
numeric characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.NUMERIC,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.NUMERIC,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -105,10 +105,10 @@ def is_upper(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only upper-case characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.UPPER,
plc.strings.char_types.StringCharacterTypes.CASE_TYPES
char_types.StringCharacterTypes.UPPER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -119,10 +119,10 @@ def is_lower(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only lower-case characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.LOWER,
plc.strings.char_types.StringCharacterTypes.CASE_TYPES
char_types.StringCharacterTypes.LOWER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)

Expand All @@ -133,9 +133,9 @@ def is_space(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only space characters.
"""
plc_column = plc.strings.char_types.all_characters_of_type(
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
plc.strings.char_types.StringCharacterTypes.SPACE,
plc.strings.char_types.StringCharacterTypes.ALL_TYPES
char_types.StringCharacterTypes.SPACE,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/strings/char_types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ cpdef Column all_characters_of_type(
string_character_types verify_types
):
"""
Filter specific character types from a column of strings.
Identifies strings where all characters match the specified type.
Parameters
----------
Expand Down Expand Up @@ -72,7 +72,8 @@ cpdef Column filter_characters_of_type(
Returns
-------
Column
New column of boolean results for each string
New column with the specified characters filtered out and
replaced with the specified replacement string.
"""
cdef const string_scalar* c_replacement = <const string_scalar*>(
replacement.c_obj.get()
Expand Down

0 comments on commit bf2d6ed

Please sign in to comment.