diff --git a/CHANGELOG.md b/CHANGELOG.md index fa16fba39fe..cf4ebd8dfef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -108,6 +108,7 @@ - PR #6780 Move `cudf::cast` tests to separate test file - PR #6789 Rename `unary_op` to `unary_operator` - PR #6770 Support building decimal columns with Table.TestBuilder +- PR #6800 Push DeviceScalar to cython-only - PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu` - PR #6813 Enable `expand=False` in `.str.split` and `.str.rsplit` - PR #6829 Enable workaround to write categorical columns in csv diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 25b18be2ac7..48d67110621 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -15,7 +15,10 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -def generate_ngrams(Column strings, int ngrams, DeviceScalar separator): +def generate_ngrams(Column strings, int ngrams, object py_separator): + + cdef DeviceScalar separator = py_separator.device_value + cdef column_view c_strings = strings.view() cdef size_type c_ngrams = ngrams cdef const string_scalar* c_separator = separator\ diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index 82cf4e44f7a..cf0a4a0f55a 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -17,9 +17,13 @@ from cudf._lib.scalar cimport DeviceScalar def ngrams_tokenize( Column strings, int ngrams, - DeviceScalar delimiter, - DeviceScalar separator + object py_delimiter, + object py_separator ): + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef DeviceScalar separator = py_separator.device_value + cdef column_view c_strings = strings.view() cdef size_type c_ngrams = ngrams cdef const string_scalar* c_separator = separator\ diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 93dc4ef47f5..cb552161b52 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -18,14 +18,16 @@ from cudf._lib.scalar cimport DeviceScalar def replace_tokens(Column strings, Column targets, Column replacements, - DeviceScalar delimiter): + object py_delimiter): """ The `targets` tokens are searched for within each `strings` in the Column and replaced with the corresponding `replacements` - if found. Tokens are identified by the `delimiter` character + if found. Tokens are identified by the `py_delimiter` character provided. """ + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef column_view c_strings = strings.view() cdef column_view c_targets = targets.view() cdef column_view c_replacements = replacements.view() @@ -49,15 +51,18 @@ def replace_tokens(Column strings, def filter_tokens(Column strings, size_type min_token_length, - DeviceScalar replacement, - DeviceScalar delimiter): + object py_replacement, + object py_delimiter): """ Tokens smaller than `min_token_length` are removed from `strings` in the Column and optionally replaced with the corresponding - `replacement` string. Tokens are identified by the `delimiter` + `py_replacement` string. Tokens are identified by the `py_delimiter` character provided. 
""" + cdef DeviceScalar replacement = py_replacement.device_value + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef column_view c_strings = strings.view() cdef const string_scalar* c_repl = replacement\ .get_raw_ptr() diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index 41904f5ee75..c7f5c2a12c4 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -17,21 +17,9 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -def tokenize(Column strings, object delimiter): - if isinstance(delimiter, DeviceScalar): - return _tokenize_scalar(strings, delimiter) +def _tokenize_scalar(Column strings, object py_delimiter): - if isinstance(delimiter, Column): - return _tokenize_column(strings, delimiter) - - raise TypeError( - "Expected a DeviceScalar or Column for delimiters, but got {}".format( - type(delimiter) - ) - ) - - -def _tokenize_scalar(Column strings, DeviceScalar delimiter): + cdef DeviceScalar delimiter = py_delimiter.device_value cdef column_view c_strings = strings.view() cdef const string_scalar* c_delimiter = delimiter\ @@ -65,21 +53,10 @@ def _tokenize_column(Column strings, Column delimiters): return Column.from_unique_ptr(move(c_result)) -def count_tokens(Column strings, object delimiter): - if isinstance(delimiter, DeviceScalar): - return _count_tokens_scalar(strings, delimiter) +def _count_tokens_scalar(Column strings, object py_delimiter): - if isinstance(delimiter, Column): - return _count_tokens_column(strings, delimiter) + cdef DeviceScalar delimiter = py_delimiter.device_value - raise TypeError( - "Expected a DeviceScalar or Column for delimiters, but got {}".format( - type(delimiter) - ) - ) - - -def _count_tokens_scalar(Column strings, DeviceScalar delimiter): cdef column_view c_strings = strings.view() cdef const string_scalar* c_delimiter = delimiter\ .get_raw_ptr() @@ -123,7 +100,10 @@ def character_tokenize(Column strings): return Column.from_unique_ptr(move(c_result)) -def detokenize(Column strings, Column indices, DeviceScalar separator): +def detokenize(Column strings, Column indices, object py_separator): + + cdef DeviceScalar separator = py_separator.device_value + cdef column_view c_strings = strings.view() cdef column_view c_indices = indices.view() cdef const string_scalar* c_separator = separator\ diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index ff13b5a0e5f..5d8d1522418 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -19,10 +19,13 @@ from cudf._lib.cpp.strings.char_types cimport ( ) -def filter_alphanum(Column source_strings, DeviceScalar repl, bool keep=True): +def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. 
""" + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_repl = ( diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index f57e79ad547..04fde5be9e8 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -20,13 +20,16 @@ from cudf._lib.cpp.strings.combine cimport ( def concatenate(Table source_strings, - DeviceScalar separator, - DeviceScalar narep): + object py_separator, + object py_narep): """ Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `separator` between each column and - `na`/`None` values are replaced by `narep` + with the specified `py_separator` between each column and + `na`/`None` values are replaced by `py_narep` """ + cdef DeviceScalar separator = py_separator.device_value + cdef DeviceScalar narep = py_narep.device_value + cdef unique_ptr[column] c_result cdef table_view source_view = source_strings.data_view() @@ -47,13 +50,17 @@ def concatenate(Table source_strings, def join(Column source_strings, - DeviceScalar separator, - DeviceScalar narep): + object py_separator, + object py_narep): """ Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `separator` between each column and - `na`/`None` values are replaced by `narep` + with the specified `py_separator` between each column and + `na`/`None` values are replaced by `py_narep` """ + + cdef DeviceScalar separator = py_separator.device_value + cdef DeviceScalar narep = py_narep.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx index 3a6218ff875..3a360d31ef2 100644 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ b/python/cudf/cudf/_lib/strings/find.pyx @@ -18,11 +18,13 @@ from cudf._lib.cpp.strings.find cimport ( ) -def contains(Column source_strings, DeviceScalar target): +def contains(Column source_strings, object py_target): """ Returns a Column of boolean values with True for `source_strings` - that contain the pattern given in `target`. + that contain the pattern given in `py_target`. """ + cdef DeviceScalar target = py_target.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -57,11 +59,14 @@ def contains_multiple(Column source_strings, Column target_strings): return Column.from_unique_ptr(move(c_result)) -def endswith(Column source_strings, DeviceScalar target): +def endswith(Column source_strings, object py_target): """ Returns a Column of boolean values with True for `source_strings` - that contain strings that end with the pattern given in `target`. + that contain strings that end with the pattern given in `py_target`. """ + + cdef DeviceScalar target = py_target.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -97,11 +102,14 @@ def endswith_multiple(Column source_strings, Column target_strings): return Column.from_unique_ptr(move(c_result)) -def startswith(Column source_strings, DeviceScalar target): +def startswith(Column source_strings, object py_target): """ Returns a Column of boolean values with True for `source_strings` - that contain strings that start with the pattern given in `target`. + that contain strings that start with the pattern given in `py_target`. 
""" + + cdef DeviceScalar target = py_target.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -138,15 +146,18 @@ def startswith_multiple(Column source_strings, Column target_strings): def find(Column source_strings, - DeviceScalar target, + object py_target, size_type start, size_type end): """ Returns a Column containing lowest indexes in each string of - `source_strings` that fully contain `target` string. + `source_strings` that fully contain `py_target` string. Scan portion of strings in `source_strings` can be controlled by setting `start` and `end` values. """ + + cdef DeviceScalar target = py_target.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -166,15 +177,18 @@ def find(Column source_strings, def rfind(Column source_strings, - DeviceScalar target, + object py_target, size_type start, size_type end): """ Returns a Column containing highest indexes in each string of - `source_strings` that fully contain `target` string. + `source_strings` that fully contain `py_target` string. Scan portion of strings in `source_strings` can be controlled by setting `start` and `end` values. """ + + cdef DeviceScalar target = py_target.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 97f922aadb1..429e356be4a 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -24,13 +24,15 @@ from cudf._lib.cpp.strings.substring cimport ( def slice_replace(Column source_strings, size_type start, size_type stop, - DeviceScalar repl): + object py_repl): """ Returns a Column by replacing specified section - of each string with `repl`. Positions can be + of each string with `py_repl`. Positions can be specified with `start` and `stop` params. """ + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -51,11 +53,14 @@ def slice_replace(Column source_strings, def insert(Column source_strings, size_type start, - DeviceScalar repl): + object py_repl): """ Returns a Column by inserting a specified - string `repl` at a specific position in all strings. + string `py_repl` at a specific position in all strings. """ + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -75,14 +80,16 @@ def insert(Column source_strings, def replace(Column source_strings, - DeviceScalar target, - DeviceScalar repl, + object py_target, + object py_repl, int32_t maxrepl): """ Returns a Column after replacing occurrences of - patterns `target` with `repl` in `source_strings`. + patterns `py_target` with `py_repl` in `source_strings`. `maxrepl` indicates number of replacements to make from start. 
""" + cdef DeviceScalar target = py_target.device_value + cdef DeviceScalar repl = py_repl.device_value cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index e2223169313..7993e3a172f 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -20,15 +20,17 @@ from libcpp.string cimport string def replace_re(Column source_strings, object pattern, - DeviceScalar repl, + object py_repl, size_type n): """ Returns a Column after replacing occurrences regular - expressions `pattern` with `repl` in `source_strings`. + expressions `pattern` with `py_repl` in `source_strings`. `n` indicates the number of resplacements to be made from start. (-1 indicates all) """ + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index c7aa4621e98..64d625bcb26 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -21,11 +21,14 @@ from cudf._lib.cpp.strings.split.partition cimport ( def partition(Column source_strings, - DeviceScalar delimiter): + object py_delimiter): """ Returns a Table by splitting the `source_strings` - column at the first occurrence of the specified `delimiter`. + column at the first occurrence of the specified `py_delimiter`. """ + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[table] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( @@ -45,11 +48,14 @@ def partition(Column source_strings, def rpartition(Column source_strings, - DeviceScalar delimiter): + object py_delimiter): """ Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `delimiter`. + column at the last occurrence of the specified `py_delimiter`. """ + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[table] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 234974cf340..2dd66f99ad5 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -23,13 +23,16 @@ from cudf._lib.cpp.strings.split.split cimport ( def split(Column source_strings, - DeviceScalar delimiter, + object py_delimiter, size_type maxsplit): """ Returns a Table by splitting the `source_strings` - column around the specified `delimiter`. + column around the specified `py_delimiter`. The split happens from beginning. """ + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[table] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( @@ -50,13 +53,16 @@ def split(Column source_strings, def split_record(Column source_strings, - DeviceScalar delimiter, + object py_delimiter, size_type maxsplit): """ Returns a Column by splitting the `source_strings` - column around the specified `delimiter`. + column around the specified `py_delimiter`. The split happens from beginning. 
""" + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( @@ -76,13 +82,16 @@ def split_record(Column source_strings, def rsplit(Column source_strings, - DeviceScalar delimiter, + object py_delimiter, size_type maxsplit): """ Returns a Table by splitting the `source_strings` - column around the specified `delimiter`. + column around the specified `py_delimiter`. The split happens from the end. """ + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[table] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( @@ -103,13 +112,16 @@ def rsplit(Column source_strings, def rsplit_record(Column source_strings, - DeviceScalar delimiter, + object py_delimiter, size_type maxsplit): """ Returns a Column by splitting the `source_strings` - column around the specified `delimiter`. + column around the specified `py_delimiter`. The split happens from the end. """ + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_str = ( diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 27b0127e128..72dffa3d897 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -17,12 +17,15 @@ from cudf._lib.cpp.strings.strip cimport ( def strip(Column source_strings, - DeviceScalar repl): + object py_repl): """ Returns a Column by removing leading and trailing characters. The set of characters need be stripped from left and right side - can be specified by `repl`. + can be specified by `py_repl`. """ + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -41,12 +44,15 @@ def strip(Column source_strings, def lstrip(Column source_strings, - DeviceScalar repl): + object py_repl): """ Returns a Column by removing leading and trailing characters. The set of characters need be stripped from left side can - be specified by `repl`. + be specified by `py_repl`. """ + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() @@ -65,12 +71,15 @@ def lstrip(Column source_strings, def rstrip(Column source_strings, - DeviceScalar repl): + object py_repl): """ Returns a Column by removing leading and trailing characters. The set of characters need be stripped from right side can - be specified by `repl`. + be specified by `py_repl`. """ + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index f595bf5c40c..32b145736ca 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -54,11 +54,14 @@ def translate(Column source_strings, def filter_characters(Column source_strings, object mapping_table, bool keep, - DeviceScalar repl): + object py_repl): """ Removes or keeps individual characters within each string using the provided mapping_table. 
""" + + cdef DeviceScalar repl = py_repl.device_value + cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() cdef const string_scalar* scalar_repl = ( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 24d827e2fab..851562a78c7 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -81,9 +81,7 @@ def binary_operator(self, binop, rhs, reflect=False): tmp = rhs if reflect: tmp = self - if isinstance( - rhs, (NumericalColumn, cudf.Scalar, cudf._lib.scalar.DeviceScalar) - ) or np.isscalar(rhs): + if isinstance(rhs, (NumericalColumn, cudf.Scalar)) or np.isscalar(rhs): out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: if (tmp.dtype in int_dtypes) and ( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4d281aa62f8..6d478f64d5d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -37,12 +37,13 @@ subword_tokenize as cpp_subword_tokenize, ) from cudf._lib.nvtext.tokenize import ( + _count_tokens_column as cpp_count_tokens_column, + _count_tokens_scalar as cpp_count_tokens_scalar, + _tokenize_column as cpp_tokenize_column, + _tokenize_scalar as cpp_tokenize_scalar, character_tokenize as cpp_character_tokenize, - count_tokens as cpp_count_tokens, detokenize as cpp_detokenize, - tokenize as cpp_tokenize, ) -from cudf._lib.scalar import DeviceScalar, as_device_scalar from cudf._lib.strings.attributes import ( code_points as cpp_code_points, count_bytes as cpp_count_bytes, @@ -425,9 +426,7 @@ def cat(self, others=None, sep=None, na_rep=None, **kwargs): if others is None: data = cpp_join( - self._column, - as_device_scalar(sep), - as_device_scalar(na_rep, "str"), + self._column, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) else: other_cols = _get_cols_list(self._parent, others) @@ -436,8 +435,8 @@ def cat(self, others=None, sep=None, na_rep=None, **kwargs): cudf.DataFrame( {index: value for index, value in enumerate(all_cols)} ), - as_device_scalar(sep), - as_device_scalar(na_rep, "str"), + cudf.Scalar(sep), + cudf.Scalar(na_rep, "str"), ) if len(data) == 1 and data.null_count == 1: @@ -648,7 +647,7 @@ def contains( result_col = cpp_contains_re(self._column, pat) else: result_col = cpp_contains( - self._column, as_device_scalar(pat, "str") + self._column, cudf.Scalar(pat, "str") ) else: result_col = cpp_contains_multiple( @@ -751,12 +750,12 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( - cpp_replace_re(self._column, pat, as_device_scalar(repl, "str"), n) + cpp_replace_re(self._column, pat, cudf.Scalar(repl, "str"), n) if regex is True and len(pat) > 1 else cpp_replace( self._column, - as_device_scalar(pat, "str"), - as_device_scalar(repl, "str"), + cudf.Scalar(pat, "str"), + cudf.Scalar(repl, "str"), n, ), **kwargs, @@ -1741,7 +1740,7 @@ def filter_alphanum(self, repl=None, keep=True, **kwargs): repl = "" return self._return_or_inplace( - cpp_filter_alphanum(self._column, as_device_scalar(repl), keep), + cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), **kwargs, ) @@ -1871,9 +1870,7 @@ def slice_replace(self, start=None, stop=None, repl=None, **kwargs): repl = "" return self._return_or_inplace( - cpp_slice_replace( - self._column, start, stop, as_device_scalar(repl) - ), + cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), **kwargs, ) @@ -1924,7 
+1921,7 @@ def insert(self, start=0, repl=None, **kwargs): repl = "" return self._return_or_inplace( - cpp_string_insert(self._column, start, as_device_scalar(repl)), + cpp_string_insert(self._column, start, cudf.Scalar(repl)), **kwargs, ) @@ -2107,14 +2104,14 @@ def split(self, pat=None, n=-1, expand=False, **kwargs): result_table = [self._column.copy()] else: result_table = cpp_split( - self._column, as_device_scalar(pat, "str"), n + self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._parent): result_table = [] else: result_table = cpp_split_record( - self._column, as_device_scalar(pat, "str"), n + self._column, cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, **kwargs,) @@ -2260,16 +2257,12 @@ def rsplit(self, pat=None, n=-1, expand=False, **kwargs): if self._column.null_count == len(self._column): result_table = [self._column.copy()] else: - result_table = cpp_rsplit( - self._column, as_device_scalar(pat), n - ) + result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._parent): result_table = [] else: - result_table = cpp_rsplit_record( - self._column, as_device_scalar(pat), n - ) + result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) return self._return_or_inplace(result_table, **kwargs) @@ -2357,7 +2350,7 @@ def partition(self, sep=" ", expand=True, **kwargs): sep = " " return self._return_or_inplace( - cpp_partition(self._column, as_device_scalar(sep)), **kwargs + cpp_partition(self._column, cudf.Scalar(sep)), **kwargs ) def rpartition(self, sep=" ", expand=True, **kwargs): @@ -2428,7 +2421,7 @@ def rpartition(self, sep=" ", expand=True, **kwargs): sep = " " return self._return_or_inplace( - cpp_rpartition(self._column, as_device_scalar(sep)), **kwargs + cpp_rpartition(self._column, cudf.Scalar(sep)), **kwargs ) def pad(self, width, side="left", fillchar=" ", **kwargs): @@ -2827,7 +2820,7 @@ def strip(self, to_strip=None, **kwargs): to_strip = "" return self._return_or_inplace( - cpp_strip(self._column, as_device_scalar(to_strip)), **kwargs + cpp_strip(self._column, cudf.Scalar(to_strip)), **kwargs ) def lstrip(self, to_strip=None, **kwargs): @@ -2874,7 +2867,7 @@ def lstrip(self, to_strip=None, **kwargs): to_strip = "" return self._return_or_inplace( - cpp_lstrip(self._column, as_device_scalar(to_strip)), **kwargs + cpp_lstrip(self._column, cudf.Scalar(to_strip)), **kwargs ) def rstrip(self, to_strip=None, **kwargs): @@ -2929,7 +2922,7 @@ def rstrip(self, to_strip=None, **kwargs): to_strip = "" return self._return_or_inplace( - cpp_rstrip(self._column, as_device_scalar(to_strip)), **kwargs + cpp_rstrip(self._column, cudf.Scalar(to_strip)), **kwargs ) def wrap(self, width, **kwargs): @@ -3289,9 +3282,7 @@ def endswith(self, pat, **kwargs): len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_endswith( - self._column, as_device_scalar(pat, "str") - ) + result_col = cpp_endswith(self._column, cudf.Scalar(pat, "str")) else: result_col = cpp_endswith_multiple( self._column, column.as_column(pat, dtype="str") @@ -3356,9 +3347,7 @@ def startswith(self, pat, **kwargs): len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_startswith( - self._column, as_device_scalar(pat, "str") - ) + result_col = cpp_startswith(self._column, cudf.Scalar(pat, "str")) else: result_col = cpp_startswith_multiple( self._column, column.as_column(pat, 
dtype="str") @@ -3416,7 +3405,7 @@ def find(self, sub, start=0, end=None, **kwargs): end = -1 result_col = cpp_find( - self._column, as_device_scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col, **kwargs) @@ -3475,7 +3464,7 @@ def rfind(self, sub, start=0, end=None, **kwargs): end = -1 result_col = cpp_rfind( - self._column, as_device_scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col, **kwargs) @@ -3530,7 +3519,7 @@ def index(self, sub, start=0, end=None, **kwargs): end = -1 result_col = cpp_find( - self._column, as_device_scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col, **kwargs) @@ -3590,7 +3579,7 @@ def rindex(self, sub, start=0, end=None, **kwargs): end = -1 result_col = cpp_rfind( - self._column, as_device_scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col, **kwargs) @@ -3846,7 +3835,7 @@ def filter_characters(self, table, keep=True, repl=None, **kwargs): table = str.maketrans(table) return self._return_or_inplace( cpp_filter_characters( - self._column, table, keep, as_device_scalar(repl) + self._column, table, keep, cudf.Scalar(repl) ), **kwargs, ) @@ -3953,9 +3942,20 @@ def tokenize(self, delimiter=" ", **kwargs): """ delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) kwargs.setdefault("retain_index", False) - return self._return_or_inplace( - cpp_tokenize(self._column, delimiter), **kwargs - ) + + if isinstance(delimiter, Column): + return self._return_or_inplace( + cpp_tokenize_column(self._column, delimiter), **kwargs + ) + elif isinstance(delimiter, cudf.Scalar): + return self._return_or_inplace( + cpp_tokenize_scalar(self._column, delimiter), **kwargs + ) + else: + raise TypeError( + f"Expected a Scalar or Column\ + for delimiters, but got {type(delimiter)}" + ) def detokenize(self, indices, separator=" ", **kwargs): """ @@ -4074,9 +4074,20 @@ def token_count(self, delimiter=" ", **kwargs): dtype: int32 """ delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - return self._return_or_inplace( - cpp_count_tokens(self._column, delimiter), **kwargs - ) + if isinstance(delimiter, Column): + return self._return_or_inplace( + cpp_count_tokens_column(self._column, delimiter) + ) + + elif isinstance(delimiter, cudf.Scalar): + return self._return_or_inplace( + cpp_count_tokens_scalar(self._column, delimiter) + ) + else: + raise TypeError( + f"Expected a Scalar or Column\ + for delimiters, but got {type(delimiter)}" + ) def ngrams(self, n=2, separator="_", **kwargs): """ @@ -4267,7 +4278,7 @@ def replace_tokens(self, targets, replacements, delimiter=None, **kwargs): self._column, targets_column, replacements_column, - as_device_scalar(delimiter, dtype="str"), + cudf.Scalar(delimiter, dtype="str"), ), **kwargs, ) @@ -4335,8 +4346,8 @@ def filter_tokens( cpp_filter_tokens( self._column, min_token_length, - as_device_scalar(replacement, dtype="str"), - as_device_scalar(delimiter, dtype="str"), + cudf.Scalar(replacement, dtype="str"), + cudf.Scalar(delimiter, dtype="str"), ), **kwargs, ) @@ -4610,10 +4621,7 @@ def edit_distance(self, targets, **kwargs): def _massage_string_arg(value, name, allow_col=False): if isinstance(value, str): - return as_device_scalar(value, dtype="str") - - if isinstance(value, DeviceScalar) and 
is_string_dtype(value.dtype): - return value + return cudf.Scalar(value, dtype="str") allowed_types = ["Scalar"] diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 20914a88fe5..389dd52f21c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -9,7 +9,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.scalar import DeviceScalar, as_device_scalar from cudf.core.column import column, string from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype @@ -104,16 +103,14 @@ def _binary_op_floordiv(self, rhs): if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") - - if isinstance(rhs, (cudf.Scalar, DeviceScalar)): + if isinstance(rhs, cudf.Scalar): if rhs.is_valid: rhs = np.timedelta64(rhs.value) rhs = rhs.astype(common_dtype).astype("float64") else: - rhs = as_device_scalar(None, "float64") + rhs = cudf.Scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("int64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype @@ -171,12 +168,11 @@ def _binary_op_truediv(self, rhs): if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") - - if isinstance(rhs, (cudf.Scalar, DeviceScalar)): + if isinstance(rhs, cudf.Scalar): if rhs.is_valid(): rhs = rhs.value.astype(common_dtype).astype("float64") else: - rhs = as_device_scalar(None, "float64") + rhs = cudf.Scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") @@ -219,7 +215,6 @@ def binary_operator(self, op, rhs, reflect=False): if reflect: lhs, rhs = rhs, lhs - return binop(lhs, rhs, op=op, out_dtype=out_dtype) def normalize_binop_value(self, other): @@ -380,61 +375,61 @@ def components(self, index=None): return cudf.DataFrame( data={ "days": self - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") ), "hours": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["h"], "ns") ), "minutes": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["h"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["m"], "ns") ), "seconds": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["m"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") ), "milliseconds": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns") ), "microseconds": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") ), "nanoseconds": ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") ) ) - // as_device_scalar( + // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns") ), }, @@ -450,7 +445,7 @@ def days(self): ------- NumericalColumn """ - 
return self // as_device_scalar( + return self // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") ) @@ -470,10 +465,10 @@ def seconds(self): return ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["D"], "ns") ) - ) // as_device_scalar( + ) // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") ) @@ -493,7 +488,7 @@ def microseconds(self): return ( self % np.timedelta64(_numpy_to_pandas_conversion["s"], "ns") - ) // as_device_scalar( + ) // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") ) @@ -514,10 +509,10 @@ def nanoseconds(self): return ( self - % as_device_scalar( + % cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["us"], "ns") ) - ) // as_device_scalar( + ) // cudf.Scalar( np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns") ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 835047f1b3d..732aafee3c0 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ from pandas.core.tools.datetimes import _unit_map import cudf -from cudf._lib.scalar import as_device_scalar from cudf._lib.strings.char_types import is_integer as cpp_is_integer from cudf.core import column from cudf.core.index import as_index @@ -180,7 +179,7 @@ def to_datetime( except ValueError: current_col = current_col.astype(dtype="float64") - factor = as_device_scalar( + factor = cudf.Scalar( column.datetime._numpy_to_pandas_conversion[u] / ( column.datetime._numpy_to_pandas_conversion["s"] @@ -257,7 +256,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): if col.dtype.kind in ("f"): if unit not in (None, "ns"): - factor = as_device_scalar( + factor = cudf.Scalar( column.datetime._numpy_to_pandas_conversion[unit] ) col = col * factor @@ -284,7 +283,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): if col.dtype.kind in ("i"): if unit in ("D", "h", "m"): - factor = as_device_scalar( + factor = cudf.Scalar( column.datetime._numpy_to_pandas_conversion[unit] / column.datetime._numpy_to_pandas_conversion["s"] ) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c4592dfa100..18ffed91751 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -4,8 +4,8 @@ from collections.abc import Sequence from math import floor, isinf, isnan -import numpy as np import cupy as cp +import numpy as np import pandas as pd from numba import njit
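
For review context, a minimal sketch of the calling convention this patch standardizes on. The identifiers (`cudf.Scalar`, `DeviceScalar`, `.device_value`, `cpp_strip`, `get_raw_ptr`) are taken from the diff above; the `Series` example values are illustrative only and assume a working cudf install.

    # Python layer (e.g. cudf/core/column/string.py): wrap host values in
    # cudf.Scalar and hand them to Cython as plain Python objects:
    #     cpp_strip(self._column, cudf.Scalar(to_strip))
    #
    # Cython layer (e.g. cudf/_lib/strings/strip.pyx): unwrap to the
    # device-backed scalar only inside the .pyx function, then take the
    # raw libcudf pointer as before:
    #     def strip(Column source_strings, object py_repl):
    #         cdef DeviceScalar repl = py_repl.device_value
    #         ...  # repl.get_raw_ptr() feeds the libcudf string_scalar*
    #
    # End-user behavior is unchanged; the public API still accepts plain
    # Python strings:
    import cudf

    s = cudf.Series(["_a_", "b__"])
    print(s.str.strip("_"))  # routes a cudf.Scalar("_") through strip.pyx

Net effect: DeviceScalar becomes a Cython-only detail. The pure-Python modules touched here (numerical.py, timedelta.py, datetimes.py, string.py) now construct only cudf.Scalar, and the delimiter type dispatch for tokenize/token_count moves out of tokenize.pyx into string.py, which checks for Column vs. cudf.Scalar before calling the renamed _tokenize_scalar/_tokenize_column and _count_tokens_scalar/_count_tokens_column helpers.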