From 8bf0dd677799dc170d613e827a28f56e1422ec25 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 22 Jan 2021 18:41:34 -0800 Subject: [PATCH 01/23] fix inplace updation of data and add Series.update --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/series.py | 104 +++++++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 13 +-- python/cudf/cudf/tests/test_series.py | 37 ++++++++ 4 files changed, 146 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9b61a60830..e625db5b0c5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1545,7 +1545,7 @@ def update( if mask.all(): continue - self[col] = this.where(mask, that) + self[col].where(mask, that, inplace=True) def __add__(self, other): return self._apply_op("__add__", other) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 147262be08d..68424546bf4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2419,6 +2419,110 @@ def replace( return self._mimic_inplace(result, inplace=inplace) + def update(self, other): + """ + Modify Series in place using values from passed Series. + Uses non-NA values from passed Series to make updates. Aligns + on index. + + Parameters + ---------- + other : Series, or object coercible into Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.update(cudf.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. + + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False)) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update([4, np.nan, 6]) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: int64 + """ + + if not isinstance(other, cudf.Series): + other = cudf.Series(other) + + if not self.index.equals(other.index): + other = other.reindex(index=self.index) + mask = other.notna() + + self.mask(mask, other, inplace=True) + def reverse(self): """Reverse the Series """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f41714ec1ad..8967b4f299d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8175,9 +8175,6 @@ def test_agg_for_dataframe_with_string_columns(aggs): @pytest.mark.parametrize( "overwrite", [True, False], ) -@pytest.mark.parametrize( - "filter_func", [None], -) @pytest.mark.parametrize( "errors", ["ignore"], ) @@ -8222,19 +8219,17 @@ def test_agg_for_dataframe_with_string_columns(aggs): }, ], ) -def test_update_for_dataframes( - data, data2, join, overwrite, filter_func, errors -): +def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) other_pd = pd.DataFrame(data2) other_gd = gd.DataFrame(data2) - expect = pdf.update(other_pd, join, overwrite, filter_func, errors) - got = gdf.update(other_gd, join, overwrite, filter_func, errors) + pdf.update(other_pd, join, overwrite, errors) + gdf.update(other_gd, join, overwrite, errors) - assert_eq(expect, got) + assert_eq(pdf, gdf) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 980dcb5a13b..a97b632a07c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. + import operator import re from string import ascii_letters, digits @@ -913,3 +914,39 @@ def custom_add_func(sr, val): lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), ) + + +@pytest.mark.parametrize( + "data", + [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])], +) +@pytest.mark.parametrize( + "other", + [ + cudf.Series([4, 5, 6]), + cudf.Series([4, 5, 6, 7, 8]), + cudf.Series([4, np.nan, 6], nan_as_null=False), + [4, np.nan, 6], + {1: 9}, + ], +) +def test_series_update(data, other): + gs = data.copy(deep=True) + if isinstance(other, cudf.Series): + g_other = other.copy(deep=True) + p_other = g_other.to_pandas() + else: + g_other = other + p_other = other + + ps = gs.to_pandas() + + gs_column_before = gs._column + gs.update(g_other) + gs_column_after = gs._column + + assert_eq(gs_column_before.to_array(), gs_column_after.to_array()) + + ps.update(p_other) + + assert_eq(gs, ps) From c87de031660961fee8a875ce0db391b9a25cde90 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 1 Feb 2021 17:39:48 -0800 Subject: [PATCH 02/23] fix where type-casting logic and update logic --- python/cudf/cudf/_lib/copying.pyx | 4 +- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/frame.py | 245 ++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 6 +- python/cudf/cudf/tests/test_replace.py | 27 ++- python/cudf/cudf/tests/test_series.py | 13 ++ python/cudf/cudf/utils/dtypes.py | 9 +- 7 files changed, 226 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ad798a73ed2..9f56297edaa 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -554,11 +554,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask): return _copy_if_else_column_column(lhs, rhs, boolean_mask) else: return _copy_if_else_column_scalar( - lhs, as_device_scalar(rhs, lhs.dtype), boolean_mask) + lhs, as_device_scalar(rhs), boolean_mask) else: if isinstance(rhs, Column): return _copy_if_else_scalar_column( - as_device_scalar(lhs, rhs.dtype), rhs, boolean_mask) + as_device_scalar(lhs), rhs, boolean_mask) else: if lhs is None and rhs is None: return lhs diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7e98bcfb156..7a6f1c9d52c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1547,8 +1547,9 @@ def update( if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) - for col in self.columns: - this = self[col] + source_df = self.copy(deep=False) + for col in source_df.columns: + this = source_df[col] that = other[col] if errors == "raise": @@ -1565,8 +1566,9 @@ def update( # don't overwrite columns unnecessarily if mask.all(): continue + source_df[col] = source_df[col].where(mask, that) - self[col].where(mask, that, inplace=True) + self._mimic_inplace(source_df, inplace=True) def __add__(self, other): return self._apply_op("__add__", other) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5bc5675e1e6..96845755c17 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -738,29 +738,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - def _normalize_scalars(self, other): - """ - Try to normalizes scalar values as per self dtype - """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (self.dtype.type(other) != other): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {self.dtype.name}" - ) - - return ( - self.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) - else other - ) - - def _normalize_columns_and_scalars_type(self, other): + def _normalize_columns_and_scalars_type(self, other, inplace=False): """ Try to normalize the other's dtypes as per self. @@ -785,34 +763,57 @@ def _normalize_columns_and_scalars_type(self, other): if isinstance(self, cudf.DataFrame) and isinstance( other, cudf.DataFrame ): - return [ - other[self_col].astype(self._data[self_col].dtype)._column - for self_col in self._data.names - ] + source_df = self.copy() + other_df = other.copy() + for self_col in source_df._data.names: + source_col, other_col = _check_and_cast_columns( + source_col=source_df._data[self_col], + other_col=other_df._data[self_col], + inplace=inplace, + ) + source_df._data[self_col] = source_col + other_df._data[self_col] = other_col + return source_df, other_df elif isinstance(self, (cudf.Series, cudf.Index)) and not is_scalar( other ): other = as_column(other) - return other.astype(self.dtype) - + input_col = self._data[self.name] + return _check_and_cast_columns( + source_col=input_col, other_col=other, inplace=inplace + ) else: # Handles scalar or list/array like scalars if isinstance(self, (cudf.Series, cudf.Index)) and is_scalar( other ): - return self._normalize_scalars(other) + input_col = self._data[self.name] + return _check_and_cast_columns_with_scalar( + source_col=self._data[self.name], + other_scalar=other, + inplace=inplace, + ) elif isinstance(self, cudf.DataFrame): - out = [] if is_scalar(other): other = [other for i in range(len(self._data.names))] - out = [ - self[in_col_name]._normalize_scalars(sclr) - for in_col_name, sclr in zip(self._data.names, other) - ] - return out + source_df = self.copy() + others = [] + for col_name, other_sclr in zip(self._data.names, other): + + ( + source_col, + other_scalar, + ) = _check_and_cast_columns_with_scalar( + source_col=source_df._data[col_name], + other_scalar=other_sclr, + inplace=inplace, + ) + source_df._data[col_name] = source_col + others.append(other_scalar) + return source_df, others else: raise ValueError( f"Inappropriate input {type(self)} " @@ -897,27 +898,33 @@ def where(self, cond, other=None, inplace=False): # as `cond` has no column names. cond.columns = self.columns - other = self._normalize_columns_and_scalars_type(other) + source_df, others = self._normalize_columns_and_scalars_type(other) + if isinstance(other, Frame): + others = others._data.columns + out_df = cudf.DataFrame(index=self.index) - if len(self._columns) != len(other): + if len(self._columns) != len(others): raise ValueError( """Replacement list length or number of dataframe columns should be equal to Number of columns of dataframe""" ) - - for column_name, other_column in zip(self._data.names, other): - input_col = self._data[column_name] + for i, column_name in enumerate(self._data.names): + input_col = source_df._data[column_name] + other_column = others[i] if column_name in cond._data: if isinstance( input_col, cudf.core.column.CategoricalColumn ): - if np.isscalar(other_column): + if is_scalar(other_column): try: other_column = input_col._encode(other_column) except ValueError: # When other is not present in categories, # fill with Null. other_column = None + other_column = cudf.Scalar( + other_column, dtype=input_col.codes.dtype + ) elif hasattr(other_column, "codes"): other_column = other_column.codes input_col = input_col.codes @@ -952,45 +959,49 @@ def where(self, cond, other=None, inplace=False): return self._mimic_inplace(out_df, inplace=inplace) else: - if isinstance(other, cudf.DataFrame): raise NotImplementedError( "cannot align with a higher dimensional Frame" ) - - other = self._normalize_columns_and_scalars_type(other) - + input_col = self._data[self.name] cond = as_column(cond) if len(cond) != len(self): raise ValueError( """Array conditional must be same shape as self""" ) - input_col = self._data[self.name] - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if np.isscalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. - other = None - elif hasattr(other, "codes"): - other = other.codes - - input_col = input_col.codes - - result = libcudf.copying.copy_if_else(input_col, other, cond) - - if is_categorical_dtype(self.dtype): - result = build_categorical_column( - categories=self._data[self.name].categories, - codes=as_column(result.base_data, dtype=result.dtype), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[self.name].ordered, + if cond.all(): + result = input_col + else: + input_col, other = self._normalize_columns_and_scalars_type( + other, inplace ) + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if is_scalar(other): + try: + other = input_col._encode(other) + except ValueError: + # When other is not present in categories, + # fill with Null. + other = None + other = cudf.Scalar(other, dtype=input_col.codes.dtype) + elif hasattr(other, "codes"): + other = other.codes + + input_col = input_col.codes + + result = libcudf.copying.copy_if_else(input_col, other, cond) + + if is_categorical_dtype(self.dtype): + result = build_categorical_column( + categories=self._data[self.name].categories, + codes=as_column(result.base_data, dtype=result.dtype), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=self._data[self.name].ordered, + ) + if isinstance(self, cudf.Index): from cudf.core.index import as_index @@ -3746,3 +3757,97 @@ def _reassign_categories(categories, cols, col_idxs): offset=cols[name].offset, size=cols[name].size, ) + + +def _normalize_scalars(col, other): + """ + Try to normalizes scalar values as per col dtype + """ + if ( + other is not None + and (isinstance(other, float) and not np.isnan(other)) + ) and (col.dtype.type(other) != other): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {col.dtype.name}" + ) + + return ( + col.dtype.type(other) + if ( + other is not None + and (isinstance(other, float) and not np.isnan(other)) + ) + else other + ) + + +def _check_and_cast_columns(source_col, other_col, inplace): + """ + Returns type-casted columns of `source_col` & `other_col` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other_col + elif cudf.utils.dtypes.is_mixed_with_object_dtype(source_col, other_col): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ) + if inplace: + if not source_col.can_cast_safely(other_col.dtype): + warnings.warn( + f"Type-casting from {other_col.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, other_col.astype(source_col.dtype) + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [source_col.dtype, other_col.dtype] + ) + return source_col.astype(common_dtype), other_col.astype(common_dtype) + + +def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): + """ + Returns type-casted column `source_col` & scalar `other_scalar` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other_scalar + + device_scalar = cudf.Scalar( + _normalize_scalars(source_col, other_scalar), + dtype=source_col.dtype if other_scalar is None else None, + ) + + if other_scalar is None: + return source_col, device_scalar + elif cudf.utils.dtypes.is_mixed_with_object_dtype( + device_scalar, source_col + ): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ) + if inplace: + if not np.can_cast(device_scalar, source_col.dtype): + warnings.warn( + f"Type-casting from {device_scalar.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, device_scalar.astype(source_col.dtype) + else: + if pd.api.types.is_numeric_dtype(source_col.dtype) and np.can_cast( + other_scalar, source_col.dtype + ): + common_dtype = source_col.dtype + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [source_col.dtype, np.min_scalar_type(other_scalar)] + ) + + source_col = source_col.astype(common_dtype) + return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8967b4f299d..3ac2b4fc918 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8226,10 +8226,10 @@ def test_update_for_dataframes(data, data2, join, overwrite, errors): other_pd = pd.DataFrame(data2) other_gd = gd.DataFrame(data2) - pdf.update(other_pd, join, overwrite, errors) - gdf.update(other_gd, join, overwrite, errors) + pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) + gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) - assert_eq(pdf, gdf) + assert_eq(pdf, gdf, check_dtype=False) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 5338761372f..b2468ea990f 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -610,25 +610,40 @@ def test_series_where(data_dtype, fill_value): sr.where(sr > 0, fill_value) else: # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(psr > 0, fill_value).astype(psr.dtype) + expect = psr.where(psr > 0, fill_value) got = sr.where(sr > 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr < 0, fill_value) else: - expect = psr.where(psr < 0, fill_value).astype(psr.dtype) + expect = psr.where(psr < 0, fill_value) got = sr.where(sr < 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr == 0, fill_value) else: - expect = psr.where(psr == 0, fill_value).astype(psr.dtype) + expect = psr.where(psr == 0, fill_value) got = sr.where(sr == 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) @pytest.mark.parametrize("fill_value", [100, 100.0, 100.5]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 5b4b260b9a3..c86e537f740 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -971,3 +971,16 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): actual = gs.fillna(fill_value) assert_eq(expected, actual) + + +def test_series_mask_mixed_dtypes_error(): + s = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match=re.escape( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ), + ): + s.where([True, False, True], [1, 2, 3]) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index d49b4abd399..d6d34be192e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -318,7 +318,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if not is_scalar(val): raise ValueError( f"Cannot convert value of type {type(val).__name__} " - " to cudf scalar" + "to cudf scalar" ) if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: @@ -624,7 +624,12 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - return np.find_common_type(list(dtypes), []) + common_dtype = np.find_common_type(list(dtypes), []) + if common_dtype == np.dtype("float16"): + # cuDF does not support float16 dtype + return np.dtype("float32") + else: + return common_dtype # Type dispatch loops similar to what are found in `np.add.types` From f05bda4ad58c9dcb6543af975268c2e9fbf97fe4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 2 Feb 2021 16:36:26 -0800 Subject: [PATCH 03/23] move where related internal apis --- python/cudf/cudf/core/__init__.py | 2 +- python/cudf/cudf/core/frame.py | 192 ++------------------ python/cudf/cudf/core/internals/__init__.py | 0 python/cudf/cudf/core/internals/where.py | 181 ++++++++++++++++++ 4 files changed, 194 insertions(+), 181 deletions(-) create mode 100644 python/cudf/cudf/core/internals/__init__.py create mode 100644 python/cudf/cudf/core/internals/where.py diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 91a369c31f8..a71f15dd95b 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from cudf.core import buffer, column, column_accessor, common +from cudf.core import buffer, column, column_accessor, common, internals from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d3c3c18050e..ebe91d12012 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.internals import where as where_internals from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, @@ -737,88 +738,6 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - def _normalize_columns_and_scalars_type(self, other, inplace=False): - """ - Try to normalize the other's dtypes as per self. - - Parameters - ---------- - - self : Can be a DataFrame or Series or Index - other : Can be a DataFrame, Series, Index, Array - like object or a scalar value - - if self is DataFrame, other can be only a - scalar or array like with size of number of columns - in DataFrame or a DataFrame with same dimension - - if self is Series, other can be only a scalar or - a series like with same length as self - - Returns: - -------- - A dataframe/series/list/scalar form of normalized other - """ - if isinstance(self, cudf.DataFrame) and isinstance( - other, cudf.DataFrame - ): - source_df = self.copy() - other_df = other.copy() - for self_col in source_df._data.names: - source_col, other_col = _check_and_cast_columns( - source_col=source_df._data[self_col], - other_col=other_df._data[self_col], - inplace=inplace, - ) - source_df._data[self_col] = source_col - other_df._data[self_col] = other_col - return source_df, other_df - - elif isinstance(self, (cudf.Series, cudf.Index)) and not is_scalar( - other - ): - other = as_column(other) - input_col = self._data[self.name] - return _check_and_cast_columns( - source_col=input_col, other_col=other, inplace=inplace - ) - else: - # Handles scalar or list/array like scalars - if isinstance(self, (cudf.Series, cudf.Index)) and is_scalar( - other - ): - input_col = self._data[self.name] - return _check_and_cast_columns_with_scalar( - source_col=self._data[self.name], - other_scalar=other, - inplace=inplace, - ) - - elif isinstance(self, cudf.DataFrame): - if is_scalar(other): - other = [other for i in range(len(self._data.names))] - - source_df = self.copy() - others = [] - for col_name, other_sclr in zip(self._data.names, other): - - ( - source_col, - other_scalar, - ) = _check_and_cast_columns_with_scalar( - source_col=source_df._data[col_name], - other_scalar=other_sclr, - inplace=inplace, - ) - source_df._data[col_name] = source_col - others.append(other_scalar) - return source_df, others - else: - raise ValueError( - f"Inappropriate input {type(self)} " - f"and other {type(other)} combination" - ) - def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -897,7 +816,12 @@ def where(self, cond, other=None, inplace=False): # as `cond` has no column names. cond.columns = self.columns - source_df, others = self._normalize_columns_and_scalars_type(other) + ( + source_df, + others, + ) = where_internals._normalize_columns_and_scalars_type( + self, other + ) if isinstance(other, Frame): others = others._data.columns @@ -971,8 +895,11 @@ def where(self, cond, other=None, inplace=False): if cond.all(): result = input_col else: - input_col, other = self._normalize_columns_and_scalars_type( - other, inplace + ( + input_col, + other, + ) = where_internals._normalize_columns_and_scalars_type( + self, other, inplace ) if isinstance(input_col, cudf.core.column.CategoricalColumn): @@ -2725,7 +2652,6 @@ def searchsorted( array([4, 4, 4, 0], dtype=int32) """ # Call libcudf++ search_sorted primitive - from cudf.utils.dtypes import is_scalar scalar_flag = None if is_scalar(values): @@ -3862,100 +3788,6 @@ def _reassign_categories(categories, cols, col_idxs): ) -def _normalize_scalars(col, other): - """ - Try to normalizes scalar values as per col dtype - """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (col.dtype.type(other) != other): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {col.dtype.name}" - ) - - return ( - col.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) - else other - ) - - -def _check_and_cast_columns(source_col, other_col, inplace): - """ - Returns type-casted columns of `source_col` & `other_col` - based on `inplace` parameter. - """ - if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): - return source_col, other_col - elif cudf.utils.dtypes.is_mixed_with_object_dtype(source_col, other_col): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "the column of dataframe/series and other " - "to same dtypes." - ) - if inplace: - if not source_col.can_cast_safely(other_col.dtype): - warnings.warn( - f"Type-casting from {other_col.dtype} " - f"to {source_col.dtype}, there could be potential data loss" - ) - return source_col, other_col.astype(source_col.dtype) - else: - common_dtype = cudf.utils.dtypes.find_common_type( - [source_col.dtype, other_col.dtype] - ) - return source_col.astype(common_dtype), other_col.astype(common_dtype) - - -def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): - """ - Returns type-casted column `source_col` & scalar `other_scalar` - based on `inplace` parameter. - """ - if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): - return source_col, other_scalar - - device_scalar = cudf.Scalar( - _normalize_scalars(source_col, other_scalar), - dtype=source_col.dtype if other_scalar is None else None, - ) - - if other_scalar is None: - return source_col, device_scalar - elif cudf.utils.dtypes.is_mixed_with_object_dtype( - device_scalar, source_col - ): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "the column of dataframe/series and other " - "to same dtypes." - ) - if inplace: - if not np.can_cast(device_scalar, source_col.dtype): - warnings.warn( - f"Type-casting from {device_scalar.dtype} " - f"to {source_col.dtype}, there could be potential data loss" - ) - return source_col, device_scalar.astype(source_col.dtype) - else: - if pd.api.types.is_numeric_dtype(source_col.dtype) and np.can_cast( - other_scalar, source_col.dtype - ): - common_dtype = source_col.dtype - else: - common_dtype = cudf.utils.dtypes.find_common_type( - [source_col.dtype, np.min_scalar_type(other_scalar)] - ) - - source_col = source_col.astype(common_dtype) - return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) - - def _is_series(obj): """ Checks if the `obj` is of type `cudf.Series` diff --git a/python/cudf/cudf/core/internals/__init__.py b/python/cudf/cudf/core/internals/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/internals/where.py new file mode 100644 index 00000000000..e51be89507e --- /dev/null +++ b/python/cudf/cudf/core/internals/where.py @@ -0,0 +1,181 @@ +import warnings + +import numpy as np +import pandas as pd + +import cudf + + +def _normalize_scalars(col, other): + """ + Try to normalizes scalar values as per col dtype + """ + if ( + other is not None + and (isinstance(other, float) and not np.isnan(other)) + ) and (col.dtype.type(other) != other): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {col.dtype.name}" + ) + + return ( + col.dtype.type(other) + if ( + other is not None + and (isinstance(other, float) and not np.isnan(other)) + ) + else other + ) + + +def _check_and_cast_columns(source_col, other_col, inplace): + """ + Returns type-casted columns of `source_col` & `other_col` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other_col + elif cudf.utils.dtypes.is_mixed_with_object_dtype(source_col, other_col): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ) + if inplace: + if not source_col.can_cast_safely(other_col.dtype): + warnings.warn( + f"Type-casting from {other_col.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, other_col.astype(source_col.dtype) + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [source_col.dtype, other_col.dtype] + ) + return source_col.astype(common_dtype), other_col.astype(common_dtype) + + +def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): + """ + Returns type-casted column `source_col` & scalar `other_scalar` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other_scalar + + device_scalar = cudf.Scalar( + _normalize_scalars(source_col, other_scalar), + dtype=source_col.dtype if other_scalar is None else None, + ) + + if other_scalar is None: + return source_col, device_scalar + elif cudf.utils.dtypes.is_mixed_with_object_dtype( + device_scalar, source_col + ): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ) + if inplace: + if not np.can_cast(device_scalar, source_col.dtype): + warnings.warn( + f"Type-casting from {device_scalar.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, device_scalar.astype(source_col.dtype) + else: + if pd.api.types.is_numeric_dtype(source_col.dtype) and np.can_cast( + other_scalar, source_col.dtype + ): + common_dtype = source_col.dtype + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [source_col.dtype, np.min_scalar_type(other_scalar)] + ) + + source_col = source_col.astype(common_dtype) + return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) + + +def _normalize_columns_and_scalars_type(frame, other, inplace=False): + """ + Try to normalize the other's dtypes as per frame. + + Parameters + ---------- + + frame : Can be a DataFrame or Series or Index + other : Can be a DataFrame, Series, Index, Array + like object or a scalar value + + if frame is DataFrame, other can be only a + scalar or array like with size of number of columns + in DataFrame or a DataFrame with same dimension + + if frame is Series, other can be only a scalar or + a series like with same length as frame + + Returns: + -------- + A dataframe/series/list/scalar form of normalized other + """ + if isinstance(frame, cudf.DataFrame) and isinstance(other, cudf.DataFrame): + source_df = frame.copy() + other_df = other.copy() + for self_col in source_df._data.names: + source_col, other_col = _check_and_cast_columns( + source_col=source_df._data[self_col], + other_col=other_df._data[self_col], + inplace=inplace, + ) + source_df._data[self_col] = source_col + other_df._data[self_col] = other_col + return source_df, other_df + + elif isinstance( + frame, (cudf.Series, cudf.Index) + ) and not cudf.utils.dtypes.is_scalar(other): + other = cudf.core.column.as_column(other) + input_col = frame._data[frame.name] + return _check_and_cast_columns( + source_col=input_col, other_col=other, inplace=inplace + ) + else: + # Handles scalar or list/array like scalars + if isinstance( + frame, (cudf.Series, cudf.Index) + ) and cudf.utils.dtypes.is_scalar(other): + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_scalar( + source_col=frame._data[frame.name], + other_scalar=other, + inplace=inplace, + ) + + elif isinstance(frame, cudf.DataFrame): + if cudf.utils.dtypes.is_scalar(other): + other = [other for i in range(len(frame._data.names))] + + source_df = frame.copy() + others = [] + for col_name, other_sclr in zip(frame._data.names, other): + + ( + source_col, + other_scalar, + ) = _check_and_cast_columns_with_scalar( + source_col=source_df._data[col_name], + other_scalar=other_sclr, + inplace=inplace, + ) + source_df._data[col_name] = source_col + others.append(other_scalar) + return source_df, others + else: + raise ValueError( + f"Inappropriate input {type(frame)} " + f"and other {type(other)} combination" + ) From 5e7cf4e224e18da9c2c5afc4f3fd11e8fe84281b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 3 Feb 2021 16:57:10 -0800 Subject: [PATCH 04/23] move where core logic to where.py --- python/cudf/cudf/core/frame.py | 148 +---------------- python/cudf/cudf/core/internals/where.py | 194 +++++++++++++++++++++++ 2 files changed, 197 insertions(+), 145 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ebe91d12012..7d5d50796a0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -791,151 +791,9 @@ def where(self, cond, other=None, inplace=False): dtype: int64 """ - if isinstance(self, cudf.DataFrame): - if hasattr(cond, "__cuda_array_interface__"): - cond = cudf.DataFrame( - cond, columns=self._data.names, index=self.index - ) - elif not isinstance(cond, cudf.DataFrame): - cond = self.from_pandas(pd.DataFrame(cond)) - - common_cols = set(self._data.names).intersection( - set(cond._data.names) - ) - if len(common_cols) > 0: - # If `self` and `cond` are having unequal index, - # then re-index `cond`. - if not self.index.equals(cond.index): - cond = cond.reindex(self.index) - else: - if cond.shape != self.shape: - raise ValueError( - """Array conditional must be same shape as self""" - ) - # Setting `self` column names to `cond` - # as `cond` has no column names. - cond.columns = self.columns - - ( - source_df, - others, - ) = where_internals._normalize_columns_and_scalars_type( - self, other - ) - if isinstance(other, Frame): - others = others._data.columns - - out_df = cudf.DataFrame(index=self.index) - if len(self._columns) != len(others): - raise ValueError( - """Replacement list length or number of dataframe columns - should be equal to Number of columns of dataframe""" - ) - for i, column_name in enumerate(self._data.names): - input_col = source_df._data[column_name] - other_column = others[i] - if column_name in cond._data: - if isinstance( - input_col, cudf.core.column.CategoricalColumn - ): - if is_scalar(other_column): - try: - other_column = input_col._encode(other_column) - except ValueError: - # When other is not present in categories, - # fill with Null. - other_column = None - other_column = cudf.Scalar( - other_column, dtype=input_col.codes.dtype - ) - elif hasattr(other_column, "codes"): - other_column = other_column.codes - input_col = input_col.codes - - result = libcudf.copying.copy_if_else( - input_col, other_column, cond._data[column_name] - ) - - if isinstance( - self._data[column_name], - cudf.core.column.CategoricalColumn, - ): - result = build_categorical_column( - categories=self._data[column_name].categories, - codes=as_column( - result.base_data, dtype=result.dtype - ), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[column_name].ordered, - ) - else: - from cudf._lib.null_mask import MaskState, create_null_mask - - out_mask = create_null_mask( - len(input_col), state=MaskState.ALL_NULL - ) - result = input_col.set_mask(out_mask) - out_df[column_name] = self[column_name].__class__(result) - - return self._mimic_inplace(out_df, inplace=inplace) - - else: - if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) - input_col = self._data[self.name] - cond = as_column(cond) - if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) - if cond.all(): - result = input_col - else: - ( - input_col, - other, - ) = where_internals._normalize_columns_and_scalars_type( - self, other, inplace - ) - - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if is_scalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. - other = None - other = cudf.Scalar(other, dtype=input_col.codes.dtype) - elif hasattr(other, "codes"): - other = other.codes - - input_col = input_col.codes - - result = libcudf.copying.copy_if_else(input_col, other, cond) - - if is_categorical_dtype(self.dtype): - result = build_categorical_column( - categories=self._data[self.name].categories, - codes=as_column(result.base_data, dtype=result.dtype), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[self.name].ordered, - ) - - if isinstance(self, cudf.Index): - from cudf.core.index import as_index - - result = as_index(result, name=self.name) - else: - result = self._copy_construct(data=result) - - return self._mimic_inplace(result, inplace=inplace) + return where_internals.where( + frame=self, cond=cond, other=other, inplace=inplace + ) def mask(self, cond, other=None, inplace=False): """ diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/internals/where.py index e51be89507e..3fc2f87f299 100644 --- a/python/cudf/cudf/core/internals/where.py +++ b/python/cudf/cudf/core/internals/where.py @@ -179,3 +179,197 @@ def _normalize_columns_and_scalars_type(frame, other, inplace=False): f"Inappropriate input {type(frame)} " f"and other {type(other)} combination" ) + + +def where(frame, cond, other=None, inplace=False): + """ + Replace values where the condition is False. + + Parameters + ---------- + cond : bool Series/DataFrame, array-like + Where cond is True, keep the original value. + Where False, replace with corresponding value from other. + Callables are not supported. + other: scalar, list of scalars, Series/DataFrame + Entries where cond is False are replaced with + corresponding value from other. Callables are not + supported. Default is None. + + DataFrame expects only Scalar or array like with scalars or + dataframe with same dimension as frame. + + Series expects only scalar or series like with same length + inplace : bool, default False + Whether to perform the operation in place on the data. + + Returns + ------- + Same type as caller + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) + >>> df.where(df % 2 == 0, [-1, -1]) + A B + 0 -1 -1 + 1 4 -1 + 2 -1 8 + + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.where(ser > 2, 10) + 0 4 + 1 3 + 2 10 + 3 10 + 4 10 + dtype: int64 + >>> ser.where(ser > 2) + 0 4 + 1 3 + 2 + 3 + 4 + dtype: int64 + """ + + if isinstance(frame, cudf.DataFrame): + if hasattr(cond, "__cuda_array_interface__"): + cond = cudf.DataFrame( + cond, columns=frame._data.names, index=frame.index + ) + elif not isinstance(cond, cudf.DataFrame): + cond = frame.from_pandas(pd.DataFrame(cond)) + + common_cols = set(frame._data.names).intersection( + set(cond._data.names) + ) + if len(common_cols) > 0: + # If `frame` and `cond` are having unequal index, + # then re-index `cond`. + if not frame.index.equals(cond.index): + cond = cond.reindex(frame.index) + else: + if cond.shape != frame.shape: + raise ValueError( + """Array conditional must be same shape as self""" + ) + # Setting `frame` column names to `cond` + # as `cond` has no column names. + cond.columns = frame.columns + + (source_df, others,) = _normalize_columns_and_scalars_type( + frame, other + ) + if isinstance(other, cudf.core.frame.Frame): + others = others._data.columns + + out_df = cudf.DataFrame(index=frame.index) + if len(frame._columns) != len(others): + raise ValueError( + """Replacement list length or number of dataframe columns + should be equal to Number of columns of dataframe""" + ) + for i, column_name in enumerate(frame._data.names): + input_col = source_df._data[column_name] + other_column = others[i] + if column_name in cond._data: + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other_column): + try: + other_column = input_col._encode(other_column) + except ValueError: + # When other is not present in categories, + # fill with Null. + other_column = None + other_column = cudf.Scalar( + other_column, dtype=input_col.codes.dtype + ) + elif hasattr(other_column, "codes"): + other_column = other_column.codes + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else( + input_col, other_column, cond._data[column_name] + ) + + if isinstance( + frame._data[column_name], + cudf.core.column.CategoricalColumn, + ): + result = cudf.core.column.build_categorical_column( + categories=frame._data[column_name].categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=frame._data[column_name].ordered, + ) + else: + from cudf._lib.null_mask import MaskState, create_null_mask + + out_mask = create_null_mask( + len(input_col), state=MaskState.ALL_NULL + ) + result = input_col.set_mask(out_mask) + out_df[column_name] = frame[column_name].__class__(result) + + return frame._mimic_inplace(out_df, inplace=inplace) + + else: + if isinstance(other, cudf.DataFrame): + raise NotImplementedError( + "cannot align with a higher dimensional Frame" + ) + input_col = frame._data[frame.name] + cond = cudf.core.column.as_column(cond) + if len(cond) != len(frame): + raise ValueError( + """Array conditional must be same shape as self""" + ) + if cond.all(): + result = input_col + else: + (input_col, other,) = _normalize_columns_and_scalars_type( + frame, other, inplace + ) + + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other): + try: + other = input_col._encode(other) + except ValueError: + # When other is not present in categories, + # fill with Null. + other = None + other = cudf.Scalar(other, dtype=input_col.codes.dtype) + elif hasattr(other, "codes"): + other = other.codes + + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else(input_col, other, cond) + + if cudf.utils.dtypes.is_categorical_dtype(frame.dtype): + result = cudf.core.column.build_categorical_column( + categories=frame._data[frame.name].categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=frame._data[frame.name].ordered, + ) + + if isinstance(frame, cudf.Index): + from cudf.core.index import as_index + + result = as_index(result, name=frame.name) + else: + result = frame._copy_construct(data=result) + + return frame._mimic_inplace(result, inplace=inplace) From 8dc1b9e3a008cbf217efd6dce6dec21aeae05154 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 25 Mar 2021 11:30:42 -0500 Subject: [PATCH 05/23] Apply suggestions from code review Co-authored-by: Keith Kraus --- python/cudf/cudf/core/internals/where.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/internals/where.py index 3fc2f87f299..4e1d404c874 100644 --- a/python/cudf/cudf/core/internals/where.py +++ b/python/cudf/cudf/core/internals/where.py @@ -8,7 +8,7 @@ def _normalize_scalars(col, other): """ - Try to normalizes scalar values as per col dtype + Try to normalize scalar values as per col dtype """ if ( other is not None From 7b4079edff3fc594e66e2ca98dd05730d3c97ab8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Mar 2021 10:28:49 -0700 Subject: [PATCH 06/23] use _column_names instead of columns --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4dd266e2fc9..608a04b84cf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1659,7 +1659,7 @@ def update( other = other.reindex(self.index, axis=0) source_df = self.copy(deep=False) - for col in source_df.columns: + for col in source_df._column_names: this = source_df[col] that = other[col] From aae8f0b453617d6be2187ce496baf216e1f5cc92 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Mar 2021 10:34:56 -0700 Subject: [PATCH 07/23] copyright --- python/cudf/cudf/core/internals/where.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/internals/where.py index 4e1d404c874..a005d7b7625 100644 --- a/python/cudf/cudf/core/internals/where.py +++ b/python/cudf/cudf/core/internals/where.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import warnings import numpy as np From 467957c119bcc0e2b1fc2f569c2eefdedf937d7a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Mar 2021 11:48:45 -0700 Subject: [PATCH 08/23] address reviews --- python/cudf/cudf/core/internals/where.py | 29 ++++++++++-------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/internals/where.py index a005d7b7625..9ca8fb0af84 100644 --- a/python/cudf/cudf/core/internals/where.py +++ b/python/cudf/cudf/core/internals/where.py @@ -12,10 +12,9 @@ def _normalize_scalars(col, other): """ Try to normalize scalar values as per col dtype """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (col.dtype.type(other) != other): + if (isinstance(other, float) and not np.isnan(other)) and ( + col.dtype.type(other) != other + ): raise TypeError( f"Cannot safely cast non-equivalent " f"{type(other).__name__} to {col.dtype.name}" @@ -23,10 +22,7 @@ def _normalize_scalars(col, other): return ( col.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) + if (isinstance(other, float) and not np.isnan(other)) else other ) @@ -288,7 +284,9 @@ def where(frame, cond, other=None, inplace=False): other_column = cudf.Scalar( other_column, dtype=input_col.codes.dtype ) - elif hasattr(other_column, "codes"): + elif isinstance( + other_column, cudf.core.column.CategoricalColumn + ): other_column = other_column.codes input_col = input_col.codes @@ -311,10 +309,9 @@ def where(frame, cond, other=None, inplace=False): ordered=frame._data[column_name].ordered, ) else: - from cudf._lib.null_mask import MaskState, create_null_mask - - out_mask = create_null_mask( - len(input_col), state=MaskState.ALL_NULL + out_mask = cudf._lib.null_mask.create_null_mask( + len(input_col), + state=cudf._lib.null_mask.MaskState.ALL_NULL, ) result = input_col.set_mask(out_mask) out_df[column_name] = frame[column_name].__class__(result) @@ -348,7 +345,7 @@ def where(frame, cond, other=None, inplace=False): # fill with Null. other = None other = cudf.Scalar(other, dtype=input_col.codes.dtype) - elif hasattr(other, "codes"): + elif isinstance(other, cudf.core.column.CategoricalColumn): other = other.codes input_col = input_col.codes @@ -368,9 +365,7 @@ def where(frame, cond, other=None, inplace=False): ) if isinstance(frame, cudf.Index): - from cudf.core.index import as_index - - result = as_index(result, name=frame.name) + result = cudf.Index(result, name=frame.name) else: result = frame._copy_construct(data=result) From 679ffb4f0d4de5cbd4ca735044ab79bce2fbaccf Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Mar 2021 11:56:03 -0700 Subject: [PATCH 09/23] address reviews --- python/cudf/cudf/core/__init__.py | 2 +- python/cudf/cudf/core/{internals => _internals}/__init__.py | 0 python/cudf/cudf/core/{internals => _internals}/where.py | 0 python/cudf/cudf/core/frame.py | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename python/cudf/cudf/core/{internals => _internals}/__init__.py (100%) rename python/cudf/cudf/core/{internals => _internals}/where.py (100%) diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index a71f15dd95b..0ca9e4f6124 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from cudf.core import buffer, column, column_accessor, common, internals +from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py similarity index 100% rename from python/cudf/cudf/core/internals/__init__.py rename to python/cudf/cudf/core/_internals/__init__.py diff --git a/python/cudf/cudf/core/internals/where.py b/python/cudf/cudf/core/_internals/where.py similarity index 100% rename from python/cudf/cudf/core/internals/where.py rename to python/cudf/cudf/core/_internals/where.py diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 602960bba97..5e541eed17e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -19,8 +19,8 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries +from cudf.core._internals import where as where_internals from cudf.core.column import as_column, build_categorical_column, column_empty -from cudf.core.internals import where as where_internals from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, From 85d79283e17b48dd05de467706e486fb80402501 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 15:43:24 -0700 Subject: [PATCH 10/23] return a cudf scalar --- python/cudf/cudf/core/_internals/where.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 9ca8fb0af84..bfe6200e474 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -20,11 +20,7 @@ def _normalize_scalars(col, other): f"{type(other).__name__} to {col.dtype.name}" ) - return ( - col.dtype.type(other) - if (isinstance(other, float) and not np.isnan(other)) - else other - ) + return cudf.Scalar(other, dtype=col.dtype if other is None else None) def _check_and_cast_columns(source_col, other_col, inplace): @@ -62,10 +58,7 @@ def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): return source_col, other_scalar - device_scalar = cudf.Scalar( - _normalize_scalars(source_col, other_scalar), - dtype=source_col.dtype if other_scalar is None else None, - ) + device_scalar = _normalize_scalars(source_col, other_scalar) if other_scalar is None: return source_col, device_scalar @@ -121,8 +114,8 @@ def _normalize_columns_and_scalars_type(frame, other, inplace=False): A dataframe/series/list/scalar form of normalized other """ if isinstance(frame, cudf.DataFrame) and isinstance(other, cudf.DataFrame): - source_df = frame.copy() - other_df = other.copy() + source_df = frame.copy(deep=False) + other_df = other.copy(deep=False) for self_col in source_df._data.names: source_col, other_col = _check_and_cast_columns( source_col=source_df._data[self_col], @@ -157,7 +150,7 @@ def _normalize_columns_and_scalars_type(frame, other, inplace=False): if cudf.utils.dtypes.is_scalar(other): other = [other for i in range(len(frame._data.names))] - source_df = frame.copy() + source_df = frame.copy(deep=False) others = [] for col_name, other_sclr in zip(frame._data.names, other): From 2764a855dfd69baaa96afaa721632a11039193eb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 17:22:18 -0700 Subject: [PATCH 11/23] add cudf can_cast utility --- python/cudf/cudf/core/_internals/where.py | 10 +++-- python/cudf/cudf/utils/dtypes.py | 48 +++++++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index bfe6200e474..0132c168176 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -71,16 +71,18 @@ def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): "to same dtypes." ) if inplace: - if not np.can_cast(device_scalar, source_col.dtype): + if not cudf.utils.dtypes.can_cast( + device_scalar.dtype, source_col.dtype + ): warnings.warn( f"Type-casting from {device_scalar.dtype} " f"to {source_col.dtype}, there could be potential data loss" ) return source_col, device_scalar.astype(source_col.dtype) else: - if pd.api.types.is_numeric_dtype(source_col.dtype) and np.can_cast( - other_scalar, source_col.dtype - ): + if pd.api.types.is_numeric_dtype( + source_col.dtype + ) and cudf.utils.dtypes.can_cast(other_scalar, source_col.dtype): common_dtype = source_col.dtype else: common_dtype = cudf.utils.dtypes.find_common_type( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a1eb2212ac2..afebfb6a00d 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -637,6 +637,11 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) + if any(is_decimal_dtype(dtype) for dtype in dtypes): + raise NotImplementedError( + "DecimalDtype is not yet supported in find_common_type" + ) + # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately dt_dtypes = set(filter(lambda t: is_datetime_dtype(t), dtypes)) @@ -659,6 +664,49 @@ def find_common_type(dtypes): return common_dtype +def can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. + """ + if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype): + if isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) + + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 9362195c917bc2dff9fecd2fef131ed750ad86b2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 17:25:40 -0700 Subject: [PATCH 12/23] add type annotations --- python/cudf/cudf/core/_internals/where.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0132c168176..dbda318aca1 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -6,9 +6,12 @@ import pandas as pd import cudf +from cudf._typing import ScalarLike -def _normalize_scalars(col, other): +def _normalize_scalars( + col: cudf.core.column.ColumnBase, other: ScalarLike +) -> cudf.Scalar: """ Try to normalize scalar values as per col dtype """ From f7ca26896dd67f270ca8c9e2d6ffb1a3a09d0169 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 18:37:07 -0700 Subject: [PATCH 13/23] add typing for _check_and_cast_columns --- python/cudf/cudf/core/_internals/where.py | 17 ++++++++++++++--- python/cudf/cudf/core/column/column.py | 3 +++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index dbda318aca1..1247a371117 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,6 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. import warnings +from typing import Any, Tuple, Union import numpy as np import pandas as pd @@ -26,7 +27,11 @@ def _normalize_scalars( return cudf.Scalar(other, dtype=col.dtype if other is None else None) -def _check_and_cast_columns(source_col, other_col, inplace): +def _check_and_cast_columns( + source_col: cudf.core.column.ColumnBase, + other_col: cudf.core.column.ColumnBase, + inplace: bool, +) -> Tuple[cudf.core.column.ColumnBase, cudf.core.column.ColumnBase]: """ Returns type-casted columns of `source_col` & `other_col` based on `inplace` parameter. @@ -53,7 +58,11 @@ def _check_and_cast_columns(source_col, other_col, inplace): return source_col.astype(common_dtype), other_col.astype(common_dtype) -def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): +def _check_and_cast_columns_with_scalar( + source_col: cudf.core.column.ColumnBase, + other_scalar: ScalarLike, + inplace: bool, +) -> Tuple[cudf.core.column.ColumnBase, ScalarLike]: """ Returns type-casted column `source_col` & scalar `other_scalar` based on `inplace` parameter. @@ -96,7 +105,9 @@ def _check_and_cast_columns_with_scalar(source_col, other_scalar, inplace): return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) -def _normalize_columns_and_scalars_type(frame, other, inplace=False): +def _normalize_columns_and_scalars_type( + frame: cudf.core.frame.Frame, other: Any, inplace: bool = False +) -> Tuple[Union[cudf.core.frame.Frame, cudf.core.column.ColumnBase], Any]: """ Try to normalize the other's dtypes as per frame. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dd06d97d105..7a5253a6b5d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1309,6 +1309,9 @@ def corr(self, other: ColumnBase): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) + def can_cast_safely(self, to_dtype: Dtype) -> bool: + raise NotImplementedError() + def nans_to_nulls(self: T) -> T: if self.dtype.kind == "f": newmask = libcudf.transform.nans_to_nulls(self) From 21c2ac63c6a70596b5d1aacc2d78543eb91cb711 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 19:04:57 -0700 Subject: [PATCH 14/23] add typing in where --- python/cudf/cudf/core/_internals/where.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 1247a371117..46b5874b528 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -107,7 +107,10 @@ def _check_and_cast_columns_with_scalar( def _normalize_columns_and_scalars_type( frame: cudf.core.frame.Frame, other: Any, inplace: bool = False -) -> Tuple[Union[cudf.core.frame.Frame, cudf.core.column.ColumnBase], Any]: +) -> Tuple[ + Union[cudf.core.frame.Frame, cudf.core.column.ColumnBase], + Union[cudf.core.frame.Frame, ScalarLike], +]: """ Try to normalize the other's dtypes as per frame. @@ -188,7 +191,9 @@ def _normalize_columns_and_scalars_type( ) -def where(frame, cond, other=None, inplace=False): +def where( + frame, cond, other=None, inplace=False, +): """ Replace values where the condition is False. From d770d61d5cf2fc6387f9262302af0728d0857f49 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 20:54:40 -0700 Subject: [PATCH 15/23] typing --- python/cudf/cudf/core/_internals/where.py | 92 +++++++++++++---------- python/cudf/cudf/core/frame.py | 20 ++--- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 46b5874b528..e47572b72db 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,18 +1,22 @@ # Copyright (c) 2021, NVIDIA CORPORATION. import warnings -from typing import Any, Tuple, Union +from typing import Any, Optional, Tuple, Union, cast import numpy as np import pandas as pd import cudf -from cudf._typing import ScalarLike +from cudf._typing import ColumnLike, ScalarLike +from cudf.core.column import ColumnBase +from cudf.core.dataframe import DataFrame +from cudf.core.frame import Frame +from cudf.core.index import Index +from cudf.core.scalar import Scalar +from cudf.core.series import Series -def _normalize_scalars( - col: cudf.core.column.ColumnBase, other: ScalarLike -) -> cudf.Scalar: +def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> Scalar: """ Try to normalize scalar values as per col dtype """ @@ -24,14 +28,12 @@ def _normalize_scalars( f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar(other, dtype=col.dtype if other is None else None) + return Scalar(other, dtype=col.dtype if other is None else None) def _check_and_cast_columns( - source_col: cudf.core.column.ColumnBase, - other_col: cudf.core.column.ColumnBase, - inplace: bool, -) -> Tuple[cudf.core.column.ColumnBase, cudf.core.column.ColumnBase]: + source_col: ColumnBase, other_col: ColumnBase, inplace: bool, +) -> Tuple[ColumnBase, ColumnBase]: """ Returns type-casted columns of `source_col` & `other_col` based on `inplace` parameter. @@ -59,10 +61,8 @@ def _check_and_cast_columns( def _check_and_cast_columns_with_scalar( - source_col: cudf.core.column.ColumnBase, - other_scalar: ScalarLike, - inplace: bool, -) -> Tuple[cudf.core.column.ColumnBase, ScalarLike]: + source_col: ColumnBase, other_scalar: ScalarLike, inplace: bool, +) -> Tuple[ColumnBase, ScalarLike]: """ Returns type-casted column `source_col` & scalar `other_scalar` based on `inplace` parameter. @@ -102,14 +102,13 @@ def _check_and_cast_columns_with_scalar( ) source_col = source_col.astype(common_dtype) - return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) + return source_col, Scalar(other_scalar, dtype=common_dtype) def _normalize_columns_and_scalars_type( - frame: cudf.core.frame.Frame, other: Any, inplace: bool = False + frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, ) -> Tuple[ - Union[cudf.core.frame.Frame, cudf.core.column.ColumnBase], - Union[cudf.core.frame.Frame, ScalarLike], + Union[Series, Index, DataFrame, ColumnLike], Any, ]: """ Try to normalize the other's dtypes as per frame. @@ -132,7 +131,7 @@ def _normalize_columns_and_scalars_type( -------- A dataframe/series/list/scalar form of normalized other """ - if isinstance(frame, cudf.DataFrame) and isinstance(other, cudf.DataFrame): + if isinstance(frame, DataFrame) and isinstance(other, DataFrame): source_df = frame.copy(deep=False) other_df = other.copy(deep=False) for self_col in source_df._data.names: @@ -146,7 +145,7 @@ def _normalize_columns_and_scalars_type( return source_df, other_df elif isinstance( - frame, (cudf.Series, cudf.Index) + frame, (Series, Index) ) and not cudf.utils.dtypes.is_scalar(other): other = cudf.core.column.as_column(other) input_col = frame._data[frame.name] @@ -155,9 +154,9 @@ def _normalize_columns_and_scalars_type( ) else: # Handles scalar or list/array like scalars - if isinstance( - frame, (cudf.Series, cudf.Index) - ) and cudf.utils.dtypes.is_scalar(other): + if isinstance(frame, (Series, Index)) and cudf.utils.dtypes.is_scalar( + other + ): input_col = frame._data[frame.name] return _check_and_cast_columns_with_scalar( source_col=frame._data[frame.name], @@ -165,7 +164,7 @@ def _normalize_columns_and_scalars_type( inplace=inplace, ) - elif isinstance(frame, cudf.DataFrame): + elif isinstance(frame, DataFrame): if cudf.utils.dtypes.is_scalar(other): other = [other for i in range(len(frame._data.names))] @@ -192,8 +191,11 @@ def _normalize_columns_and_scalars_type( def where( - frame, cond, other=None, inplace=False, -): + frame: Union[Series, Index, DataFrame], + cond: Any, + other: Any = None, + inplace: bool = False, +) -> Optional[Union[Frame]]: """ Replace values where the condition is False. @@ -222,14 +224,14 @@ def where( Examples -------- >>> import cudf - >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) + >>> df = DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) >>> df.where(df % 2 == 0, [-1, -1]) A B 0 -1 -1 1 4 -1 2 -1 8 - >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser = Series([4, 3, 2, 1, 0]) >>> ser.where(ser > 2, 10) 0 4 1 3 @@ -246,12 +248,12 @@ def where( dtype: int64 """ - if isinstance(frame, cudf.DataFrame): + if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - cond = cudf.DataFrame( + cond = DataFrame( cond, columns=frame._data.names, index=frame.index ) - elif not isinstance(cond, cudf.DataFrame): + elif not isinstance(cond, DataFrame): cond = frame.from_pandas(pd.DataFrame(cond)) common_cols = set(frame._data.names).intersection( @@ -274,10 +276,10 @@ def where( (source_df, others,) = _normalize_columns_and_scalars_type( frame, other ) - if isinstance(other, cudf.core.frame.Frame): + if isinstance(other, Frame): others = others._data.columns - out_df = cudf.DataFrame(index=frame.index) + out_df = DataFrame(index=frame.index) if len(frame._columns) != len(others): raise ValueError( """Replacement list length or number of dataframe columns @@ -295,7 +297,7 @@ def where( # When other is not present in categories, # fill with Null. other_column = None - other_column = cudf.Scalar( + other_column = Scalar( other_column, dtype=input_col.codes.dtype ) elif isinstance( @@ -333,7 +335,7 @@ def where( return frame._mimic_inplace(out_df, inplace=inplace) else: - if isinstance(other, cudf.DataFrame): + if isinstance(other, DataFrame): raise NotImplementedError( "cannot align with a higher dimensional Frame" ) @@ -358,7 +360,7 @@ def where( # When other is not present in categories, # fill with Null. other = None - other = cudf.Scalar(other, dtype=input_col.codes.dtype) + other = Scalar(other, dtype=input_col.codes.dtype) elif isinstance(other, cudf.core.column.CategoricalColumn): other = other.codes @@ -366,20 +368,28 @@ def where( result = cudf._lib.copying.copy_if_else(input_col, other, cond) - if cudf.utils.dtypes.is_categorical_dtype(frame.dtype): + if isinstance( + frame._data[frame.name], cudf.core.column.CategoricalColumn + ): result = cudf.core.column.build_categorical_column( - categories=frame._data[frame.name].categories, + categories=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).categories, codes=cudf.core.column.as_column( result.base_data, dtype=result.dtype ), mask=result.base_mask, size=result.size, offset=result.offset, - ordered=frame._data[frame.name].ordered, + ordered=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).ordered, ) - if isinstance(frame, cudf.Index): - result = cudf.Index(result, name=frame.name) + if isinstance(frame, Index): + result = Index(result, name=frame.name) else: result = frame._copy_construct(data=result) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5e541eed17e..a03502bebde 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -14,7 +14,6 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal -from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -53,19 +52,9 @@ class Frame(libcudf.table.Table): def _from_table(cls, table: Frame): return cls(table._data, index=table._index) - @overload - def _mimic_inplace(self, result: Frame) -> Frame: - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[True]): - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[False]) -> Frame: - ... - - def _mimic_inplace(self, result, inplace=False): + def _mimic_inplace( + self: T, result: Frame, inplace: bool = False + ) -> Optional[Frame]: if inplace: for col in self._data: if col in result._data: @@ -74,6 +63,7 @@ def _mimic_inplace(self, result, inplace=False): ) self._data = result._data self._index = result._index + return None else: return result From db25a599bb5866b292f60a18f16312108ffc472b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Mar 2021 21:30:51 -0700 Subject: [PATCH 16/23] refactor --- python/cudf/cudf/core/_internals/__init__.py | 1 + python/cudf/cudf/core/_internals/where.py | 11 +++++------ python/cudf/cudf/core/frame.py | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py index e69de29bb2d..ed3e30fd65d 100644 --- a/python/cudf/cudf/core/_internals/__init__.py +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -0,0 +1 @@ +from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index e47572b72db..2310e613901 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -12,11 +12,10 @@ from cudf.core.dataframe import DataFrame from cudf.core.frame import Frame from cudf.core.index import Index -from cudf.core.scalar import Scalar from cudf.core.series import Series -def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> Scalar: +def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: """ Try to normalize scalar values as per col dtype """ @@ -28,7 +27,7 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> Scalar: f"{type(other).__name__} to {col.dtype.name}" ) - return Scalar(other, dtype=col.dtype if other is None else None) + return cudf.Scalar(other, dtype=col.dtype if other is None else None) def _check_and_cast_columns( @@ -102,7 +101,7 @@ def _check_and_cast_columns_with_scalar( ) source_col = source_col.astype(common_dtype) - return source_col, Scalar(other_scalar, dtype=common_dtype) + return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) def _normalize_columns_and_scalars_type( @@ -297,7 +296,7 @@ def where( # When other is not present in categories, # fill with Null. other_column = None - other_column = Scalar( + other_column = cudf.Scalar( other_column, dtype=input_col.codes.dtype ) elif isinstance( @@ -360,7 +359,7 @@ def where( # When other is not present in categories, # fill with Null. other = None - other = Scalar(other, dtype=input_col.codes.dtype) + other = cudf.Scalar(other, dtype=input_col.codes.dtype) elif isinstance(other, cudf.core.column.CategoricalColumn): other = other.codes diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a03502bebde..eeac98c3332 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -18,7 +18,6 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.core._internals import where as where_internals from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils.dtypes import ( is_categorical_dtype, @@ -836,7 +835,7 @@ def where(self, cond, other=None, inplace=False): dtype: int64 """ - return where_internals.where( + return cudf.core._internals.where( frame=self, cond=cond, other=other, inplace=inplace ) From e5b140a918670670d9a24e69d6130723c0bcc76f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Mar 2021 09:15:52 -0700 Subject: [PATCH 17/23] remove duplicated logic and squash into single method --- python/cudf/cudf/core/_internals/where.py | 103 +++++++++------------- python/cudf/cudf/core/column/column.py | 3 - 2 files changed, 44 insertions(+), 62 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 2310e613901..32544618e40 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -30,78 +30,63 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: return cudf.Scalar(other, dtype=col.dtype if other is None else None) -def _check_and_cast_columns( - source_col: ColumnBase, other_col: ColumnBase, inplace: bool, -) -> Tuple[ColumnBase, ColumnBase]: - """ - Returns type-casted columns of `source_col` & `other_col` - based on `inplace` parameter. - """ - if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): - return source_col, other_col - elif cudf.utils.dtypes.is_mixed_with_object_dtype(source_col, other_col): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "the column of dataframe/series and other " - "to same dtypes." - ) - if inplace: - if not source_col.can_cast_safely(other_col.dtype): - warnings.warn( - f"Type-casting from {other_col.dtype} " - f"to {source_col.dtype}, there could be potential data loss" - ) - return source_col, other_col.astype(source_col.dtype) - else: - common_dtype = cudf.utils.dtypes.find_common_type( - [source_col.dtype, other_col.dtype] - ) - return source_col.astype(common_dtype), other_col.astype(common_dtype) - - -def _check_and_cast_columns_with_scalar( - source_col: ColumnBase, other_scalar: ScalarLike, inplace: bool, -) -> Tuple[ColumnBase, ScalarLike]: +def _check_and_cast_columns_with_other( + source_col: ColumnBase, + other: Union[ScalarLike, ColumnBase], + inplace: bool, +) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: """ Returns type-casted column `source_col` & scalar `other_scalar` based on `inplace` parameter. """ if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): - return source_col, other_scalar + return source_col, other - device_scalar = _normalize_scalars(source_col, other_scalar) + if cudf.utils.dtypes.is_scalar(other): + device_obj = _normalize_scalars(source_col, other) + else: + device_obj = other - if other_scalar is None: - return source_col, device_scalar - elif cudf.utils.dtypes.is_mixed_with_object_dtype( - device_scalar, source_col - ): + if other is None: + return source_col, device_obj + elif cudf.utils.dtypes.is_mixed_with_object_dtype(device_obj, source_col): raise TypeError( "cudf does not support mixed types, please type-cast " "the column of dataframe/series and other " "to same dtypes." ) if inplace: - if not cudf.utils.dtypes.can_cast( - device_scalar.dtype, source_col.dtype - ): + if not cudf.utils.dtypes.can_cast(device_obj.dtype, source_col.dtype): warnings.warn( - f"Type-casting from {device_scalar.dtype} " + f"Type-casting from {device_obj.dtype} " f"to {source_col.dtype}, there could be potential data loss" ) - return source_col, device_scalar.astype(source_col.dtype) + return source_col, device_obj.astype(source_col.dtype) else: - if pd.api.types.is_numeric_dtype( - source_col.dtype - ) and cudf.utils.dtypes.can_cast(other_scalar, source_col.dtype): + if ( + cudf.utils.dtypes.is_scalar(other) + and pd.api.types.is_numeric_dtype(source_col.dtype) + and cudf.utils.dtypes.can_cast(other, source_col.dtype) + ): common_dtype = source_col.dtype + return ( + source_col.astype(common_dtype), + cudf.Scalar(other, dtype=common_dtype), + ) else: common_dtype = cudf.utils.dtypes.find_common_type( - [source_col.dtype, np.min_scalar_type(other_scalar)] + [ + source_col.dtype, + np.min_scalar_type(other) + if cudf.utils.dtypes.is_scalar(other) + else other.dtype, + ] ) - - source_col = source_col.astype(common_dtype) - return source_col, cudf.Scalar(other_scalar, dtype=common_dtype) + if cudf.utils.dtypes.is_scalar(device_obj): + device_obj = cudf.Scalar(other, dtype=common_dtype) + else: + device_obj = device_obj.astype(common_dtype) + return source_col.astype(common_dtype), device_obj def _normalize_columns_and_scalars_type( @@ -134,9 +119,9 @@ def _normalize_columns_and_scalars_type( source_df = frame.copy(deep=False) other_df = other.copy(deep=False) for self_col in source_df._data.names: - source_col, other_col = _check_and_cast_columns( + source_col, other_col = _check_and_cast_columns_with_other( source_col=source_df._data[self_col], - other_col=other_df._data[self_col], + other=other_df._data[self_col], inplace=inplace, ) source_df._data[self_col] = source_col @@ -148,8 +133,8 @@ def _normalize_columns_and_scalars_type( ) and not cudf.utils.dtypes.is_scalar(other): other = cudf.core.column.as_column(other) input_col = frame._data[frame.name] - return _check_and_cast_columns( - source_col=input_col, other_col=other, inplace=inplace + return _check_and_cast_columns_with_other( + source_col=input_col, other=other, inplace=inplace ) else: # Handles scalar or list/array like scalars @@ -157,9 +142,9 @@ def _normalize_columns_and_scalars_type( other ): input_col = frame._data[frame.name] - return _check_and_cast_columns_with_scalar( + return _check_and_cast_columns_with_other( source_col=frame._data[frame.name], - other_scalar=other, + other=other, inplace=inplace, ) @@ -174,9 +159,9 @@ def _normalize_columns_and_scalars_type( ( source_col, other_scalar, - ) = _check_and_cast_columns_with_scalar( + ) = _check_and_cast_columns_with_other( source_col=source_df._data[col_name], - other_scalar=other_sclr, + other=other_sclr, inplace=inplace, ) source_df._data[col_name] = source_col diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 10529c70d0f..e59b395ec0f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1319,9 +1319,6 @@ def corr(self, other: ColumnBase): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - raise NotImplementedError() - def nans_to_nulls(self: T) -> T: if self.dtype.kind == "f": newmask = libcudf.transform.nans_to_nulls(self) From 95d409b448cdd1f827f51fb5cf1de3889cc91e57 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Mar 2021 09:29:39 -0700 Subject: [PATCH 18/23] use _column_names --- python/cudf/cudf/core/_internals/where.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 32544618e40..94ad760ceab 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -118,7 +118,7 @@ def _normalize_columns_and_scalars_type( if isinstance(frame, DataFrame) and isinstance(other, DataFrame): source_df = frame.copy(deep=False) other_df = other.copy(deep=False) - for self_col in source_df._data.names: + for self_col in source_df._column_names: source_col, other_col = _check_and_cast_columns_with_other( source_col=source_df._data[self_col], other=other_df._data[self_col], @@ -150,11 +150,11 @@ def _normalize_columns_and_scalars_type( elif isinstance(frame, DataFrame): if cudf.utils.dtypes.is_scalar(other): - other = [other for i in range(len(frame._data.names))] + other = [other for i in range(len(frame._column_names))] source_df = frame.copy(deep=False) others = [] - for col_name, other_sclr in zip(frame._data.names, other): + for col_name, other_sclr in zip(frame._column_names, other): ( source_col, @@ -235,13 +235,13 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): cond = DataFrame( - cond, columns=frame._data.names, index=frame.index + cond, columns=frame._column_names, index=frame.index ) elif not isinstance(cond, DataFrame): cond = frame.from_pandas(pd.DataFrame(cond)) - common_cols = set(frame._data.names).intersection( - set(cond._data.names) + common_cols = set(frame._column_names).intersection( + set(cond._column_names) ) if len(common_cols) > 0: # If `frame` and `cond` are having unequal index, @@ -269,7 +269,7 @@ def where( """Replacement list length or number of dataframe columns should be equal to Number of columns of dataframe""" ) - for i, column_name in enumerate(frame._data.names): + for i, column_name in enumerate(frame._column_names): input_col = source_df._data[column_name] other_column = others[i] if column_name in cond._data: From 349fff5c027b1c5c4c21bb3dabcdd30db683eb80 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Mar 2021 09:46:40 -0700 Subject: [PATCH 19/23] handle different shape --- python/cudf/cudf/core/_internals/where.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 94ad760ceab..584232f932c 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -210,7 +210,7 @@ def where( >>> import cudf >>> df = DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) >>> df.where(df % 2 == 0, [-1, -1]) - A B + A B 0 -1 -1 1 4 -1 2 -1 8 @@ -237,6 +237,11 @@ def where( cond = DataFrame( cond, columns=frame._column_names, index=frame.index ) + elif ( + hasattr(cond, "__array_interface__") + and cond.__array_interface__["shape"] != frame.shape + ): + raise ValueError("conditional must be same shape as self") elif not isinstance(cond, DataFrame): cond = frame.from_pandas(pd.DataFrame(cond)) From 17a581e0354ed40456a7b8f3a0029f3f55e2467f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Mar 2021 10:20:18 -0700 Subject: [PATCH 20/23] address reviews --- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/core/__init__.py | 2 +- python/cudf/cudf/core/_internals/__init__.py | 2 + python/cudf/cudf/core/_internals/where.py | 84 ++++++++++---------- python/cudf/cudf/utils/dtypes.py | 12 ++- 5 files changed, 53 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 6150c6110b9..8f93866612e 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import pandas as pd diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 0ca9e4f6124..59173cc0247 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py index ed3e30fd65d..53d186def85 100644 --- a/python/cudf/cudf/core/_internals/__init__.py +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -1 +1,3 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 584232f932c..566691d08be 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -56,7 +56,7 @@ def _check_and_cast_columns_with_other( "to same dtypes." ) if inplace: - if not cudf.utils.dtypes.can_cast(device_obj.dtype, source_col.dtype): + if not cudf.utils.dtypes._can_cast(device_obj.dtype, source_col.dtype): warnings.warn( f"Type-casting from {device_obj.dtype} " f"to {source_col.dtype}, there could be potential data loss" @@ -66,7 +66,7 @@ def _check_and_cast_columns_with_other( if ( cudf.utils.dtypes.is_scalar(other) and pd.api.types.is_numeric_dtype(source_col.dtype) - and cudf.utils.dtypes.can_cast(other, source_col.dtype) + and cudf.utils.dtypes._can_cast(other, source_col.dtype) ): common_dtype = source_col.dtype return ( @@ -334,48 +334,46 @@ def where( raise ValueError( """Array conditional must be same shape as self""" ) - if cond.all(): - result = input_col - else: - (input_col, other,) = _normalize_columns_and_scalars_type( - frame, other, inplace - ) - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if cudf.utils.dtypes.is_scalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. - other = None - other = cudf.Scalar(other, dtype=input_col.codes.dtype) - elif isinstance(other, cudf.core.column.CategoricalColumn): - other = other.codes - - input_col = input_col.codes - - result = cudf._lib.copying.copy_if_else(input_col, other, cond) - - if isinstance( - frame._data[frame.name], cudf.core.column.CategoricalColumn - ): - result = cudf.core.column.build_categorical_column( - categories=cast( - cudf.core.column.CategoricalColumn, - frame._data[frame.name], - ).categories, - codes=cudf.core.column.as_column( - result.base_data, dtype=result.dtype - ), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=cast( - cudf.core.column.CategoricalColumn, - frame._data[frame.name], - ).ordered, - ) + (input_col, other,) = _normalize_columns_and_scalars_type( + frame, other, inplace + ) + + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other): + try: + other = input_col._encode(other) + except ValueError: + # When other is not present in categories, + # fill with Null. + other = None + other = cudf.Scalar(other, dtype=input_col.codes.dtype) + elif isinstance(other, cudf.core.column.CategoricalColumn): + other = other.codes + + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else(input_col, other, cond) + + if isinstance( + frame._data[frame.name], cudf.core.column.CategoricalColumn + ): + result = cudf.core.column.build_categorical_column( + categories=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).ordered, + ) if isinstance(frame, Index): result = Index(result, name=frame.name) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index afebfb6a00d..e501f202754 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -664,13 +664,15 @@ def find_common_type(dtypes): return common_dtype -def can_cast(from_dtype, to_dtype): +def _can_cast(from_dtype, to_dtype): """ Utility function to determine if we can cast from `from_dtype` to `to_dtype`. This function primarily calls `np.can_cast` but with some special handling around cudf specific dtypes. """ + # TODO : Add precision & scale checking for + # decimal types in future if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype): if isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): return True @@ -680,7 +682,7 @@ def can_cast(from_dtype, to_dtype): else: return False elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): + if isinstance(to_dtype, (np.dtype, type)): return np.can_cast(from_dtype, to_dtype) elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): if from_dtype.kind in {"i", "f", "u", "U", "O"}: @@ -692,14 +694,16 @@ def can_cast(from_dtype, to_dtype): else: return False elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return True + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) else: return False elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): return True - elif isinstance(to_dtype, np.dtype): + elif isinstance(to_dtype, (np.dtype, type)): return np.can_cast(from_dtype._categories.dtype, to_dtype) else: return False From 16f973838b8550f2383b54190144f4f99b3587e6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Mar 2021 12:37:41 -0700 Subject: [PATCH 21/23] use cudf utility for is_numerical_dtype --- python/cudf/cudf/core/_internals/where.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 566691d08be..1fdc907875e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -65,7 +65,7 @@ def _check_and_cast_columns_with_other( else: if ( cudf.utils.dtypes.is_scalar(other) - and pd.api.types.is_numeric_dtype(source_col.dtype) + and cudf.utils.dtypes.is_numerical_dtype(source_col.dtype) and cudf.utils.dtypes._can_cast(other, source_col.dtype) ): common_dtype = source_col.dtype From 4268d65f19e950c7bea700a6a38f4884b771f198 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Mar 2021 09:37:35 -0700 Subject: [PATCH 22/23] handle generic types --- python/cudf/cudf/utils/dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e501f202754..2b92b4d1f10 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -681,7 +681,8 @@ def _can_cast(from_dtype, to_dtype): return True else: return False - elif isinstance(from_dtype, np.dtype): + elif isinstance(from_dtype, (np.dtype, type)): + from_dtype = np.dtype(from_dtype) if isinstance(to_dtype, (np.dtype, type)): return np.can_cast(from_dtype, to_dtype) elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): From 05965dd4bb1884aa659f2d992f756b4453a602cf Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Mar 2021 09:40:06 -0700 Subject: [PATCH 23/23] refactor --- python/cudf/cudf/utils/dtypes.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2b92b4d1f10..be2b1bca2e0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -671,6 +671,11 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ + if isinstance(from_dtype, type): + from_dtype = np.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = np.dtype(to_dtype) + # TODO : Add precision & scale checking for # decimal types in future if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype): @@ -681,9 +686,8 @@ def _can_cast(from_dtype, to_dtype): return True else: return False - elif isinstance(from_dtype, (np.dtype, type)): - from_dtype = np.dtype(from_dtype) - if isinstance(to_dtype, (np.dtype, type)): + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): return np.can_cast(from_dtype, to_dtype) elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): if from_dtype.kind in {"i", "f", "u", "U", "O"}: @@ -704,7 +708,7 @@ def _can_cast(from_dtype, to_dtype): elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): return True - elif isinstance(to_dtype, (np.dtype, type)): + elif isinstance(to_dtype, np.dtype): return np.can_cast(from_dtype._categories.dtype, to_dtype) else: return False