From ee12d29b96db17e0c0caa7314df15879a485ece5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sat, 24 Apr 2021 10:01:53 -0700 Subject: [PATCH 01/35] Rename Series._binaryop to _apply_op for consistency with DataFrame. --- python/cudf/cudf/core/series.py | 92 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5ee40d576b6..2c4b76c3eb8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1329,7 +1329,7 @@ def __repr__(self): return "\n".join(lines) @annotate("BINARY_OP", color="orange", domain="cudf_python") - def _binaryop( + def _apply_op( self, other, fn, fill_value=None, reflect=False, can_reindex=False ): """ @@ -1369,7 +1369,7 @@ def _binaryop( mask = (lmask | rmask).data lhs = lhs.fillna(fill_value) rhs = rhs.fillna(fill_value) - result = lhs._binaryop(rhs, fn=fn, reflect=reflect) + result = lhs._apply_op(rhs, fn=fn, reflect=reflect) data = column.build_column( data=result.data, dtype=result.dtype, mask=mask ) @@ -1445,10 +1445,10 @@ def add(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "add", fill_value) + return self._apply_op(other, "add", fill_value) def __add__(self, other): - return self._binaryop(other, "add") + return self._apply_op(other, "add") def radd(self, other, fill_value=None, axis=0): """Addition of series and other, element-wise @@ -1493,12 +1493,12 @@ def radd(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other, "add", fill_value=fill_value, reflect=True ) def __radd__(self, other): - return self._binaryop(other, "add", reflect=True) + return self._apply_op(other, "add", reflect=True) def subtract(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise @@ -1544,12 +1544,12 @@ def subtract(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "sub", fill_value) + return self._apply_op(other, "sub", fill_value) sub = subtract def __sub__(self, other): - return self._binaryop(other, "sub") + return self._apply_op(other, "sub") def rsub(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise @@ -1594,10 +1594,10 @@ def rsub(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "sub", fill_value, reflect=True) + return self._apply_op(other, "sub", fill_value, reflect=True) def __rsub__(self, other): - return self._binaryop(other, "sub", reflect=True) + return self._apply_op(other, "sub", reflect=True) def multiply(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise @@ -1642,12 +1642,12 @@ def multiply(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mul", fill_value=fill_value) + return self._apply_op(other, "mul", fill_value=fill_value) mul = multiply def __mul__(self, other): - return self._binaryop(other, "mul") + return self._apply_op(other, "mul") def rmul(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise @@ -1695,10 +1695,10 @@ def rmul(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mul", fill_value, True) + return self._apply_op(other, "mul", fill_value, True) def __rmul__(self, other): - return self._binaryop(other, "mul", reflect=True) + return self._apply_op(other, "mul", reflect=True) def mod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise @@ -1733,10 +1733,10 @@ def mod(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mod", fill_value) + return self._apply_op(other, "mod", fill_value) def __mod__(self, other): - return self._binaryop(other, "mod") + return self._apply_op(other, "mod") def rmod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise @@ -1784,10 +1784,10 @@ def rmod(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mod", fill_value, True) + return self._apply_op(other, "mod", fill_value, True) def __rmod__(self, other): - return self._binaryop(other, "mod", reflect=True) + return self._apply_op(other, "mod", reflect=True) def pow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise @@ -1832,10 +1832,10 @@ def pow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "pow", fill_value) + return self._apply_op(other, "pow", fill_value) def __pow__(self, other): - return self._binaryop(other, "pow") + return self._apply_op(other, "pow") def rpow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise @@ -1880,10 +1880,10 @@ def rpow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "pow", fill_value, True) + return self._apply_op(other, "pow", fill_value, True) def __rpow__(self, other): - return self._binaryop(other, "pow", reflect=True) + return self._apply_op(other, "pow", reflect=True) def floordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise @@ -1928,10 +1928,10 @@ def floordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "floordiv", fill_value) + return self._apply_op(other, "floordiv", fill_value) def __floordiv__(self, other): - return self._binaryop(other, "floordiv") + return self._apply_op(other, "floordiv") def rfloordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise @@ -1984,10 +1984,10 @@ def rfloordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "floordiv", fill_value, True) + return self._apply_op(other, "floordiv", fill_value, True) def __rfloordiv__(self, other): - return self._binaryop(other, "floordiv", reflect=True) + return self._apply_op(other, "floordiv", reflect=True) def truediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -2032,10 +2032,10 @@ def truediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "truediv", fill_value) + return self._apply_op(other, "truediv", fill_value) def __truediv__(self, other): - return self._binaryop(other, "truediv") + return self._apply_op(other, "truediv") def rtruediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -2080,10 +2080,10 @@ def rtruediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "truediv", fill_value, True) + return self._apply_op(other, "truediv", fill_value, True) def __rtruediv__(self, other): - return self._binaryop(other, "truediv", reflect=True) + return self._apply_op(other, "truediv", reflect=True) __div__ = __truediv__ @@ -2097,7 +2097,7 @@ def _bitwise_binop(self, other, op): ): # TODO: This doesn't work on Series (op) DataFrame # because dataframe doesn't have dtype - ser = self._binaryop(other, op) + ser = self._apply_op(other, op) if np.issubdtype(self.dtype, np.bool_) or np.issubdtype( other.dtype, np.bool_ ): @@ -2128,15 +2128,15 @@ def __xor__(self, other): return self._bitwise_binop(other, "xor") def logical_and(self, other): - ser = self._binaryop(other, "l_and") + ser = self._apply_op(other, "l_and") return ser.astype(np.bool_) def remainder(self, other): - ser = self._binaryop(other, "mod") + ser = self._apply_op(other, "mod") return ser def logical_or(self, other): - ser = self._binaryop(other, "l_or") + ser = self._apply_op(other, "l_or") return ser.astype(np.bool_) def logical_not(self): @@ -2205,12 +2205,12 @@ def eq(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="eq", fill_value=fill_value, can_reindex=True ) def __eq__(self, other): - return self._binaryop(other, "eq") + return self._apply_op(other, "eq") def ne(self, other, fill_value=None, axis=0): """Not equal to of series and other, element-wise @@ -2260,12 +2260,12 @@ def ne(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="ne", fill_value=fill_value, can_reindex=True ) def __ne__(self, other): - return self._binaryop(other, "ne") + return self._apply_op(other, "ne") def lt(self, other, fill_value=None, axis=0): """Less than of series and other, element-wise @@ -2315,12 +2315,12 @@ def lt(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="lt", fill_value=fill_value, can_reindex=True ) def __lt__(self, other): - return self._binaryop(other, "lt") + return self._apply_op(other, "lt") def le(self, other, fill_value=None, axis=0): """Less than or equal to of series and other, element-wise @@ -2370,12 +2370,12 @@ def le(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="le", fill_value=fill_value, can_reindex=True ) def __le__(self, other): - return self._binaryop(other, "le") + return self._apply_op(other, "le") def gt(self, other, fill_value=None, axis=0): """Greater than of series and other, element-wise @@ -2425,12 +2425,12 @@ def gt(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="gt", fill_value=fill_value, can_reindex=True ) def __gt__(self, other): - return self._binaryop(other, "gt") + return self._apply_op(other, "gt") def ge(self, other, fill_value=None, axis=0): """Greater than or equal to of series and other, element-wise @@ -2480,12 +2480,12 @@ def ge(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( + return self._apply_op( other=other, fn="ge", fill_value=fill_value, can_reindex=True ) def __ge__(self, other): - return self._binaryop(other, "ge") + return self._apply_op(other, "ge") def __invert__(self): """Bitwise invert (~) for each element. From 252c9e1f53d6c4609334fd42677524dbba24b52f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 13:27:13 -0700 Subject: [PATCH 02/35] Simplify logic for DataFrame-Series binaryop. --- python/cudf/cudf/core/dataframe.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f2be0e3bd6e..04fd56e3c18 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1478,6 +1478,7 @@ def op(lhs, rhs): result[col] = getattr(self[col], fn)() return result elif isinstance(other, Sequence): + # This adds the ith element of other to the ith column of self. for k, col in enumerate(self._data): result[col] = getattr(self[col], fn)(other[k]) elif isinstance(other, DataFrame): @@ -1515,23 +1516,19 @@ def fallback(col, fn): result[col] = fallback(rhs[col], _reverse_op(fn)) elif isinstance(other, Series): other_cols = other.to_pandas().to_dict() - other_cols_keys = list(other_cols.keys()) - result_cols = list(self.columns) + result_cols = list(self._column_names) df_cols = list(result_cols) - for new_col in other_cols.keys(): + + for new_col in other_cols: if new_col not in result_cols: result_cols.append(new_col) for col in result_cols: - if col in df_cols and col in other_cols_keys: - l_opr = self[col] - r_opr = other_cols[col] - else: - if col not in df_cols: - r_opr = other_cols[col] - l_opr = Series(as_column(np.nan, length=len(self))) - if col not in other_cols_keys: - r_opr = None - l_opr = self[col] + l_opr = ( + self[col] + if col in df_cols + else Series(as_column(np.nan, length=len(self))) + ) + r_opr = other_cols[col] if col in other_cols else None result[col] = op(l_opr, r_opr) elif isinstance(other, (numbers.Number, cudf.Scalar)) or ( From f489c198b426b6257432658cd2e83664c901e52e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 14:02:04 -0700 Subject: [PATCH 03/35] Minor simplifications. --- python/cudf/cudf/core/dataframe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 04fd56e3c18..9b0f9c78c44 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1476,7 +1476,6 @@ def op(lhs, rhs): if other is None: for col in self._data: result[col] = getattr(self[col], fn)() - return result elif isinstance(other, Sequence): # This adds the ith element of other to the ith column of self. for k, col in enumerate(self._data): @@ -1528,8 +1527,7 @@ def fallback(col, fn): if col in df_cols else Series(as_column(np.nan, length=len(self))) ) - r_opr = other_cols[col] if col in other_cols else None - result[col] = op(l_opr, r_opr) + result[col] = op(l_opr, other_cols.get(col)) elif isinstance(other, (numbers.Number, cudf.Scalar)) or ( isinstance(other, np.ndarray) and other.ndim == 0 From 9c0dc5374ddb1a4e895033891c5492cbf156d516 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 14:32:39 -0700 Subject: [PATCH 04/35] Simplify construction of result columns list. --- python/cudf/cudf/core/dataframe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9b0f9c78c44..1580f80f989 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1515,19 +1515,19 @@ def fallback(col, fn): result[col] = fallback(rhs[col], _reverse_op(fn)) elif isinstance(other, Series): other_cols = other.to_pandas().to_dict() - result_cols = list(self._column_names) - df_cols = list(result_cols) + df_cols = self._column_names + result_cols = df_cols + tuple( + col for col in other_cols if col not in df_cols + ) - for new_col in other_cols: - if new_col not in result_cols: - result_cols.append(new_col) for col in result_cols: l_opr = ( self[col] if col in df_cols else Series(as_column(np.nan, length=len(self))) ) - result[col] = op(l_opr, other_cols.get(col)) + r_opr = other_cols.get(col) + result[col] = op(l_opr, r_opr) elif isinstance(other, (numbers.Number, cudf.Scalar)) or ( isinstance(other, np.ndarray) and other.ndim == 0 From 19f1b69dbe93ef07e444538b89a5fd62875a758e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 14:54:15 -0700 Subject: [PATCH 05/35] Revert "Rename Series._binaryop to _apply_op for consistency with DataFrame." This reverts commit a49f3748c7d19fa39181df05549c7a7dcced4a9a. --- python/cudf/cudf/core/series.py | 92 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2c4b76c3eb8..5ee40d576b6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1329,7 +1329,7 @@ def __repr__(self): return "\n".join(lines) @annotate("BINARY_OP", color="orange", domain="cudf_python") - def _apply_op( + def _binaryop( self, other, fn, fill_value=None, reflect=False, can_reindex=False ): """ @@ -1369,7 +1369,7 @@ def _apply_op( mask = (lmask | rmask).data lhs = lhs.fillna(fill_value) rhs = rhs.fillna(fill_value) - result = lhs._apply_op(rhs, fn=fn, reflect=reflect) + result = lhs._binaryop(rhs, fn=fn, reflect=reflect) data = column.build_column( data=result.data, dtype=result.dtype, mask=mask ) @@ -1445,10 +1445,10 @@ def add(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "add", fill_value) + return self._binaryop(other, "add", fill_value) def __add__(self, other): - return self._apply_op(other, "add") + return self._binaryop(other, "add") def radd(self, other, fill_value=None, axis=0): """Addition of series and other, element-wise @@ -1493,12 +1493,12 @@ def radd(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other, "add", fill_value=fill_value, reflect=True ) def __radd__(self, other): - return self._apply_op(other, "add", reflect=True) + return self._binaryop(other, "add", reflect=True) def subtract(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise @@ -1544,12 +1544,12 @@ def subtract(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "sub", fill_value) + return self._binaryop(other, "sub", fill_value) sub = subtract def __sub__(self, other): - return self._apply_op(other, "sub") + return self._binaryop(other, "sub") def rsub(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise @@ -1594,10 +1594,10 @@ def rsub(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "sub", fill_value, reflect=True) + return self._binaryop(other, "sub", fill_value, reflect=True) def __rsub__(self, other): - return self._apply_op(other, "sub", reflect=True) + return self._binaryop(other, "sub", reflect=True) def multiply(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise @@ -1642,12 +1642,12 @@ def multiply(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "mul", fill_value=fill_value) + return self._binaryop(other, "mul", fill_value=fill_value) mul = multiply def __mul__(self, other): - return self._apply_op(other, "mul") + return self._binaryop(other, "mul") def rmul(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise @@ -1695,10 +1695,10 @@ def rmul(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "mul", fill_value, True) + return self._binaryop(other, "mul", fill_value, True) def __rmul__(self, other): - return self._apply_op(other, "mul", reflect=True) + return self._binaryop(other, "mul", reflect=True) def mod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise @@ -1733,10 +1733,10 @@ def mod(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "mod", fill_value) + return self._binaryop(other, "mod", fill_value) def __mod__(self, other): - return self._apply_op(other, "mod") + return self._binaryop(other, "mod") def rmod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise @@ -1784,10 +1784,10 @@ def rmod(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "mod", fill_value, True) + return self._binaryop(other, "mod", fill_value, True) def __rmod__(self, other): - return self._apply_op(other, "mod", reflect=True) + return self._binaryop(other, "mod", reflect=True) def pow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise @@ -1832,10 +1832,10 @@ def pow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "pow", fill_value) + return self._binaryop(other, "pow", fill_value) def __pow__(self, other): - return self._apply_op(other, "pow") + return self._binaryop(other, "pow") def rpow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise @@ -1880,10 +1880,10 @@ def rpow(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "pow", fill_value, True) + return self._binaryop(other, "pow", fill_value, True) def __rpow__(self, other): - return self._apply_op(other, "pow", reflect=True) + return self._binaryop(other, "pow", reflect=True) def floordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise @@ -1928,10 +1928,10 @@ def floordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "floordiv", fill_value) + return self._binaryop(other, "floordiv", fill_value) def __floordiv__(self, other): - return self._apply_op(other, "floordiv") + return self._binaryop(other, "floordiv") def rfloordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise @@ -1984,10 +1984,10 @@ def rfloordiv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "floordiv", fill_value, True) + return self._binaryop(other, "floordiv", fill_value, True) def __rfloordiv__(self, other): - return self._apply_op(other, "floordiv", reflect=True) + return self._binaryop(other, "floordiv", reflect=True) def truediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -2032,10 +2032,10 @@ def truediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "truediv", fill_value) + return self._binaryop(other, "truediv", fill_value) def __truediv__(self, other): - return self._apply_op(other, "truediv") + return self._binaryop(other, "truediv") def rtruediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise @@ -2080,10 +2080,10 @@ def rtruediv(self, other, fill_value=None, axis=0): """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op(other, "truediv", fill_value, True) + return self._binaryop(other, "truediv", fill_value, True) def __rtruediv__(self, other): - return self._apply_op(other, "truediv", reflect=True) + return self._binaryop(other, "truediv", reflect=True) __div__ = __truediv__ @@ -2097,7 +2097,7 @@ def _bitwise_binop(self, other, op): ): # TODO: This doesn't work on Series (op) DataFrame # because dataframe doesn't have dtype - ser = self._apply_op(other, op) + ser = self._binaryop(other, op) if np.issubdtype(self.dtype, np.bool_) or np.issubdtype( other.dtype, np.bool_ ): @@ -2128,15 +2128,15 @@ def __xor__(self, other): return self._bitwise_binop(other, "xor") def logical_and(self, other): - ser = self._apply_op(other, "l_and") + ser = self._binaryop(other, "l_and") return ser.astype(np.bool_) def remainder(self, other): - ser = self._apply_op(other, "mod") + ser = self._binaryop(other, "mod") return ser def logical_or(self, other): - ser = self._apply_op(other, "l_or") + ser = self._binaryop(other, "l_or") return ser.astype(np.bool_) def logical_not(self): @@ -2205,12 +2205,12 @@ def eq(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="eq", fill_value=fill_value, can_reindex=True ) def __eq__(self, other): - return self._apply_op(other, "eq") + return self._binaryop(other, "eq") def ne(self, other, fill_value=None, axis=0): """Not equal to of series and other, element-wise @@ -2260,12 +2260,12 @@ def ne(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="ne", fill_value=fill_value, can_reindex=True ) def __ne__(self, other): - return self._apply_op(other, "ne") + return self._binaryop(other, "ne") def lt(self, other, fill_value=None, axis=0): """Less than of series and other, element-wise @@ -2315,12 +2315,12 @@ def lt(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="lt", fill_value=fill_value, can_reindex=True ) def __lt__(self, other): - return self._apply_op(other, "lt") + return self._binaryop(other, "lt") def le(self, other, fill_value=None, axis=0): """Less than or equal to of series and other, element-wise @@ -2370,12 +2370,12 @@ def le(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="le", fill_value=fill_value, can_reindex=True ) def __le__(self, other): - return self._apply_op(other, "le") + return self._binaryop(other, "le") def gt(self, other, fill_value=None, axis=0): """Greater than of series and other, element-wise @@ -2425,12 +2425,12 @@ def gt(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="gt", fill_value=fill_value, can_reindex=True ) def __gt__(self, other): - return self._apply_op(other, "gt") + return self._binaryop(other, "gt") def ge(self, other, fill_value=None, axis=0): """Greater than or equal to of series and other, element-wise @@ -2480,12 +2480,12 @@ def ge(self, other, fill_value=None, axis=0): """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") - return self._apply_op( + return self._binaryop( other=other, fn="ge", fill_value=fill_value, can_reindex=True ) def __ge__(self, other): - return self._apply_op(other, "ge") + return self._binaryop(other, "ge") def __invert__(self): """Bitwise invert (~) for each element. From bc7585fc3b46175149e3ede3f0d7dbdf284ae5df Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 10:08:00 -0700 Subject: [PATCH 06/35] Simplify null handling logic in binops. --- python/cudf/cudf/core/series.py | 15 ++------------- python/cudf/cudf/tests/test_series.py | 8 ++++++++ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5ee40d576b6..7eca39b40f8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1363,20 +1363,9 @@ def _binaryop( if is_scalar(rhs): lhs = lhs.fillna(fill_value) else: - if lhs.nullable and rhs.nullable: - lmask = Series(data=lhs.nullmask) - rmask = Series(data=rhs.nullmask) - mask = (lmask | rmask).data + if lhs.nullable: lhs = lhs.fillna(fill_value) - rhs = rhs.fillna(fill_value) - result = lhs._binaryop(rhs, fn=fn, reflect=reflect) - data = column.build_column( - data=result.data, dtype=result.dtype, mask=mask - ) - return lhs._copy_construct(data=data) - elif lhs.nullable: - lhs = lhs.fillna(fill_value) - elif rhs.nullable: + if rhs.nullable: rhs = rhs.fillna(fill_value) outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 0cc0ad57745..4f9b52be7c7 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1196,3 +1196,11 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) else: assert_eq(expect, got, check_dtype=False) + + +def test_binop_add_nullable_fill(): + """Test a binary op (addition) with a fill value on two nullable series.""" + x = cudf.Series([1, 2, None]) + y = cudf.Series([1, None, 3]) + assert_eq(x.add(y), cudf.Series([2, None, None])) + assert_eq(x.add(y, fill_value=0), cudf.Series([2, 2, 3])) From 00dffb5df8d6c239faf2da8b85e8fe9695b0a9a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 11:40:29 -0700 Subject: [PATCH 07/35] More simplifications. --- python/cudf/cudf/core/series.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7eca39b40f8..5fff5cc424b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1360,24 +1360,21 @@ def _binaryop( lhs = lhs.astype(truediv_type) if fill_value is not None: - if is_scalar(rhs): + if lhs.nullable: lhs = lhs.fillna(fill_value) - else: - if lhs.nullable: - lhs = lhs.fillna(fill_value) - if rhs.nullable: - rhs = rhs.fillna(fill_value) + if not is_scalar(rhs) and rhs.nullable: + rhs = rhs.fillna(fill_value) outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) # Get the appropriate name for output operations involving two objects - # that are a mix of pandas and cudf Series and Index. If the two inputs - # are identically named, the output shares this name. - if isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)): - if self.name == other.name: - result_name = self.name - else: - result_name = None + # that are Series-like objects. The output shares the lhs's name unless + # the rhs is a _differently_ named Series-like object. + if ( + isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) + and self.name != other.name + ): + result_name = None else: result_name = self.name From 5cae0a434ebf5ba359fb6aebbd9ff663c34dea1a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 11:48:11 -0700 Subject: [PATCH 08/35] Move copy construct from series to frame. --- python/cudf/cudf/core/frame.py | 10 ++++++++++ python/cudf/cudf/core/series.py | 11 ++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4a434be42ce..5bb741ccb0e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -73,6 +73,16 @@ def _mimic_inplace( else: return result + @property + def _copy_construct_defaults(self): + """A default dictionary of kwargs to be used for copy construction.""" + raise NotImplementedError + + def _copy_construct(self, **kwargs): + """Shallow copy this object by replacing certain ctor args. + """ + return self.__class__(**{**self._copy_construct_defaults, **kwargs}) + @property def size(self): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5fff5cc424b..031f3e24f45 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -390,16 +390,9 @@ def deserialize(cls, header, frames): return Series(column, index=index, name=name) + @property def _copy_construct_defaults(self): - return dict(data=self._column, index=self._index, name=self.name) - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - params = self._copy_construct_defaults() - cls = type(self) - params.update(kwargs) - return cls(**params) + return {"data": self._column, "index": self._index, "name": self.name} def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` From ddebce620e2b242945e70a4ded937f1922bc0f05 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 12:10:23 -0700 Subject: [PATCH 09/35] Add trivial internal method for index non-reflective binary ops (currently still a pass-through to Series). --- python/cudf/cudf/core/index.py | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5f390be79e2..ada48c1deb6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -794,6 +794,9 @@ def _apply_op(self, fn, other=None): else: return as_index(op()) + def _binaryop(self, fn, other): + return self._apply_op(fn, other) + def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices @@ -890,40 +893,40 @@ def unique(self): return as_index(self._values.unique(), name=self.name) def __add__(self, other): - return self._apply_op("__add__", other) + return self._binaryop("__add__", other) def __radd__(self, other): return self._apply_op("__radd__", other) def __sub__(self, other): - return self._apply_op("__sub__", other) + return self._binaryop("__sub__", other) def __rsub__(self, other): return self._apply_op("__rsub__", other) def __mul__(self, other): - return self._apply_op("__mul__", other) + return self._binaryop("__mul__", other) def __rmul__(self, other): return self._apply_op("__rmul__", other) def __mod__(self, other): - return self._apply_op("__mod__", other) + return self._binaryop("__mod__", other) def __rmod__(self, other): return self._apply_op("__rmod__", other) def __pow__(self, other): - return self._apply_op("__pow__", other) + return self._binaryop("__pow__", other) def __floordiv__(self, other): - return self._apply_op("__floordiv__", other) + return self._binaryop("__floordiv__", other) def __rfloordiv__(self, other): return self._apply_op("__rfloordiv__", other) def __truediv__(self, other): - return self._apply_op("__truediv__", other) + return self._binaryop("__truediv__", other) def __rtruediv__(self, other): return self._apply_op("__rtruediv__", other) @@ -940,22 +943,22 @@ def __xor__(self, other): return self._apply_op("__xor__", other) def __eq__(self, other): - return self._apply_op("__eq__", other) + return self._binaryop("__eq__", other) def __ne__(self, other): - return self._apply_op("__ne__", other) + return self._binaryop("__ne__", other) def __lt__(self, other): - return self._apply_op("__lt__", other) + return self._binaryop("__lt__", other) def __le__(self, other): - return self._apply_op("__le__", other) + return self._binaryop("__le__", other) def __gt__(self, other): - return self._apply_op("__gt__", other) + return self._binaryop("__gt__", other) def __ge__(self, other): - return self._apply_op("__ge__", other) + return self._binaryop("__ge__", other) def join( self, other, how="left", level=None, return_indexers=False, sort=False @@ -1425,6 +1428,10 @@ def _from_table(cls, table): else: return as_index(table) + @property + def _copy_construct_defaults(self): + return {"data": self._column, "name": self.name} + @classmethod def _from_data(cls, data, index=None): return cls._from_table(SingleColumnFrame(data=data)) @@ -1626,9 +1633,6 @@ def __getitem__(self, index): return as_index(self._values[index], name=self.name) - def __eq__(self, other): - return super(type(self), self).__eq__(other) - def equals(self, other): if isinstance(other, RangeIndex): if (self._start, self._stop, self._step) == ( From 3f26b410dea8f8416f0d57a4bc55ecc976f5d0a2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 13:53:29 -0700 Subject: [PATCH 10/35] Initial version of binary op for index. --- python/cudf/cudf/core/index.py | 55 ++++++++++++++++++++++++++- python/cudf/cudf/tests/test_series.py | 15 ++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ada48c1deb6..eee68517619 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -795,7 +795,60 @@ def _apply_op(self, fn, other=None): return as_index(op()) def _binaryop(self, fn, other): - return self._apply_op(fn, other) + if isinstance(other, cudf.DataFrame): + return NotImplemented + + if isinstance(other, cudf.Series): + # TODO: Change this to fall back to adding series, first checking + # that the lengths of self and other are the same. Should be fine + # to just return NotImplemented I think. + self._apply_op(fn, other) + else: + lhs, rhs = self, other + rhs = self._normalize_binop_value(rhs) + + if fn == "truediv": + if str(lhs.dtype) in truediv_int_dtype_corrections: + truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] + lhs = lhs.astype(truediv_type) + + if fill_value is not None: + if lhs.nullable: + lhs = lhs.fillna(fill_value) + if not is_scalar(rhs) and rhs.nullable: + rhs = rhs.fillna(fill_value) + + outcol = lhs._column.binary_operator(fn, rhs) + + # Get the appropriate name for output operations involving two objects + # that are Series-like objects. The output shares the lhs's name unless + # the rhs is a _differently_ named Series-like object. + if ( + isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) + and self.name != other.name + ): + result_name = None + else: + result_name = self.name + + return lhs._copy_construct(data=outcol, name=result_name) + + def _normalize_binop_value(self, other): + """Returns a *column* (not a Series) or scalar for performing + binary operations with self._column. + """ + # TODO: Fix some of this, it may not make sense to have something + # general for multiple frame types. + if isinstance(other, ColumnBase): + return other + if isinstance(other, cudf.Series): + return other._column + elif isinstance(other, cudf.Index): + return cudf.Series(other)._column + elif other is cudf.NA: + return cudf.Scalar(other, dtype=self.dtype) + else: + return self._column.normalize_binop_value(other) def sort_values(self, return_indexer=False, ascending=True, key=None): """ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 4f9b52be7c7..27e23d0840b 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1204,3 +1204,18 @@ def test_binop_add_nullable_fill(): y = cudf.Series([1, None, 3]) assert_eq(x.add(y), cudf.Series([2, None, None])) assert_eq(x.add(y, fill_value=0), cudf.Series([2, 2, 3])) + + +# def test_binop_indexed_series_withindex(): +# """Test a binary op (addition) with a non-integral index and an Index.""" +# psr = pd.Series([1, 2, 3], index=['a', 'b', 'c']) +# pidx = pd.Index([1, None, 3]) +# +# gsr = cudf.Series(psr) +# gidx = cudf.Index(pidx) +# assert_eq(psr + pidx, gsr + gidx) +# assert_eq(psr + pd.Series(pidx), gsr + cudf.Series(gidx)) +# assert_eq(pidx + psr, gidx + gsr) +# assert_eq(pd.Series(pidx) + psr, cudf.Series(gidx) + gsr) +# +# # TODO: Also test with arbitrary sequence objects (like lists). From 8216e6c3110c0f2f0391d66cd180ed5617dc3ec7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 16:07:54 -0700 Subject: [PATCH 11/35] Revert dataframe changes (restricting this PR to index and series only). --- python/cudf/cudf/core/dataframe.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1580f80f989..f2be0e3bd6e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1476,8 +1476,8 @@ def op(lhs, rhs): if other is None: for col in self._data: result[col] = getattr(self[col], fn)() + return result elif isinstance(other, Sequence): - # This adds the ith element of other to the ith column of self. for k, col in enumerate(self._data): result[col] = getattr(self[col], fn)(other[k]) elif isinstance(other, DataFrame): @@ -1515,18 +1515,23 @@ def fallback(col, fn): result[col] = fallback(rhs[col], _reverse_op(fn)) elif isinstance(other, Series): other_cols = other.to_pandas().to_dict() - df_cols = self._column_names - result_cols = df_cols + tuple( - col for col in other_cols if col not in df_cols - ) - + other_cols_keys = list(other_cols.keys()) + result_cols = list(self.columns) + df_cols = list(result_cols) + for new_col in other_cols.keys(): + if new_col not in result_cols: + result_cols.append(new_col) for col in result_cols: - l_opr = ( - self[col] - if col in df_cols - else Series(as_column(np.nan, length=len(self))) - ) - r_opr = other_cols.get(col) + if col in df_cols and col in other_cols_keys: + l_opr = self[col] + r_opr = other_cols[col] + else: + if col not in df_cols: + r_opr = other_cols[col] + l_opr = Series(as_column(np.nan, length=len(self))) + if col not in other_cols_keys: + r_opr = None + l_opr = self[col] result[col] = op(l_opr, r_opr) elif isinstance(other, (numbers.Number, cudf.Scalar)) or ( From 26e22ffb049d80d98fb1c64d9af778a4888706c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 16:58:49 -0700 Subject: [PATCH 12/35] Basic working implementation of index binops. --- python/cudf/cudf/core/index.py | 67 ++++++++++++++++++--------------- python/cudf/cudf/core/series.py | 4 +- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eee68517619..77bf3b51663 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -795,29 +795,36 @@ def _apply_op(self, fn, other=None): return as_index(op()) def _binaryop(self, fn, other): - if isinstance(other, cudf.DataFrame): + assert False + if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented - if isinstance(other, cudf.Series): - # TODO: Change this to fall back to adding series, first checking - # that the lengths of self and other are the same. Should be fine - # to just return NotImplemented I think. - self._apply_op(fn, other) - else: - lhs, rhs = self, other - rhs = self._normalize_binop_value(rhs) + lhs, rhs = self, self._normalize_binop_value(other) + + truediv_int_dtype_corrections = { + "int8": "float32", + "int16": "float32", + "int32": "float32", + "int64": "float64", + "uint8": "float32", + "uint16": "float32", + "uint32": "float64", + "uint64": "float64", + "bool": "float32", + "int": "float", + } if fn == "truediv": if str(lhs.dtype) in truediv_int_dtype_corrections: truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] lhs = lhs.astype(truediv_type) - if fill_value is not None: - if lhs.nullable: - lhs = lhs.fillna(fill_value) - if not is_scalar(rhs) and rhs.nullable: - rhs = rhs.fillna(fill_value) - + # if fill_value is not None: + # if lhs.nullable: + # lhs = lhs.fillna(fill_value) + # if not is_scalar(rhs) and rhs.nullable: + # rhs = rhs.fillna(fill_value) + # outcol = lhs._column.binary_operator(fn, rhs) # Get the appropriate name for output operations involving two objects @@ -841,10 +848,8 @@ def _normalize_binop_value(self, other): # general for multiple frame types. if isinstance(other, ColumnBase): return other - if isinstance(other, cudf.Series): + if isinstance(other, (cudf.Series, cudf.Index)): return other._column - elif isinstance(other, cudf.Index): - return cudf.Series(other)._column elif other is cudf.NA: return cudf.Scalar(other, dtype=self.dtype) else: @@ -946,40 +951,40 @@ def unique(self): return as_index(self._values.unique(), name=self.name) def __add__(self, other): - return self._binaryop("__add__", other) + return self._binaryop("add", other) def __radd__(self, other): return self._apply_op("__radd__", other) def __sub__(self, other): - return self._binaryop("__sub__", other) + return self._binaryop("sub", other) def __rsub__(self, other): return self._apply_op("__rsub__", other) def __mul__(self, other): - return self._binaryop("__mul__", other) + return self._binaryop("mul", other) def __rmul__(self, other): return self._apply_op("__rmul__", other) def __mod__(self, other): - return self._binaryop("__mod__", other) + return self._binaryop("mod", other) def __rmod__(self, other): return self._apply_op("__rmod__", other) def __pow__(self, other): - return self._binaryop("__pow__", other) + return self._binaryop("pow", other) def __floordiv__(self, other): - return self._binaryop("__floordiv__", other) + return self._binaryop("floordiv", other) def __rfloordiv__(self, other): return self._apply_op("__rfloordiv__", other) def __truediv__(self, other): - return self._binaryop("__truediv__", other) + return self._binaryop("truediv", other) def __rtruediv__(self, other): return self._apply_op("__rtruediv__", other) @@ -996,22 +1001,22 @@ def __xor__(self, other): return self._apply_op("__xor__", other) def __eq__(self, other): - return self._binaryop("__eq__", other) + return self._binaryop("eq", other) def __ne__(self, other): - return self._binaryop("__ne__", other) + return self._binaryop("ne", other) def __lt__(self, other): - return self._binaryop("__lt__", other) + return self._binaryop("lt", other) def __le__(self, other): - return self._binaryop("__le__", other) + return self._binaryop("le", other) def __gt__(self, other): - return self._binaryop("__gt__", other) + return self._binaryop("gt", other) def __ge__(self, other): - return self._binaryop("__ge__", other) + return self._binaryop("ge", other) def join( self, other, how="left", level=None, return_indexers=False, sort=False diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 031f3e24f45..e53da9de59c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2127,10 +2127,8 @@ def _normalize_binop_value(self, other): """ if isinstance(other, ColumnBase): return other - if isinstance(other, Series): + if isinstance(other, (Series, Index)): return other._column - elif isinstance(other, Index): - return Series(other)._column elif other is cudf.NA: return cudf.Scalar(other, dtype=self.dtype) else: From 946f6b1b072fea78d6ab3bb7bb4c58016dcb4a23 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 17:25:49 -0700 Subject: [PATCH 13/35] Add tests for binary operations. --- python/cudf/cudf/core/index.py | 1 - python/cudf/cudf/tests/test_index.py | 36 +++++++++++++++++++++++++++ python/cudf/cudf/tests/test_series.py | 36 +++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 77bf3b51663..10c4e75091c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -795,7 +795,6 @@ def _apply_op(self, fn, other=None): return as_index(op()) def _binaryop(self, fn, other): - assert False if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 158dffc3884..3a6855fac25 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -5,10 +5,14 @@ """ import re +import cupy as cp import numpy as np import pandas as pd import pyarrow as pa import pytest +from hypothesis import given, settings +from hypothesis.extra.numpy import arrays +from hypothesis.strategies import floats import cudf from cudf.core._compat import PANDAS_GE_110 @@ -2058,3 +2062,35 @@ def test_index_set_names_error(idx, level, names): lfunc_args_and_kwargs=([], {"names": names, "level": level}), rfunc_args_and_kwargs=([], {"names": names, "level": level}), ) + + +@pytest.mark.parametrize( + "binop", + [ + "__add__", + "__sub__", + "__mul__", + "__floordiv__", + "__truediv__", + "__eq__", + "__ne__", + "__lt__", + "__le__", + "__gt__", + "__ge__", + ], +) +@given( + x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), + y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), +) +@settings(deadline=None) +def test_binops(binop, x, y): + """Test binary operations.""" + x = cudf.Index(x) + y = cudf.Index(y) + got = (getattr(x, binop)(y)).values + expected = getattr(x.values, binop)(y.values) + assert cp.all(got == expected) or ( + cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 27e23d0840b..434e1bb30d9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -4,9 +4,13 @@ import re from string import ascii_letters, digits +import cupy as cp import numpy as np import pandas as pd import pytest +from hypothesis import given, settings +from hypothesis.extra.numpy import arrays +from hypothesis.strategies import floats import cudf from cudf.tests.utils import ( @@ -1198,6 +1202,38 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) +@pytest.mark.parametrize( + "binop", + [ + "__add__", + "__sub__", + "__mul__", + "__floordiv__", + "__truediv__", + "__eq__", + "__ne__", + "__lt__", + "__le__", + "__gt__", + "__ge__", + ], +) +@given( + x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), + y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), +) +@settings(deadline=None) +def test_binops(binop, x, y): + """Test binary operations.""" + x = cudf.Series(x) + y = cudf.Series(y) + got = (getattr(x, binop)(y)).values + expected = getattr(x.values, binop)(y.values) + assert cp.all(got == expected) or ( + cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) + ) + + def test_binop_add_nullable_fill(): """Test a binary op (addition) with a fill value on two nullable series.""" x = cudf.Series([1, 2, None]) From e94b2cf9012ee1497f11fe69c155bf27fb8c2bd0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 19:40:57 -0700 Subject: [PATCH 14/35] Centralize tests for index and series. --- .../cudf/tests/single_column_frame_tests.py | 76 +++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 40 ++-------- python/cudf/cudf/tests/test_series.py | 61 ++------------- 3 files changed, 87 insertions(+), 90 deletions(-) create mode 100644 python/cudf/cudf/tests/single_column_frame_tests.py diff --git a/python/cudf/cudf/tests/single_column_frame_tests.py b/python/cudf/cudf/tests/single_column_frame_tests.py new file mode 100644 index 00000000000..4b842080754 --- /dev/null +++ b/python/cudf/cudf/tests/single_column_frame_tests.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pytest +from hypothesis import given, settings +from hypothesis.extra.numpy import arrays +from hypothesis.strategies import floats + +# from cudf.tests.utils import ( +# assert_eq, +# ) + + +class SingleColumnFrameTests: + """Common set of tests for use with subclasses of SingleColumnFrame. + + Subclasses must define the _cls class variable, which is used to construct + suitable objects in the tests. + """ + + _cls = None + + @pytest.mark.parametrize( + "binop", + [ + "__add__", + "__sub__", + "__mul__", + "__floordiv__", + "__truediv__", + "__eq__", + "__ne__", + "__lt__", + "__le__", + "__gt__", + "__ge__", + ], + ) + @given( + x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), + y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), + ) + @settings(deadline=None) + def test_binops(self, binop, x, y): + """Test binary operations.""" + x = self._cls(x) + y = self._cls(y) + got = (getattr(x, binop)(y)).values + expected = getattr(x.values, binop)(y.values) + assert cp.all(got == expected) or ( + cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) + ) + + +# def test_binop_add_nullable_fill(): +# """Test a binary op with a fill value on two nullable series.""" +# x = cudf.Series([1, 2, None]) +# y = cudf.Series([1, None, 3]) +# assert_eq(x.add(y), cudf.Series([2, None, None])) +# assert_eq(x.add(y, fill_value=0), cudf.Series([2, 2, 3])) +# +# +# def test_binop_indexed_series_withindex(): +# """Test a binary op (addition) with a non-integral index and an Index.""" +# psr = pd.Series([1, 2, 3], index=['a', 'b', 'c']) +# pidx = pd.Index([1, None, 3]) +# +# gsr = cudf.Series(psr) +# gidx = cudf.Index(pidx) +# assert_eq(psr + pidx, gsr + gidx) +# assert_eq(psr + pd.Series(pidx), gsr + cudf.Series(gidx)) +# assert_eq(pidx + psr, gidx + gsr) +# assert_eq(pd.Series(pidx) + psr, cudf.Series(gidx) + gsr) +# +# # TODO: Also test with arbitrary sequence objects (like lists). diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3a6855fac25..39ae1741508 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -5,14 +5,10 @@ """ import re -import cupy as cp import numpy as np import pandas as pd import pyarrow as pa import pytest -from hypothesis import given, settings -from hypothesis.extra.numpy import arrays -from hypothesis.strategies import floats import cudf from cudf.core._compat import PANDAS_GE_110 @@ -37,6 +33,8 @@ ) from cudf.utils.utils import search_range +from .single_column_frame_tests import SingleColumnFrameTests + def test_df_set_index_from_series(): df = cudf.DataFrame() @@ -2064,33 +2062,7 @@ def test_index_set_names_error(idx, level, names): ) -@pytest.mark.parametrize( - "binop", - [ - "__add__", - "__sub__", - "__mul__", - "__floordiv__", - "__truediv__", - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - ], -) -@given( - x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), - y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), -) -@settings(deadline=None) -def test_binops(binop, x, y): - """Test binary operations.""" - x = cudf.Index(x) - y = cudf.Index(y) - got = (getattr(x, binop)(y)).values - expected = getattr(x.values, binop)(y.values) - assert cp.all(got == expected) or ( - cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) - ) +class TestIndex(SingleColumnFrameTests): + """Tests that share logic with other single column frames.""" + + _cls = cudf.Index diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 434e1bb30d9..290669f2d83 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -4,13 +4,9 @@ import re from string import ascii_letters, digits -import cupy as cp import numpy as np import pandas as pd import pytest -from hypothesis import given, settings -from hypothesis.extra.numpy import arrays -from hypothesis.strategies import floats import cudf from cudf.tests.utils import ( @@ -21,6 +17,8 @@ assert_exceptions_equal, ) +from .single_column_frame_tests import SingleColumnFrameTests + def _series_na_data(): return [ @@ -1202,56 +1200,7 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize( - "binop", - [ - "__add__", - "__sub__", - "__mul__", - "__floordiv__", - "__truediv__", - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - ], -) -@given( - x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), - y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), -) -@settings(deadline=None) -def test_binops(binop, x, y): - """Test binary operations.""" - x = cudf.Series(x) - y = cudf.Series(y) - got = (getattr(x, binop)(y)).values - expected = getattr(x.values, binop)(y.values) - assert cp.all(got == expected) or ( - cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) - ) - +class TestSeries(SingleColumnFrameTests): + """Tests that share logic with other single column frames.""" -def test_binop_add_nullable_fill(): - """Test a binary op (addition) with a fill value on two nullable series.""" - x = cudf.Series([1, 2, None]) - y = cudf.Series([1, None, 3]) - assert_eq(x.add(y), cudf.Series([2, None, None])) - assert_eq(x.add(y, fill_value=0), cudf.Series([2, 2, 3])) - - -# def test_binop_indexed_series_withindex(): -# """Test a binary op (addition) with a non-integral index and an Index.""" -# psr = pd.Series([1, 2, 3], index=['a', 'b', 'c']) -# pidx = pd.Index([1, None, 3]) -# -# gsr = cudf.Series(psr) -# gidx = cudf.Index(pidx) -# assert_eq(psr + pidx, gsr + gidx) -# assert_eq(psr + pd.Series(pidx), gsr + cudf.Series(gidx)) -# assert_eq(pidx + psr, gidx + gsr) -# assert_eq(pd.Series(pidx) + psr, cudf.Series(gidx) + gsr) -# -# # TODO: Also test with arbitrary sequence objects (like lists). + _cls = cudf.Series From c924936971826382c3073180aea3840b1400df99 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 20:22:47 -0700 Subject: [PATCH 15/35] Add reflected ops. --- python/cudf/cudf/core/index.py | 16 ++++++++-------- .../cudf/cudf/tests/single_column_frame_tests.py | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 10c4e75091c..5954ef89ea9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -794,7 +794,7 @@ def _apply_op(self, fn, other=None): else: return as_index(op()) - def _binaryop(self, fn, other): + def _binaryop(self, fn, other, reflect=False): if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented @@ -824,7 +824,7 @@ def _binaryop(self, fn, other): # if not is_scalar(rhs) and rhs.nullable: # rhs = rhs.fillna(fill_value) # - outcol = lhs._column.binary_operator(fn, rhs) + outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless @@ -953,25 +953,25 @@ def __add__(self, other): return self._binaryop("add", other) def __radd__(self, other): - return self._apply_op("__radd__", other) + return self._binaryop("add", other, reflect=True) def __sub__(self, other): return self._binaryop("sub", other) def __rsub__(self, other): - return self._apply_op("__rsub__", other) + return self._binaryop("sub", other, reflect=True) def __mul__(self, other): return self._binaryop("mul", other) def __rmul__(self, other): - return self._apply_op("__rmul__", other) + return self._binaryop("mul", other, reflect=True) def __mod__(self, other): return self._binaryop("mod", other) def __rmod__(self, other): - return self._apply_op("__rmod__", other) + return self._binaryop("mod", other, reflect=True) def __pow__(self, other): return self._binaryop("pow", other) @@ -980,13 +980,13 @@ def __floordiv__(self, other): return self._binaryop("floordiv", other) def __rfloordiv__(self, other): - return self._apply_op("__rfloordiv__", other) + return self._binaryop("floordiv", other, reflect=True) def __truediv__(self, other): return self._binaryop("truediv", other) def __rtruediv__(self, other): - return self._apply_op("__rtruediv__", other) + return self._binaryop("truediv", other, reflect=True) __div__ = __truediv__ diff --git a/python/cudf/cudf/tests/single_column_frame_tests.py b/python/cudf/cudf/tests/single_column_frame_tests.py index 4b842080754..a1e73ca3381 100644 --- a/python/cudf/cudf/tests/single_column_frame_tests.py +++ b/python/cudf/cudf/tests/single_column_frame_tests.py @@ -24,11 +24,13 @@ class SingleColumnFrameTests: @pytest.mark.parametrize( "binop", [ + # Arithmetic operations. "__add__", "__sub__", "__mul__", "__floordiv__", "__truediv__", + # Logical operations. "__eq__", "__ne__", "__lt__", From e40814f7b658676dc557463b95a0003880cd9e25 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 20:31:53 -0700 Subject: [PATCH 16/35] Enable null filling. --- python/cudf/cudf/core/index.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5954ef89ea9..11e26b6d65c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -794,7 +794,7 @@ def _apply_op(self, fn, other=None): else: return as_index(op()) - def _binaryop(self, fn, other, reflect=False): + def _binaryop(self, fn, other, fill_value=None, reflect=False): if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented @@ -818,12 +818,12 @@ def _binaryop(self, fn, other, reflect=False): truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] lhs = lhs.astype(truediv_type) - # if fill_value is not None: - # if lhs.nullable: - # lhs = lhs.fillna(fill_value) - # if not is_scalar(rhs) and rhs.nullable: - # rhs = rhs.fillna(fill_value) - # + if fill_value is not None: + if lhs.nullable: + lhs = lhs.fillna(fill_value) + if not is_scalar(rhs) and rhs.nullable: + rhs = rhs.fillna(fill_value) + outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) # Get the appropriate name for output operations involving two objects From fa11a6b2b701851facd1e501c7fd6f2e7c62b005 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 09:38:11 -0700 Subject: [PATCH 17/35] Move most binary operations to SingleColumnFrame. --- python/cudf/cudf/core/frame.py | 62 +++++++++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 61 +------------------------------- python/cudf/cudf/core/series.py | 62 --------------------------------- 3 files changed, 63 insertions(+), 122 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5bb741ccb0e..71532d0fbd4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3515,6 +3515,68 @@ def to_arrow(self): """ return self._column.to_arrow() + def __add__(self, other): + return self._binaryop(other, "add") + + def __radd__(self, other): + return self._binaryop(other, "add", reflect=True) + + def __sub__(self, other): + return self._binaryop(other, "sub") + + def __rsub__(self, other): + return self._binaryop(other, "sub", reflect=True) + + def __mul__(self, other): + return self._binaryop(other, "mul") + + def __rmul__(self, other): + return self._binaryop(other, "mul", reflect=True) + + def __mod__(self, other): + return self._binaryop(other, "mod") + + def __rmod__(self, other): + return self._binaryop(other, "mod", reflect=True) + + def __pow__(self, other): + return self._binaryop(other, "pow") + + def __rpow__(self, other): + return self._binaryop(other, "pow", reflect=True) + + def __floordiv__(self, other): + return self._binaryop(other, "floordiv") + + def __rfloordiv__(self, other): + return self._binaryop(other, "floordiv", reflect=True) + + def __truediv__(self, other): + return self._binaryop(other, "truediv") + + def __rtruediv__(self, other): + return self._binaryop(other, "truediv", reflect=True) + + __div__ = __truediv__ + + def __eq__(self, other): + return self._binaryop(other, "eq") + + def __ne__(self, other): + return self._binaryop(other, "ne") + + def __lt__(self, other): + return self._binaryop(other, "lt") + + def __le__(self, other): + return self._binaryop(other, "le") + + def __gt__(self, other): + return self._binaryop(other, "gt") + + def __ge__(self, other): + return self._binaryop(other, "ge") + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 11e26b6d65c..fb214015796 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -794,7 +794,7 @@ def _apply_op(self, fn, other=None): else: return as_index(op()) - def _binaryop(self, fn, other, fill_value=None, reflect=False): + def _binaryop(self, other, fn, fill_value=None, reflect=False): if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented @@ -949,47 +949,6 @@ def unique(self): """ return as_index(self._values.unique(), name=self.name) - def __add__(self, other): - return self._binaryop("add", other) - - def __radd__(self, other): - return self._binaryop("add", other, reflect=True) - - def __sub__(self, other): - return self._binaryop("sub", other) - - def __rsub__(self, other): - return self._binaryop("sub", other, reflect=True) - - def __mul__(self, other): - return self._binaryop("mul", other) - - def __rmul__(self, other): - return self._binaryop("mul", other, reflect=True) - - def __mod__(self, other): - return self._binaryop("mod", other) - - def __rmod__(self, other): - return self._binaryop("mod", other, reflect=True) - - def __pow__(self, other): - return self._binaryop("pow", other) - - def __floordiv__(self, other): - return self._binaryop("floordiv", other) - - def __rfloordiv__(self, other): - return self._binaryop("floordiv", other, reflect=True) - - def __truediv__(self, other): - return self._binaryop("truediv", other) - - def __rtruediv__(self, other): - return self._binaryop("truediv", other, reflect=True) - - __div__ = __truediv__ - def __and__(self, other): return self._apply_op("__and__", other) @@ -999,24 +958,6 @@ def __or__(self, other): def __xor__(self, other): return self._apply_op("__xor__", other) - def __eq__(self, other): - return self._binaryop("eq", other) - - def __ne__(self, other): - return self._binaryop("ne", other) - - def __lt__(self, other): - return self._binaryop("lt", other) - - def __le__(self, other): - return self._binaryop("le", other) - - def __gt__(self, other): - return self._binaryop("gt", other) - - def __ge__(self, other): - return self._binaryop("ge", other) - def join( self, other, how="left", level=None, return_indexers=False, sort=False ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e53da9de59c..a4034aae400 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1426,9 +1426,6 @@ def add(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "add", fill_value) - def __add__(self, other): - return self._binaryop(other, "add") - def radd(self, other, fill_value=None, axis=0): """Addition of series and other, element-wise (binary operator radd). @@ -1476,9 +1473,6 @@ def radd(self, other, fill_value=None, axis=0): other, "add", fill_value=fill_value, reflect=True ) - def __radd__(self, other): - return self._binaryop(other, "add", reflect=True) - def subtract(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise (binary operator sub). @@ -1527,9 +1521,6 @@ def subtract(self, other, fill_value=None, axis=0): sub = subtract - def __sub__(self, other): - return self._binaryop(other, "sub") - def rsub(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise (binary operator rsub). @@ -1575,9 +1566,6 @@ def rsub(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "sub", fill_value, reflect=True) - def __rsub__(self, other): - return self._binaryop(other, "sub", reflect=True) - def multiply(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise (binary operator mul). @@ -1625,9 +1613,6 @@ def multiply(self, other, fill_value=None, axis=0): mul = multiply - def __mul__(self, other): - return self._binaryop(other, "mul") - def rmul(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise (binary operator rmul). @@ -1676,9 +1661,6 @@ def rmul(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mul", fill_value, True) - def __rmul__(self, other): - return self._binaryop(other, "mul", reflect=True) - def mod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise (binary operator mod). @@ -1714,9 +1696,6 @@ def mod(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mod", fill_value) - def __mod__(self, other): - return self._binaryop(other, "mod") - def rmod(self, other, fill_value=None, axis=0): """Modulo of series and other, element-wise (binary operator rmod). @@ -1765,9 +1744,6 @@ def rmod(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mod", fill_value, True) - def __rmod__(self, other): - return self._binaryop(other, "mod", reflect=True) - def pow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise (binary operator pow). @@ -1813,9 +1789,6 @@ def pow(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "pow", fill_value) - def __pow__(self, other): - return self._binaryop(other, "pow") - def rpow(self, other, fill_value=None, axis=0): """Exponential power of series and other, element-wise (binary operator rpow). @@ -1861,9 +1834,6 @@ def rpow(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "pow", fill_value, True) - def __rpow__(self, other): - return self._binaryop(other, "pow", reflect=True) - def floordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise (binary operator floordiv). @@ -1909,9 +1879,6 @@ def floordiv(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "floordiv", fill_value) - def __floordiv__(self, other): - return self._binaryop(other, "floordiv") - def rfloordiv(self, other, fill_value=None, axis=0): """Integer division of series and other, element-wise (binary operator rfloordiv). @@ -1965,9 +1932,6 @@ def rfloordiv(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "floordiv", fill_value, True) - def __rfloordiv__(self, other): - return self._binaryop(other, "floordiv", reflect=True) - def truediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise (binary operator truediv). @@ -2013,9 +1977,6 @@ def truediv(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "truediv", fill_value) - def __truediv__(self, other): - return self._binaryop(other, "truediv") - def rtruediv(self, other, fill_value=None, axis=0): """Floating division of series and other, element-wise (binary operator rtruediv). @@ -2061,11 +2022,6 @@ def rtruediv(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "truediv", fill_value, True) - def __rtruediv__(self, other): - return self._binaryop(other, "truediv", reflect=True) - - __div__ = __truediv__ - def _bitwise_binop(self, other, op): if ( np.issubdtype(self.dtype, np.bool_) @@ -2186,9 +2142,6 @@ def eq(self, other, fill_value=None, axis=0): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) - def __eq__(self, other): - return self._binaryop(other, "eq") - def ne(self, other, fill_value=None, axis=0): """Not equal to of series and other, element-wise (binary operator ne). @@ -2241,9 +2194,6 @@ def ne(self, other, fill_value=None, axis=0): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) - def __ne__(self, other): - return self._binaryop(other, "ne") - def lt(self, other, fill_value=None, axis=0): """Less than of series and other, element-wise (binary operator lt). @@ -2296,9 +2246,6 @@ def lt(self, other, fill_value=None, axis=0): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) - def __lt__(self, other): - return self._binaryop(other, "lt") - def le(self, other, fill_value=None, axis=0): """Less than or equal to of series and other, element-wise (binary operator le). @@ -2351,9 +2298,6 @@ def le(self, other, fill_value=None, axis=0): other=other, fn="le", fill_value=fill_value, can_reindex=True ) - def __le__(self, other): - return self._binaryop(other, "le") - def gt(self, other, fill_value=None, axis=0): """Greater than of series and other, element-wise (binary operator gt). @@ -2406,9 +2350,6 @@ def gt(self, other, fill_value=None, axis=0): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) - def __gt__(self, other): - return self._binaryop(other, "gt") - def ge(self, other, fill_value=None, axis=0): """Greater than or equal to of series and other, element-wise (binary operator ge). @@ -2461,9 +2402,6 @@ def ge(self, other, fill_value=None, axis=0): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) - def __ge__(self, other): - return self._binaryop(other, "ge") - def __invert__(self): """Bitwise invert (~) for each element. Logical NOT if dtype is bool From ffcf6d8c2e996c67c02a50a9969407b31ebb804b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 11:06:44 -0700 Subject: [PATCH 18/35] Move binaryop implementation to SingleColumnFrame. --- python/cudf/cudf/core/frame.py | 77 +++++++++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 46 +++----------------- python/cudf/cudf/core/series.py | 54 +++++------------------ 3 files changed, 94 insertions(+), 83 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 71532d0fbd4..ca1e0f812d6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3515,6 +3515,83 @@ def to_arrow(self): """ return self._column.to_arrow() + def _binaryop( + self, + other, + fn, + fill_value=None, + reflect=False, + lhs=None, + *args, + **kwargs, + ): + """Perform a binary operation between two frames. + + Parameters + ---------- + other : SingleColumnFrame + The second operand. + fn : str + The operation + fill_value : Any, default None + The value to replace null values with. If ``None``, nulls are not + filled before the operation. + reflect : bool, default False + If ``True`` the operation is reflected (i.e whether to swap the + left and right operands). + lhs : SingleColumnFrame, default None + The left hand operand. If ``None``, self is used. This parameter + allows child classes to preprocess the inputs if necessary. + + Returns + ------- + SingleColumnFrame + A new instance containing the result of the operation. + """ + if lhs is None: + lhs = self + + rhs = self._normalize_binop_value(other) + + truediv_int_dtype_corrections = { + "int8": "float32", + "int16": "float32", + "int32": "float32", + "int64": "float64", + "uint8": "float32", + "uint16": "float32", + "uint32": "float64", + "uint64": "float64", + "bool": "float32", + "int": "float", + } + + if fn == "truediv": + if str(lhs.dtype) in truediv_int_dtype_corrections: + truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] + lhs = lhs.astype(truediv_type) + + if fill_value is not None: + if lhs.nullable: + lhs = lhs.fillna(fill_value) + if not is_scalar(rhs) and rhs.nullable: + rhs = rhs.fillna(fill_value) + + outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) + + # Get the appropriate name for output operations involving two objects + # that are Series-like objects. The output shares the lhs's name unless + # the rhs is a _differently_ named Series-like object. + if ( + isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) + and self.name != other.name + ): + result_name = None + else: + result_name = self.name + + return lhs._copy_construct(data=outcol, name=result_name) + def __add__(self, other): return self._binaryop(other, "add") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fb214015796..7443d1b9cfb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -795,49 +795,15 @@ def _apply_op(self, fn, other=None): return as_index(op()) def _binaryop(self, other, fn, fill_value=None, reflect=False): + # TODO: Rather than including an allowlist of acceptable types, we + # should instead return NotImplemented for __all__ other types. That + # will allow other types to support binops with cudf objects if they so + # choose, and just as importantly will allow better error messages if + # they don't support it. if isinstance(other, (cudf.DataFrame, cudf.Series)): return NotImplemented - lhs, rhs = self, self._normalize_binop_value(other) - - truediv_int_dtype_corrections = { - "int8": "float32", - "int16": "float32", - "int32": "float32", - "int64": "float64", - "uint8": "float32", - "uint16": "float32", - "uint32": "float64", - "uint64": "float64", - "bool": "float32", - "int": "float", - } - - if fn == "truediv": - if str(lhs.dtype) in truediv_int_dtype_corrections: - truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] - lhs = lhs.astype(truediv_type) - - if fill_value is not None: - if lhs.nullable: - lhs = lhs.fillna(fill_value) - if not is_scalar(rhs) and rhs.nullable: - rhs = rhs.fillna(fill_value) - - outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) - - # Get the appropriate name for output operations involving two objects - # that are Series-like objects. The output shares the lhs's name unless - # the rhs is a _differently_ named Series-like object. - if ( - isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) - and self.name != other.name - ): - result_name = None - else: - result_name = self.name - - return lhs._copy_construct(data=outcol, name=result_name) + return super()._binaryop(other, fn, fill_value, reflect) def _normalize_binop_value(self, other): """Returns a *column* (not a Series) or scalar for performing diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a4034aae400..d689feb2d82 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -13,7 +13,6 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate from pandas._config import get_option from pandas.api.types import is_dict_like @@ -1321,57 +1320,26 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - @annotate("BINARY_OP", color="orange", domain="cudf_python") def _binaryop( self, other, fn, fill_value=None, reflect=False, can_reindex=False ): - """ - Internal util to call a binary operator *fn* on operands *self* - and *other*. Return the output Series. The output dtype is - determined by the input operands. - - If ``reflect`` is ``True``, swap the order of the operands. - """ if isinstance(other, cudf.DataFrame): return NotImplemented if isinstance(other, Series): - if not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS: - if not self.index.equals(other.index): - raise ValueError( - "Can only compare identically-labeled " - "Series objects" - ) - lhs, rhs = _align_indices([self, other], allow_non_unique=True) - else: - lhs, rhs = self, other - rhs = self._normalize_binop_value(rhs) - - if fn == "truediv": - if str(lhs.dtype) in truediv_int_dtype_corrections: - truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] - lhs = lhs.astype(truediv_type) - - if fill_value is not None: - if lhs.nullable: - lhs = lhs.fillna(fill_value) - if not is_scalar(rhs) and rhs.nullable: - rhs = rhs.fillna(fill_value) - - outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) - - # Get the appropriate name for output operations involving two objects - # that are Series-like objects. The output shares the lhs's name unless - # the rhs is a _differently_ named Series-like object. - if ( - isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) - and self.name != other.name - ): - result_name = None + if ( + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and not self.index.equals(other.index) + ): + raise ValueError( + "Can only compare identically-labeled " "Series objects" + ) + lhs, other = _align_indices([self, other], allow_non_unique=True) else: - result_name = self.name + lhs = self - return lhs._copy_construct(data=outcol, name=result_name) + return super()._binaryop(other, fn, fill_value, reflect, lhs) def add(self, other, fill_value=None, axis=0): """ From 829df17a804acee1d68ab9cf5ad1fad86df9500e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 11:19:20 -0700 Subject: [PATCH 19/35] Reintroduce old null handling temporarily. --- python/cudf/cudf/core/frame.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ca1e0f812d6..faaad9e4160 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -18,7 +18,12 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.column import ( + as_column, + build_categorical_column, + build_column, + column_empty, +) from cudf.core.join import merge from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3572,10 +3577,29 @@ def _binaryop( lhs = lhs.astype(truediv_type) if fill_value is not None: - if lhs.nullable: + # if lhs.nullable: + # lhs = lhs.fillna(fill_value) + # if not is_scalar(rhs) and rhs.nullable: + # rhs = rhs.fillna(fill_value) + + if is_scalar(rhs): lhs = lhs.fillna(fill_value) - if not is_scalar(rhs) and rhs.nullable: - rhs = rhs.fillna(fill_value) + else: + if lhs.nullable and rhs.nullable: + lmask = self.__class__(data=lhs.nullmask) + rmask = self.__class__(data=rhs.nullmask) + mask = (lmask | rmask).data + lhs = lhs.fillna(fill_value) + rhs = rhs.fillna(fill_value) + result = lhs._binaryop(rhs, fn=fn, reflect=reflect) + data = build_column( + data=result.data, dtype=result.dtype, mask=mask + ) + return lhs._copy_construct(data=data) + elif lhs.nullable: + lhs = lhs.fillna(fill_value) + elif rhs.nullable: + rhs = rhs.fillna(fill_value) outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) From f2da7e1142fe00a2bf7d7c227b913b63d35abf38 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 14:00:39 -0700 Subject: [PATCH 20/35] Override copy_construct for index to support returning a different index type. --- python/cudf/cudf/core/index.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7443d1b9cfb..414a812db72 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -805,6 +805,17 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False): return super()._binaryop(other, fn, fill_value, reflect) + def _copy_construct(self, **kwargs): + # Need to override the parent behavior because pandas allows operations + # on unsigned types to return signed values, forcing us to choose the + # right index type here. + data = kwargs["data"] + if data.dtype != self.dtype: + cls = _dtype_to_index[getattr(np, data.dtype.name)] + else: + cls = self.__class__ + return cls(**{**self._copy_construct_defaults, **kwargs}) + def _normalize_binop_value(self, other): """Returns a *column* (not a Series) or scalar for performing binary operations with self._column. From a309ce58984120586e8e2e7f74cc7c3f7a5a18e9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 14:51:06 -0700 Subject: [PATCH 21/35] Fix behavior for bool and categorical dtype indices. --- python/cudf/cudf/core/index.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 414a812db72..8e65773daec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -809,11 +809,15 @@ def _copy_construct(self, **kwargs): # Need to override the parent behavior because pandas allows operations # on unsigned types to return signed values, forcing us to choose the # right index type here. - data = kwargs["data"] - if data.dtype != self.dtype: - cls = _dtype_to_index[getattr(np, data.dtype.name)] - else: - cls = self.__class__ + data = kwargs.get("data") + cls = self.__class__ + + if data is not None and self.dtype != data.dtype: + try: + cls = _dtype_to_index[getattr(np, data.dtype.name)] + except KeyError: + pass + return cls(**{**self._copy_construct_defaults, **kwargs}) def _normalize_binop_value(self, other): @@ -2939,6 +2943,7 @@ def as_index(arbitrary, **kwargs) -> Index: _dtype_to_index = { + bool: Index, np.int8: Int8Index, np.int16: Int16Index, np.int32: Int32Index, From 522ee42e4257b852e56a50c19b24df628dd52e85 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 15:22:20 -0700 Subject: [PATCH 22/35] Move copy construct. --- python/cudf/cudf/core/frame.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index faaad9e4160..612028dd3d3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -78,16 +78,6 @@ def _mimic_inplace( else: return result - @property - def _copy_construct_defaults(self): - """A default dictionary of kwargs to be used for copy construction.""" - raise NotImplementedError - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - @property def size(self): """ @@ -3520,6 +3510,16 @@ def to_arrow(self): """ return self._column.to_arrow() + @property + def _copy_construct_defaults(self): + """A default dictionary of kwargs to be used for copy construction.""" + raise NotImplementedError + + def _copy_construct(self, **kwargs): + """Shallow copy this object by replacing certain ctor args. + """ + return self.__class__(**{**self._copy_construct_defaults, **kwargs}) + def _binaryop( self, other, From 64f92a6f313339197d213f8b0a4c9ed888a9a655 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 17:25:07 -0700 Subject: [PATCH 23/35] Simplify the behavior for null filling again. --- python/cudf/cudf/core/frame.py | 39 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 612028dd3d3..9b583238286 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -18,12 +18,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.core.column import ( - as_column, - build_categorical_column, - build_column, - column_empty, -) +from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.core.join import merge from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3572,30 +3567,24 @@ def _binaryop( } if fn == "truediv": - if str(lhs.dtype) in truediv_int_dtype_corrections: - truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] + truediv_type = truediv_int_dtype_corrections.get(str(lhs.dtype)) + if truediv_type is not None: lhs = lhs.astype(truediv_type) + output_mask = None if fill_value is not None: - # if lhs.nullable: - # lhs = lhs.fillna(fill_value) - # if not is_scalar(rhs) and rhs.nullable: - # rhs = rhs.fillna(fill_value) - if is_scalar(rhs): - lhs = lhs.fillna(fill_value) + if lhs.nullable: + lhs = lhs.fillna(fill_value) else: + # If both columns are nullable, pandas semantics dictate that + # nulls that are present in both lhs and rhs are not filled. if lhs.nullable and rhs.nullable: - lmask = self.__class__(data=lhs.nullmask) - rmask = self.__class__(data=rhs.nullmask) - mask = (lmask | rmask).data + # Note: using bitwise and rather than logical, but these + # are boolean outputs so should be fine. + output_mask = lhs.isnull() & rhs.isnull() lhs = lhs.fillna(fill_value) rhs = rhs.fillna(fill_value) - result = lhs._binaryop(rhs, fn=fn, reflect=reflect) - data = build_column( - data=result.data, dtype=result.dtype, mask=mask - ) - return lhs._copy_construct(data=data) elif lhs.nullable: lhs = lhs.fillna(fill_value) elif rhs.nullable: @@ -3614,7 +3603,11 @@ def _binaryop( else: result_name = self.name - return lhs._copy_construct(data=outcol, name=result_name) + output = lhs._copy_construct(data=outcol, name=result_name) + + if output_mask is not None: + output._column[output_mask._column] = None + return output def __add__(self, other): return self._binaryop(other, "add") From 971ed93fe491b0c48ec490d09aa65a702831b149 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 4 May 2021 17:26:35 -0700 Subject: [PATCH 24/35] Remove now redundant tests. --- .../cudf/tests/single_column_frame_tests.py | 78 ------------------- python/cudf/cudf/tests/test_index.py | 8 -- python/cudf/cudf/tests/test_series.py | 8 -- 3 files changed, 94 deletions(-) delete mode 100644 python/cudf/cudf/tests/single_column_frame_tests.py diff --git a/python/cudf/cudf/tests/single_column_frame_tests.py b/python/cudf/cudf/tests/single_column_frame_tests.py deleted file mode 100644 index a1e73ca3381..00000000000 --- a/python/cudf/cudf/tests/single_column_frame_tests.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -import cupy as cp -import numpy as np -import pytest -from hypothesis import given, settings -from hypothesis.extra.numpy import arrays -from hypothesis.strategies import floats - -# from cudf.tests.utils import ( -# assert_eq, -# ) - - -class SingleColumnFrameTests: - """Common set of tests for use with subclasses of SingleColumnFrame. - - Subclasses must define the _cls class variable, which is used to construct - suitable objects in the tests. - """ - - _cls = None - - @pytest.mark.parametrize( - "binop", - [ - # Arithmetic operations. - "__add__", - "__sub__", - "__mul__", - "__floordiv__", - "__truediv__", - # Logical operations. - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - ], - ) - @given( - x=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), - y=arrays(np.float64, (100,), elements=floats(-10, 10, width=64)), - ) - @settings(deadline=None) - def test_binops(self, binop, x, y): - """Test binary operations.""" - x = self._cls(x) - y = self._cls(y) - got = (getattr(x, binop)(y)).values - expected = getattr(x.values, binop)(y.values) - assert cp.all(got == expected) or ( - cp.all(cp.isnan(got)) and cp.all(cp.isnan(expected)) - ) - - -# def test_binop_add_nullable_fill(): -# """Test a binary op with a fill value on two nullable series.""" -# x = cudf.Series([1, 2, None]) -# y = cudf.Series([1, None, 3]) -# assert_eq(x.add(y), cudf.Series([2, None, None])) -# assert_eq(x.add(y, fill_value=0), cudf.Series([2, 2, 3])) -# -# -# def test_binop_indexed_series_withindex(): -# """Test a binary op (addition) with a non-integral index and an Index.""" -# psr = pd.Series([1, 2, 3], index=['a', 'b', 'c']) -# pidx = pd.Index([1, None, 3]) -# -# gsr = cudf.Series(psr) -# gidx = cudf.Index(pidx) -# assert_eq(psr + pidx, gsr + gidx) -# assert_eq(psr + pd.Series(pidx), gsr + cudf.Series(gidx)) -# assert_eq(pidx + psr, gidx + gsr) -# assert_eq(pd.Series(pidx) + psr, cudf.Series(gidx) + gsr) -# -# # TODO: Also test with arbitrary sequence objects (like lists). diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 39ae1741508..158dffc3884 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -33,8 +33,6 @@ ) from cudf.utils.utils import search_range -from .single_column_frame_tests import SingleColumnFrameTests - def test_df_set_index_from_series(): df = cudf.DataFrame() @@ -2060,9 +2058,3 @@ def test_index_set_names_error(idx, level, names): lfunc_args_and_kwargs=([], {"names": names, "level": level}), rfunc_args_and_kwargs=([], {"names": names, "level": level}), ) - - -class TestIndex(SingleColumnFrameTests): - """Tests that share logic with other single column frames.""" - - _cls = cudf.Index diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 290669f2d83..0cc0ad57745 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -17,8 +17,6 @@ assert_exceptions_equal, ) -from .single_column_frame_tests import SingleColumnFrameTests - def _series_na_data(): return [ @@ -1198,9 +1196,3 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) else: assert_eq(expect, got, check_dtype=False) - - -class TestSeries(SingleColumnFrameTests): - """Tests that share logic with other single column frames.""" - - _cls = cudf.Series From ad9b1eb20b9c8beba4e0abe6e0ea7bfecc2bd739 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 08:59:19 -0700 Subject: [PATCH 25/35] Handle various index dtypes correctly. --- python/cudf/cudf/core/index.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8e65773daec..4f6e52d6063 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -813,10 +813,21 @@ def _copy_construct(self, **kwargs): cls = self.__class__ if data is not None and self.dtype != data.dtype: - try: - cls = _dtype_to_index[getattr(np, data.dtype.name)] - except KeyError: - pass + # TODO: This logic is largely copied from `as_index`. The two + # should be unified via a centralized type dispatching scheme. + if isinstance(data, NumericalColumn): + try: + cls = _dtype_to_index[data.dtype.type] + except KeyError: + cls = GenericIndex + elif isinstance(data, StringColumn): + cls = StringIndex + elif isinstance(data, DatetimeColumn): + cls = DatetimeIndex + elif isinstance(data, TimeDeltaColumn): + cls = TimedeltaIndex + elif isinstance(data, CategoricalColumn): + cls = CategoricalIndex return cls(**{**self._copy_construct_defaults, **kwargs}) From 2274af639489b46d70b86047961a8af5cef62367 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 09:25:45 -0700 Subject: [PATCH 26/35] Handle correct types for RangeIndex binops. --- python/cudf/cudf/core/index.py | 39 +++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4f6e52d6063..caee5a64ed4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -812,22 +812,37 @@ def _copy_construct(self, **kwargs): data = kwargs.get("data") cls = self.__class__ - if data is not None and self.dtype != data.dtype: - # TODO: This logic is largely copied from `as_index`. The two - # should be unified via a centralized type dispatching scheme. - if isinstance(data, NumericalColumn): + if data is not None: + if self.dtype != data.dtype: + # TODO: This logic is largely copied from `as_index`. The two + # should be unified via a centralized type dispatching scheme. + if isinstance(data, NumericalColumn): + try: + cls = _dtype_to_index[data.dtype.type] + except KeyError: + cls = GenericIndex + elif isinstance(data, StringColumn): + cls = StringIndex + elif isinstance(data, DatetimeColumn): + cls = DatetimeIndex + elif isinstance(data, TimeDeltaColumn): + cls = TimedeltaIndex + elif isinstance(data, CategoricalColumn): + cls = CategoricalIndex + elif cls is RangeIndex: + # RangeIndex must convert to other numerical types for ops + + # TODO: The one exception is that multiplication of a + # RangeIndex in pandas results in another RangeIndex. + # Propagating that information through cudf with the current + # APIs is likely more hacky than it's worth, and it's more a + # performance loss than a functional loss to have an Int64Index + # instead of a RangeIndex. As Index logic is cleaned up it may + # become more feasible to ensure the appropriate type. try: cls = _dtype_to_index[data.dtype.type] except KeyError: cls = GenericIndex - elif isinstance(data, StringColumn): - cls = StringIndex - elif isinstance(data, DatetimeColumn): - cls = DatetimeIndex - elif isinstance(data, TimeDeltaColumn): - cls = TimedeltaIndex - elif isinstance(data, CategoricalColumn): - cls = CategoricalIndex return cls(**{**self._copy_construct_defaults, **kwargs}) From a14b5ebfdd77827e29b45f031ef622704ad30f42 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 10:39:14 -0700 Subject: [PATCH 27/35] Fix kwargs for GenericIndex. --- python/cudf/cudf/core/index.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index caee5a64ed4..f259359f282 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -821,6 +821,10 @@ def _copy_construct(self, **kwargs): cls = _dtype_to_index[data.dtype.type] except KeyError: cls = GenericIndex + # TODO: GenericIndex has a different API for __new__ + # than other Index types. Refactoring Index types will + # be necessary to clean this up. + kwargs["values"] = kwargs.pop("data") elif isinstance(data, StringColumn): cls = StringIndex elif isinstance(data, DatetimeColumn): From 6d07e1588e59516b3c808679d06e2da6791f6a3f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 10:46:43 -0700 Subject: [PATCH 28/35] Centralize _normalize_binop_value. --- python/cudf/cudf/core/frame.py | 20 +++++++++++++++++++- python/cudf/cudf/core/index.py | 15 --------------- python/cudf/cudf/core/series.py | 27 --------------------------- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9b583238286..d1d525d2638 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -18,7 +18,12 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.column import ( + ColumnBase, + as_column, + build_categorical_column, + column_empty, +) from cudf.core.join import merge from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3609,6 +3614,19 @@ def _binaryop( output._column[output_mask._column] = None return output + def _normalize_binop_value(self, other): + """Returns a *column* (not a Series) or scalar for performing + binary operations with self._column. + """ + if isinstance(other, ColumnBase): + return other + if isinstance(other, self.__class__): + return other._column + if other is cudf.NA: + return cudf.Scalar(other, dtype=self.dtype) + else: + return self._column.normalize_binop_value(other) + def __add__(self, other): return self._binaryop(other, "add") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f259359f282..b8e38d92a80 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -850,21 +850,6 @@ def _copy_construct(self, **kwargs): return cls(**{**self._copy_construct_defaults, **kwargs}) - def _normalize_binop_value(self, other): - """Returns a *column* (not a Series) or scalar for performing - binary operations with self._column. - """ - # TODO: Fix some of this, it may not make sense to have something - # general for multiple frame types. - if isinstance(other, ColumnBase): - return other - if isinstance(other, (cudf.Series, cudf.Index)): - return other._column - elif other is cudf.NA: - return cudf.Scalar(other, dtype=self.dtype) - else: - return self._column.normalize_binop_value(other) - def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d689feb2d82..3448a73b06c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2045,19 +2045,6 @@ def logical_or(self, other): def logical_not(self): return self._unaryop("not") - def _normalize_binop_value(self, other): - """Returns a *column* (not a Series) or scalar for performing - binary operations with self._column. - """ - if isinstance(other, ColumnBase): - return other - if isinstance(other, (Series, Index)): - return other._column - elif other is cudf.NA: - return cudf.Scalar(other, dtype=self.dtype) - else: - return self._column.normalize_binop_value(other) - def eq(self, other, fill_value=None, axis=0): """Equal to of series and other, element-wise (binary operator eq). @@ -6215,20 +6202,6 @@ def explode(self, ignore_index=False): return super()._explode(self._column_names[0], ignore_index) -truediv_int_dtype_corrections = { - "int8": "float32", - "int16": "float32", - "int32": "float32", - "int64": "float64", - "uint8": "float32", - "uint16": "float32", - "uint32": "float64", - "uint64": "float64", - "bool": "float32", - "int": "float", -} - - class DatetimeProperties(object): """ Accessor object for datetimelike properties of the Series values. From 16aa1f898bb12508104d45563dd38054ffad7bfc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 11:09:53 -0700 Subject: [PATCH 29/35] Move bitwise binops to frame so indexes can use directly. --- python/cudf/cudf/core/frame.py | 33 ++++++++++++++++++++-- python/cudf/cudf/core/index.py | 9 ------ python/cudf/cudf/core/series.py | 49 ++------------------------------- 3 files changed, 34 insertions(+), 57 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d1d525d2638..c1c86b9cd96 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3530,7 +3530,7 @@ def _binaryop( *args, **kwargs, ): - """Perform a binary operation between two frames. + """Perform a binary operation between two single column frames. Parameters ---------- @@ -3620,13 +3620,33 @@ def _normalize_binop_value(self, other): """ if isinstance(other, ColumnBase): return other - if isinstance(other, self.__class__): + if isinstance(other, SingleColumnFrame): return other._column if other is cudf.NA: return cudf.Scalar(other, dtype=self.dtype) else: return self._column.normalize_binop_value(other) + def _bitwise_binop(self, other, op): + """Type-coercing wrapper around _binaryop for bitwise operations.""" + self_is_bool = np.issubdtype(self.dtype, np.bool_) + other_is_bool = np.issubdtype(other.dtype, np.bool_) + + if (self_is_bool or np.issubdtype(self.dtype, np.integer)) and ( + other_is_bool or np.issubdtype(other.dtype, np.integer) + ): + # TODO: This doesn't work on Series (op) DataFrame + # because dataframe doesn't have dtype + ser = self._binaryop(other, op) + if self_is_bool or other_is_bool: + ser = ser.astype(np.bool_) + return ser + else: + raise TypeError( + f"Operation 'bitwise {op}' not supported between " + f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" + ) + def __add__(self, other): return self._binaryop(other, "add") @@ -3689,6 +3709,15 @@ def __gt__(self, other): def __ge__(self, other): return self._binaryop(other, "ge") + def __and__(self, other): + return self._bitwise_binop(other, "and") + + def __or__(self, other): + return self._bitwise_binop(other, "or") + + def __xor__(self, other): + return self._bitwise_binop(other, "xor") + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b8e38d92a80..38ca1e35fee 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -945,15 +945,6 @@ def unique(self): """ return as_index(self._values.unique(), name=self.name) - def __and__(self, other): - return self._apply_op("__and__", other) - - def __or__(self, other): - return self._apply_op("__or__", other) - - def __xor__(self, other): - return self._apply_op("__xor__", other) - def join( self, other, how="left", level=None, return_indexers=False, sort=False ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3448a73b06c..d8bb068d21c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1990,57 +1990,14 @@ def rtruediv(self, other, fill_value=None, axis=0): raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "truediv", fill_value, True) - def _bitwise_binop(self, other, op): - if ( - np.issubdtype(self.dtype, np.bool_) - or np.issubdtype(self.dtype, np.integer) - ) and ( - np.issubdtype(other.dtype, np.bool_) - or np.issubdtype(other.dtype, np.integer) - ): - # TODO: This doesn't work on Series (op) DataFrame - # because dataframe doesn't have dtype - ser = self._binaryop(other, op) - if np.issubdtype(self.dtype, np.bool_) or np.issubdtype( - other.dtype, np.bool_ - ): - ser = ser.astype(np.bool_) - return ser - else: - raise TypeError( - f"Operation 'bitwise {op}' not supported between " - f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" - ) - - def __and__(self, other): - """Performs vectorized bitwise and (&) on corresponding elements of two - series. - """ - return self._bitwise_binop(other, "and") - - def __or__(self, other): - """Performs vectorized bitwise or (|) on corresponding elements of two - series. - """ - return self._bitwise_binop(other, "or") - - def __xor__(self, other): - """Performs vectorized bitwise xor (^) on corresponding elements of two - series. - """ - return self._bitwise_binop(other, "xor") - def logical_and(self, other): - ser = self._binaryop(other, "l_and") - return ser.astype(np.bool_) + return self._binaryop(other, "l_and").astype(np.bool_) def remainder(self, other): - ser = self._binaryop(other, "mod") - return ser + return self._binaryop(other, "mod") def logical_or(self, other): - ser = self._binaryop(other, "l_or") - return ser.astype(np.bool_) + return self._binaryop(other, "l_or").astype(np.bool_) def logical_not(self): return self._unaryop("not") From 12827ad1a65a6b058081c4ce4a3d7092bd58f7fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 11:20:04 -0700 Subject: [PATCH 30/35] Move many more operators to SingleColumnFrame so they work on Index as well. --- python/cudf/cudf/core/frame.py | 24 ++++++++++++++++++------ python/cudf/cudf/core/series.py | 16 +--------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c1c86b9cd96..0c9f9f760ef 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3647,6 +3647,7 @@ def _bitwise_binop(self, other, op): f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" ) + # Binary arithmetic operations. def __add__(self, other): return self._binaryop(other, "add") @@ -3691,6 +3692,16 @@ def __rtruediv__(self, other): __div__ = __truediv__ + def __and__(self, other): + return self._bitwise_binop(other, "and") + + def __or__(self, other): + return self._bitwise_binop(other, "or") + + def __xor__(self, other): + return self._bitwise_binop(other, "xor") + + # Binary rich comparison operations. def __eq__(self, other): return self._binaryop(other, "eq") @@ -3709,14 +3720,15 @@ def __gt__(self, other): def __ge__(self, other): return self._binaryop(other, "ge") - def __and__(self, other): - return self._bitwise_binop(other, "and") + # Unary logical operators + def __neg__(self): + return -1 * self - def __or__(self, other): - return self._bitwise_binop(other, "or") + def __pos__(self): + return self.copy(deep=True) - def __xor__(self, other): - return self._bitwise_binop(other, "xor") + def __abs__(self): + return self._unaryop("abs") def _get_replacement_values_for_columns( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d8bb068d21c..e0b84eed0d4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2315,11 +2315,7 @@ def ge(self, other, fill_value=None, axis=0): ) def __invert__(self): - """Bitwise invert (~) for each element. - Logical NOT if dtype is bool - - Returns a new Series. - """ + """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" if np.issubdtype(self.dtype, np.integer): return self._unaryop("invert") elif np.issubdtype(self.dtype, np.bool_): @@ -2329,13 +2325,6 @@ def __invert__(self): f"Operation `~` not supported on {self.dtype.type.__name__}" ) - def __neg__(self): - """Negated value (-) for each element - - Returns a new Series. - """ - return self.__mul__(-1) - @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): @@ -5356,9 +5345,6 @@ def abs(self): """ return self._unaryop("abs") - def __abs__(self): - return self.abs() - # Rounding def ceil(self): """ From 974637608ad74a2786c9a938c4859d2f3d1f8eb8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 11:26:04 -0700 Subject: [PATCH 31/35] Remove Index._apply_op entirely. --- python/cudf/cudf/core/index.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 38ca1e35fee..bcbd4162a07 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -785,15 +785,6 @@ def difference(self, other, sort=None): return difference - def _apply_op(self, fn, other=None): - - idx_series = cudf.Series(self, name=self.name) - op = getattr(idx_series, fn) - if other is not None: - return as_index(op(other)) - else: - return as_index(op()) - def _binaryop(self, other, fn, fill_value=None, reflect=False): # TODO: Rather than including an allowlist of acceptable types, we # should instead return NotImplemented for __all__ other types. That From 8a26869d7a85c7a697cbb526b30ea2206d165993 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 15:12:44 -0700 Subject: [PATCH 32/35] First pass at addressing PR comments. --- python/cudf/cudf/core/frame.py | 31 ++++++++++++++++--------------- python/cudf/cudf/core/index.py | 22 ++++++++++++---------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c9f9f760ef..e195ce843d3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3306,6 +3306,20 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) +_truediv_int_dtype_corrections = { + "int8": "float32", + "int16": "float32", + "int32": "float32", + "int64": "float64", + "uint8": "float32", + "uint16": "float32", + "uint32": "float64", + "uint64": "float64", + "bool": "float32", + "int": "float", +} + + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -3558,21 +3572,8 @@ def _binaryop( rhs = self._normalize_binop_value(other) - truediv_int_dtype_corrections = { - "int8": "float32", - "int16": "float32", - "int32": "float32", - "int64": "float64", - "uint8": "float32", - "uint16": "float32", - "uint32": "float64", - "uint64": "float64", - "bool": "float32", - "int": "float", - } - if fn == "truediv": - truediv_type = truediv_int_dtype_corrections.get(str(lhs.dtype)) + truediv_type = _truediv_int_dtype_corrections.get(str(lhs.dtype)) if truediv_type is not None: lhs = lhs.astype(truediv_type) @@ -3601,7 +3602,7 @@ def _binaryop( # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. if ( - isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)) + isinstance(other, (SingleColumnFrame, pd.Series, pd.Index)) and self.name != other.name ): result_name = None diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bcbd4162a07..4070ea99b45 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -787,7 +787,7 @@ def difference(self, other, sort=None): def _binaryop(self, other, fn, fill_value=None, reflect=False): # TODO: Rather than including an allowlist of acceptable types, we - # should instead return NotImplemented for __all__ other types. That + # should instead return NotImplemented for __all__ other types. That # will allow other types to support binops with cudf objects if they so # choose, and just as importantly will allow better error messages if # they don't support it. @@ -827,13 +827,15 @@ def _copy_construct(self, **kwargs): elif cls is RangeIndex: # RangeIndex must convert to other numerical types for ops - # TODO: The one exception is that multiplication of a - # RangeIndex in pandas results in another RangeIndex. - # Propagating that information through cudf with the current - # APIs is likely more hacky than it's worth, and it's more a - # performance loss than a functional loss to have an Int64Index - # instead of a RangeIndex. As Index logic is cleaned up it may - # become more feasible to ensure the appropriate type. + # TODO: The one exception to the output type selected here is + # that scalar multiplication of a RangeIndex in pandas results + # in another RangeIndex. Propagating that information through + # cudf with the current internals is possible, but requires + # significant hackery since we'd need _copy_construct or some + # other constructor to be intrinsically capable of processing + # operations. We should fix this behavior once we've completed + # a more thorough refactoring of the various Index classes that + # makes it easier to propagate this logic. try: cls = _dtype_to_index[data.dtype.type] except KeyError: @@ -2939,7 +2941,7 @@ def as_index(arbitrary, **kwargs) -> Index: ) -_dtype_to_index = { +_dtype_to_index: Dict[Any, Type[Index]] = { bool: Index, np.int8: Int8Index, np.int16: Int16Index, @@ -2951,7 +2953,7 @@ def as_index(arbitrary, **kwargs) -> Index: np.uint64: UInt64Index, np.float32: Float32Index, np.float64: Float64Index, -} # type: Dict[Any, Type[Index]] +} _index_to_dtype = { Int8Index: np.int8, From de74ae1478383363f6fb2a4eaff37e13e37e2a9f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 15:53:31 -0700 Subject: [PATCH 33/35] Use binop on column to avoid expensive isnull. --- python/cudf/cudf/core/frame.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e195ce843d3..9d233eb57e5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3586,9 +3586,10 @@ def _binaryop( # If both columns are nullable, pandas semantics dictate that # nulls that are present in both lhs and rhs are not filled. if lhs.nullable and rhs.nullable: - # Note: using bitwise and rather than logical, but these - # are boolean outputs so should be fine. - output_mask = lhs.isnull() & rhs.isnull() + # Note: lhs is a Frame, while rhs is already a column. + lmask = as_column(lhs._column.nullmask) + rmask = as_column(rhs.nullmask) + output_mask = (lmask | rmask).data lhs = lhs.fillna(fill_value) rhs = rhs.fillna(fill_value) elif lhs.nullable: @@ -3612,7 +3613,7 @@ def _binaryop( output = lhs._copy_construct(data=outcol, name=result_name) if output_mask is not None: - output._column[output_mask._column] = None + output._column = output._column.set_mask(output_mask) return output def _normalize_binop_value(self, other): From 15e9458974b67fef57e13a2fcbca6c1da5fc1fe5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 16:36:48 -0700 Subject: [PATCH 34/35] Switch string truediv types to dtypes. --- python/cudf/cudf/core/frame.py | 21 ++++++++++----------- python/cudf/cudf/core/index.py | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9d233eb57e5..3d1866cffeb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3307,16 +3307,15 @@ def _reindex( _truediv_int_dtype_corrections = { - "int8": "float32", - "int16": "float32", - "int32": "float32", - "int64": "float64", - "uint8": "float32", - "uint16": "float32", - "uint32": "float64", - "uint64": "float64", - "bool": "float32", - "int": "float", + np.int8: np.float32, + np.int16: np.float32, + np.int32: np.float32, + np.int64: np.float64, + np.uint8: np.float32, + np.uint16: np.float32, + np.uint32: np.float64, + np.uint64: np.float64, + np.bool_: np.float32, } @@ -3573,7 +3572,7 @@ def _binaryop( rhs = self._normalize_binop_value(other) if fn == "truediv": - truediv_type = _truediv_int_dtype_corrections.get(str(lhs.dtype)) + truediv_type = _truediv_int_dtype_corrections.get(lhs.dtype.type) if truediv_type is not None: lhs = lhs.astype(truediv_type) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4070ea99b45..2846dc241db 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2942,7 +2942,6 @@ def as_index(arbitrary, **kwargs) -> Index: _dtype_to_index: Dict[Any, Type[Index]] = { - bool: Index, np.int8: Int8Index, np.int16: Int16Index, np.int32: Int32Index, From a930a111c7998694e10655142a74e554bf549183 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 16:41:29 -0700 Subject: [PATCH 35/35] Error gracefully on attempts to perform bitwise binops with extension dtypes. --- python/cudf/cudf/core/frame.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3d1866cffeb..2e6a3c54885 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3630,8 +3630,15 @@ def _normalize_binop_value(self, other): def _bitwise_binop(self, other, op): """Type-coercing wrapper around _binaryop for bitwise operations.""" - self_is_bool = np.issubdtype(self.dtype, np.bool_) - other_is_bool = np.issubdtype(other.dtype, np.bool_) + # This will catch attempts at bitwise ops on extension dtypes. + try: + self_is_bool = np.issubdtype(self.dtype, np.bool_) + other_is_bool = np.issubdtype(other.dtype, np.bool_) + except TypeError: + raise TypeError( + f"Operation 'bitwise {op}' not supported between " + f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" + ) if (self_is_bool or np.issubdtype(self.dtype, np.integer)) and ( other_is_bool or np.issubdtype(other.dtype, np.integer)