From 985a7b06b3ec12f06dea8982f3e9ff9476703382 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 15:37:39 -0700 Subject: [PATCH 01/18] Inline _numeric_column_binop. --- python/cudf/cudf/core/column/numerical.py | 40 ++++------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index d710129900a..39bbf10c235 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -11,7 +11,6 @@ import numpy as np import pandas as pd from numba import cuda, njit -from nvtx import annotate from pandas.api.types import is_integer_dtype import cudf @@ -151,7 +150,7 @@ def binary_operator( msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(binop, type(self), type(rhs))) if isinstance(rhs, cudf.core.column.DecimalColumn): - lhs = self.as_decimal_column( + lhs: Union[ScalarLike, ColumnBase] = self.as_decimal_column( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) return lhs.binary_operator(binop, rhs) @@ -163,9 +162,11 @@ def binary_operator( or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): out_dtype = np.dtype("float64") - return _numeric_column_binop( - lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect - ) + + if binop in {"lt", "gt", "le", "ge", "eq", "ne", "NULL_EQUALS"}: + out_dtype = "bool" + lhs, rhs = (self, rhs) if not reflect else (rhs, self) + return libcudf.binaryop.binaryop(lhs, rhs, binop, out_dtype) def _apply_scan_op(self, op: str) -> ColumnBase: return libcudf.reduce.scan(op, self, True) @@ -761,35 +762,6 @@ def to_pandas( return pd_series -@annotate("BINARY_OP", color="orange", domain="cudf_python") -def _numeric_column_binop( - lhs: Union[ColumnBase, ScalarLike], - rhs: Union[ColumnBase, ScalarLike], - op: str, - out_dtype: Dtype, - reflect: bool = False, -) -> ColumnBase: - if reflect: - lhs, rhs = rhs, lhs - - is_op_comparison = op in [ - "lt", - "gt", - "le", - "ge", - "eq", - "ne", - "NULL_EQUALS", - ] - - if is_op_comparison: - out_dtype = "bool" - - out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - - return out - - def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase: if callable(op): return libcudf.transform.transform(operand, op) From 3a416bdb4e7fe0efeff6032871145f43f8559892 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 15:40:06 -0700 Subject: [PATCH 02/18] Inline binop for datetime columns. 
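The inlined code keeps the old `binop` helper's behaviour: pick the output dtype, swap the operands when the operator is reflected, and call `libcudf.binaryop.binaryop` directly. For reference, a small user-level sketch of operations that dispatch through this method (illustrative only; assumes cudf is installed):

    import numpy as np
    import cudf

    s = cudf.Series(np.array(["2021-01-01", "2021-01-02"], dtype="datetime64[s]"))

    # datetime +/- timedelta goes through the add/sub branches of the dtype dispatch
    print(s + np.timedelta64(1, "s"))

    # comparisons take the np.bool_ output dtype branch
    print(s == s)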
--- python/cudf/cudf/core/column/datetime.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 14c82b5ff45..b96a49c2514 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,7 +11,6 @@ import numpy as np import pandas as pd -from nvtx import annotate import cudf from cudf import _lib as libcudf @@ -307,7 +306,7 @@ def binary_operator( ) -> ColumnBase: if isinstance(rhs, cudf.DateOffset): return rhs._datetime_binop(self, op, reflect=reflect) - lhs, rhs = self, rhs + lhs: Union[ScalarLike, ColumnBase] = self if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"): out_dtype = np.dtype(np.bool_) # type: Dtype elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): @@ -332,7 +331,10 @@ def binary_operator( f"Series of dtype {self.dtype} cannot perform " f" the operation {op}" ) - return binop(lhs, rhs, op=op, out_dtype=out_dtype, reflect=reflect) + + if reflect: + lhs, rhs = rhs, lhs + return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None @@ -422,20 +424,6 @@ def _make_copy_with_na_as_null(self): return out_col -@annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop( - lhs: Union[ColumnBase, ScalarLike], - rhs: Union[ColumnBase, ScalarLike], - op: str, - out_dtype: Dtype, - reflect: bool, -) -> ColumnBase: - if reflect: - lhs, rhs = rhs, lhs - out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - return out - - def binop_offset(lhs, rhs, op): if rhs._is_no_op: return lhs From e9a80e0ac36ba15272693dc0f9c52f81170d88ba Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 15:57:47 -0700 Subject: [PATCH 03/18] Inline binop for string columns. --- python/cudf/cudf/core/column/string.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e4d7b6d4188..f969593e6d9 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -12,7 +12,6 @@ import pandas as pd import pyarrow as pa from numba import cuda -from nvtx import annotate import cudf from cudf import _lib as libcudf @@ -5302,7 +5301,9 @@ def binary_operator( if op == "add": return cast("column.ColumnBase", lhs.str().cat(others=rhs)) elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"): - return _string_column_binop(self, rhs, op=op, out_dtype="bool") + return libcudf.binaryop.binaryop( + lhs=self, rhs=rhs, op=op, dtype="bool" + ) raise TypeError( f"{op} operator not supported between {type(self)} and {type(rhs)}" @@ -5335,17 +5336,6 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) -@annotate("BINARY_OP", color="orange", domain="cudf_python") -def _string_column_binop( - lhs: "column.ColumnBase", - rhs: "column.ColumnBase", - op: str, - out_dtype: Dtype, -) -> "column.ColumnBase": - out = libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype) - return out - - def _get_cols_list(parent_obj, others): parent_index = ( From eef4b18bbd463634a14be508201bb52b63651296 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 15:58:32 -0700 Subject: [PATCH 04/18] Various minor cleanup tasks for string columns. 
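Mostly small cleanups: tightened type annotations and removal of the StringColumn._nbytes override. The serialize header keeps the same contents, only the annotation style changes; for reference (sketch, assumes cudf is installed):

    import cudf

    col = cudf.Series(["a", None, "c"])._column
    header, frames = col.serialize()
    # still carries the null count, pickled column type, and size
    print(header["null_count"], header["size"])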
--- python/cudf/cudf/core/column/string.py | 11 ++--------- python/cudf/cudf/tests/test_string.py | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f969593e6d9..5cf32cc9561 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5003,13 +5003,6 @@ def __contains__(self, item: ScalarLike) -> bool: def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - @property - def _nbytes(self) -> int: - if self.size == 0: - return 0 - else: - return self.children[1].size - def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": @@ -5126,7 +5119,7 @@ def to_array(self, fillna: bool = None) -> np.ndarray: return self.to_arrow().to_pandas().values def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs + self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": if nullable: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) @@ -5139,7 +5132,7 @@ def to_pandas( return pd_series def serialize(self) -> Tuple[dict, list]: - header = {"null_count": self.null_count} # type: Dict[Any, Any] + header: Dict[Any, Any] = {"null_count": self.null_count} header["type-serialized"] = pickle.dumps(type(self)) header["size"] = self.size diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 0ff5b81ce81..0f87f007266 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1317,7 +1317,6 @@ def test_string_no_children_properties(): assert empty_col.children == () assert empty_col.size == 0 - assert empty_col._nbytes == 0 assert getsizeof(empty_col) >= 0 # Accounts for Python GC overhead From 7d4ea4e293fa24509d943bd3f7d7e6756068c6fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 16:10:06 -0700 Subject: [PATCH 05/18] Inline binop for timedelta columns. --- python/cudf/cudf/core/column/timedelta.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index d8ad11f41b3..b202838662c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from nvtx import annotate import cudf from cudf import _lib as libcudf @@ -247,7 +246,7 @@ def binary_operator( if reflect: lhs, rhs = rhs, lhs # type: ignore - return binop(lhs, rhs, op=op, out_dtype=out_dtype) + return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) def normalize_binop_value(self, other) -> BinaryOperand: if isinstance(other, cudf.Scalar): @@ -575,17 +574,6 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": ) -@annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop( - lhs: "column.ColumnBase", - rhs: "column.ColumnBase", - op: str, - out_dtype: DtypeObj, -) -> "cudf.core.column.ColumnBase": - out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - return out - - def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): return rhs_dtype From eb692837633b7dbdc836246fcbb50d6bdd212a65 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 16:27:40 -0700 Subject: [PATCH 06/18] Cleanup of categorical. 
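The monotonicity flags previously cached themselves by hand with hasattr checks; they now use the cached_property helper from cudf.utils.utils. A generic sketch of the same pattern, written here with functools.cached_property for illustration:

    from functools import cached_property

    class Sorted:
        def __init__(self, values):
            self._values = values

        @cached_property
        def is_monotonic_increasing(self):
            # evaluated on first access, then stored on the instance
            return all(a <= b for a, b in zip(self._values, self._values[1:]))

    s = Sorted([1, 2, 2, 5])
    print(s.is_monotonic_increasing)  # computed
    print(s.is_monotonic_increasing)  # served from the instance cache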
--- python/cudf/cudf/core/column/categorical.py | 21 ++++---------- python/cudf/cudf/core/column/column.py | 32 +++++++-------------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 3cd1a599ddc..456d2cc2b46 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -35,6 +35,7 @@ min_signed_type, min_unsigned_type, ) +from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ( @@ -819,7 +820,7 @@ def __contains__(self, item: ScalarLike) -> bool: return self._encode(item) in self.as_numerical def serialize(self) -> Tuple[dict, list]: - header = {} # type: Dict[Any, Any] + header: Dict[Any, Any] = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"], dtype_frames = self.dtype.serialize() @@ -1341,23 +1342,13 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ return self.as_numerical.find_last_value(self._encode(value)) - @property + @cached_property def is_monotonic_increasing(self) -> bool: - if not hasattr(self, "_is_monotonic_increasing"): - self._is_monotonic_increasing = ( - bool(self.ordered) - and self.as_numerical.is_monotonic_increasing - ) - return self._is_monotonic_increasing + return bool(self.ordered) and self.as_numerical.is_monotonic_increasing - @property + @cached_property def is_monotonic_decreasing(self) -> bool: - if not hasattr(self, "_is_monotonic_decreasing"): - self._is_monotonic_decreasing = ( - bool(self.ordered) - and self.as_numerical.is_monotonic_decreasing - ) - return self._is_monotonic_decreasing + return bool(self.ordered) and self.as_numerical.is_monotonic_decreasing def as_categorical_column( self, dtype: Dtype, **kwargs diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0f039b137bc..62767c6aae3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -56,7 +56,7 @@ min_unsigned_type, np_to_pa_dtype, ) -from cudf.utils.utils import mask_dtype +from cudf.utils.utils import cached_property, mask_dtype T = TypeVar("T", bound="ColumnBase") @@ -932,27 +932,17 @@ def is_unique(self) -> bool: def is_monotonic(self) -> bool: return self.is_monotonic_increasing - @property + @cached_property def is_monotonic_increasing(self) -> bool: - if not hasattr(self, "_is_monotonic_increasing"): - if self.has_nulls: - self._is_monotonic_increasing = False - else: - self._is_monotonic_increasing = self.as_frame()._is_sorted( - ascending=None, null_position=None - ) - return self._is_monotonic_increasing + return not self.has_nulls and self.as_frame()._is_sorted( + ascending=None, null_position=None + ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: - if not hasattr(self, "_is_monotonic_decreasing"): - if self.has_nulls: - self._is_monotonic_decreasing = False - else: - self._is_monotonic_decreasing = self.as_frame()._is_sorted( - ascending=[False], null_position=None - ) - return self._is_monotonic_decreasing + return not self.has_nulls and self.as_frame()._is_sorted( + ascending=[False], null_position=None + ) def get_slice_bound( self, label: ScalarLike, side: builtins.str, kind: builtins.str @@ -1211,7 +1201,7 @@ def unique(self) -> ColumnBase: ) def serialize(self) -> Tuple[dict, list]: - header = {} # type: Dict[Any, Any] + header: Dict[Any, Any] = {} frames = [] header["type-serialized"] = 
pickle.dumps(type(self)) header["dtype"] = self.dtype.str @@ -2226,7 +2216,7 @@ def serialize_columns(columns) -> Tuple[List[dict], List]: frames : list list of frames """ - headers = [] # type List[Dict[Any, Any], ...] + headers: List[Dict[Any, Any]] = [] frames = [] if len(columns) > 0: From 13fee0e4f8b14c5df9a66ca852f3336053af7133 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 08:36:35 -0700 Subject: [PATCH 07/18] Move ColumnBase._concat out of the class. --- python/cudf/cudf/core/column/column.py | 201 ++++++++++++------------- python/cudf/cudf/core/index.py | 3 +- python/cudf/cudf/core/series.py | 3 +- 3 files changed, 104 insertions(+), 103 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 62767c6aae3..c2b40c0d70e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -197,106 +197,6 @@ def cat( def str(self, parent=None) -> "cudf.core.column.string.StringMethods": raise NotImplementedError() - @classmethod - def _concat( - cls, objs: "MutableSequence[ColumnBase]", dtype: Dtype = None - ) -> ColumnBase: - if len(objs) == 0: - dtype = pd.api.types.pandas_dtype(dtype) - if is_categorical_dtype(dtype): - dtype = CategoricalDtype() - return column_empty(0, dtype=dtype, masked=True) - - # If all columns are `NumericalColumn` with different dtypes, - # we cast them to a common dtype. - # Notice, we can always cast pure null columns - not_null_cols = list(filter(lambda o: o.valid_count > 0, objs)) - if len(not_null_cols) > 0 and ( - len( - [ - o - for o in not_null_cols - if not is_numerical_dtype(o.dtype) - or np.issubdtype(o.dtype, np.datetime64) - ] - ) - == 0 - ): - col_dtypes = [o.dtype for o in not_null_cols] - # Use NumPy to find a common dtype - common_dtype = np.find_common_type(col_dtypes, []) - # Cast all columns to the common dtype - for i in range(len(objs)): - objs[i] = objs[i].astype(common_dtype) - - # Find the first non-null column: - head = objs[0] - for i, obj in enumerate(objs): - if obj.valid_count > 0: - head = obj - break - - for i, obj in enumerate(objs): - # Check that all columns are the same type: - if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype): - # if all null, cast to appropriate dtype - if obj.valid_count == 0: - objs[i] = column_empty_like( - head, dtype=head.dtype, masked=True, newsize=len(obj) - ) - else: - raise ValueError("All columns must be the same type") - - cats = None - is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) - - # Combine CategoricalColumn categories - if is_categorical: - # Combine and de-dupe the categories - cats = ( - cudf.concat([o.cat().categories for o in objs]) - .to_series() - .drop_duplicates(ignore_index=True) - ._column - ) - objs = [ - o.cat()._set_categories( - o.cat().categories, cats, is_unique=True - ) - for o in objs - ] - # Map `objs` into a list of the codes until we port Categorical to - # use the libcudf++ Category data type. 
- objs = [o.cat().codes._column for o in objs] - head = head.cat().codes._column - - newsize = sum(map(len, objs)) - if newsize > libcudf.MAX_COLUMN_SIZE: - raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" - ) - - # Filter out inputs that have 0 length - objs = [o for o in objs if len(o) > 0] - - # Perform the actual concatenation - if newsize > 0: - col = libcudf.concat.concat_columns(objs) - else: - col = column_empty(0, head.dtype, masked=True) - - if is_categorical: - col = build_categorical_column( - categories=as_column(cats), - codes=as_column(col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - offset=col.offset, - ) - - return col - def dropna(self, drop_nan: bool = False) -> ColumnBase: if drop_nan: col = self.nans_to_nulls() @@ -796,7 +696,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return indices[-1] def append(self, other: ColumnBase) -> ColumnBase: - return self.__class__._concat([self, as_column(other)]) + return _concat_columns([self, as_column(other)]) def quantile( self, @@ -2336,3 +2236,102 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: dtype: int8 """ return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) + + +def _concat_columns( + objs: "MutableSequence[ColumnBase]", dtype: Dtype = None +) -> ColumnBase: + """Concatenate a sequence of columns.""" + if len(objs) == 0: + dtype = pd.api.types.pandas_dtype(dtype) + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + return column_empty(0, dtype=dtype, masked=True) + + # If all columns are `NumericalColumn` with different dtypes, + # we cast them to a common dtype. + # Notice, we can always cast pure null columns + not_null_cols = list(filter(lambda o: o.valid_count > 0, objs)) + if len(not_null_cols) > 0 and ( + len( + [ + o + for o in not_null_cols + if not is_numerical_dtype(o.dtype) + or np.issubdtype(o.dtype, np.datetime64) + ] + ) + == 0 + ): + col_dtypes = [o.dtype for o in not_null_cols] + # Use NumPy to find a common dtype + common_dtype = np.find_common_type(col_dtypes, []) + # Cast all columns to the common dtype + for i in range(len(objs)): + objs[i] = objs[i].astype(common_dtype) + + # Find the first non-null column: + head = objs[0] + for i, obj in enumerate(objs): + if obj.valid_count > 0: + head = obj + break + + for i, obj in enumerate(objs): + # Check that all columns are the same type: + if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype): + # if all null, cast to appropriate dtype + if obj.valid_count == 0: + objs[i] = column_empty_like( + head, dtype=head.dtype, masked=True, newsize=len(obj) + ) + else: + raise ValueError("All columns must be the same type") + + cats = None + is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) + + # Combine CategoricalColumn categories + if is_categorical: + # Combine and de-dupe the categories + cats = ( + cudf.concat([o.cat().categories for o in objs]) + .to_series() + .drop_duplicates(ignore_index=True) + ._column + ) + objs = [ + o.cat()._set_categories(o.cat().categories, cats, is_unique=True) + for o in objs + ] + # Map `objs` into a list of the codes until we port Categorical to + # use the libcudf++ Category data type. 
+ objs = [o.cat().codes._column for o in objs] + head = head.cat().codes._column + + newsize = sum(map(len, objs)) + if newsize > libcudf.MAX_COLUMN_SIZE: + raise MemoryError( + f"Result of concat cannot have " + f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + ) + + # Filter out inputs that have 0 length + objs = [o for o in objs if len(o) > 0] + + # Perform the actual concatenation + if newsize > 0: + col = libcudf.concat.concat_columns(objs) + else: + col = column_empty(0, head.dtype, masked=True) + + if is_categorical: + col = build_categorical_column( + categories=as_column(cats), + codes=as_column(col.base_data, dtype=col.dtype), + mask=col.base_mask, + size=col.size, + offset=col.offset, + ) + + return col diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0ffe0c11fef..c2528a91927 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -27,6 +27,7 @@ arange, column, ) +from cudf.core.column.column import _concat_columns from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame @@ -779,7 +780,7 @@ def sum(self): @classmethod def _concat(cls, objs): - data = ColumnBase._concat([o._values for o in objs]) + data = _concat_columns([o._values for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: [name] = names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4cc5fb56a4c..67bf6805d38 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -34,6 +34,7 @@ from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, ) +from cudf.core.column.column import _concat_columns from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods @@ -2749,7 +2750,7 @@ def _concat(cls, objs, axis=0, index=True): if dtype_mismatch: objs = numeric_normalize_types(*objs) - col = ColumnBase._concat([o._column for o in objs]) + col = _concat_columns([o._column for o in objs]) return cls(data=col, index=index, name=name) @property From 42a490c025b19f674c3a47f16387302896d5e6fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 12:15:07 -0700 Subject: [PATCH 08/18] Various cleanup tasks in _concat_columns. --- python/cudf/cudf/core/column/column.py | 55 +++++++++----------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c2b40c0d70e..cb1e3a78196 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2238,44 +2238,29 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) -def _concat_columns( - objs: "MutableSequence[ColumnBase]", dtype: Dtype = None -) -> ColumnBase: +def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" - if len(objs) == 0: - dtype = pd.api.types.pandas_dtype(dtype) - if is_categorical_dtype(dtype): - dtype = CategoricalDtype() - return column_empty(0, dtype=dtype, masked=True) + # TODO: This function currently assumes that len(objs) > 0. Concatenation + # is only accessible via cudf.concat (which calls through to this from + # methods like Index._concat or Series._concat), and that method + # pre-filters on zero objects being provided. 
We may need to add logic for + # handling an empty input sequence as more Frame logic becomes centralized. # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. # Notice, we can always cast pure null columns - not_null_cols = list(filter(lambda o: o.valid_count > 0, objs)) - if len(not_null_cols) > 0 and ( - len( - [ - o - for o in not_null_cols - if not is_numerical_dtype(o.dtype) - or np.issubdtype(o.dtype, np.datetime64) - ] - ) - == 0 + not_null_col_dtypes = [o.dtype for o in objs if o.valid_count > 0] + if len(not_null_col_dtypes) and all( + is_numerical_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) + for dtyp in not_null_col_dtypes ): - col_dtypes = [o.dtype for o in not_null_cols] # Use NumPy to find a common dtype - common_dtype = np.find_common_type(col_dtypes, []) + common_dtype = np.find_common_type(not_null_col_dtypes, []) # Cast all columns to the common dtype - for i in range(len(objs)): - objs[i] = objs[i].astype(common_dtype) + objs = [obj.astype(common_dtype) for obj in objs] # Find the first non-null column: - head = objs[0] - for i, obj in enumerate(objs): - if obj.valid_count > 0: - head = obj - break + head = next((obj for obj in objs if obj.valid_count), objs[0]) for i, obj in enumerate(objs): # Check that all columns are the same type: @@ -2288,7 +2273,6 @@ def _concat_columns( else: raise ValueError("All columns must be the same type") - cats = None is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) # Combine CategoricalColumn categories @@ -2315,15 +2299,12 @@ def _concat_columns( f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) - - # Filter out inputs that have 0 length - objs = [o for o in objs if len(o) > 0] - - # Perform the actual concatenation - if newsize > 0: - col = libcudf.concat.concat_columns(objs) - else: + elif newsize == 0: col = column_empty(0, head.dtype, masked=True) + else: + # Filter out inputs that have 0 length, then concatenate. + objs = [o for o in objs if len(o) > 0] + col = libcudf.concat.concat_columns(objs) if is_categorical: col = build_categorical_column( From f8c2590815e987684dfc72510990a3a928a3b7ee Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 14:20:05 -0700 Subject: [PATCH 09/18] Move categorical column concatenation into CategoricalColumn. --- python/cudf/cudf/core/column/categorical.py | 47 +++++++++++++++++++++ python/cudf/cudf/core/column/column.py | 40 ++++++------------ 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 456d2cc2b46..d9a02a2c981 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +from collections.abc import MutableSequence from typing import ( TYPE_CHECKING, Any, @@ -1463,6 +1464,52 @@ def view(self, dtype: Dtype) -> ColumnBase: "Categorical column views are not currently supported" ) + @staticmethod + def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: + # TODO: This function currently assumes it is being called from + # column._concat_columns, at least to the extent that all the + # preprocessing in that function has already been done. That should be + # improved as the concatenation API is solidified. 
+ # Find the first non-null column: + head = next((obj for obj in objs if obj.valid_count), objs[0]) + + # Combine and de-dupe the categories + cats = ( + cudf.concat([o.cat().categories for o in objs]) + .to_series() + .drop_duplicates(ignore_index=True) + ._column + ) + objs = [ + o.cat()._set_categories(o.cat().categories, cats, is_unique=True) + for o in objs + ] + # Map `objs` into a list of the codes until we port Categorical to + # use the libcudf++ Category data type. + objs = [o.cat().codes._column for o in objs] + head = head.cat().codes._column + + newsize = sum(map(len, objs)) + if newsize > libcudf.MAX_COLUMN_SIZE: + raise MemoryError( + f"Result of concat cannot have " + f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + ) + elif newsize == 0: + col = column.column_empty(0, head.dtype, masked=True) + else: + # Filter out inputs that have 0 length, then concatenate. + objs = [o for o in objs if len(o) > 0] + col = libcudf.concat.concat_columns(objs) + + return column.build_categorical_column( + categories=column.as_column(cats), + codes=column.as_column(col.base_data, dtype=col.dtype), + mask=col.base_mask, + size=col.size, + offset=col.offset, + ) + def _create_empty_categorical_column( categorical_column: CategoricalColumn, dtype: "CategoricalDtype" diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cb1e3a78196..e6dd8ac822a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2273,25 +2273,19 @@ def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: else: raise ValueError("All columns must be the same type") - is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) - - # Combine CategoricalColumn categories - if is_categorical: - # Combine and de-dupe the categories - cats = ( - cudf.concat([o.cat().categories for o in objs]) - .to_series() - .drop_duplicates(ignore_index=True) - ._column + # TODO: This logic should be generalized to a dispatch to + # ColumnBase._concat so that all subclasses can override necessary + # behavior. However, at the moment it's not clear what that API should look + # like, so CategoricalColumn simply implements a minimal working API. + if all(is_categorical_dtype(o.dtype) for o in objs): + return cudf.core.column.categorical.CategoricalColumn._concat( + cast( + MutableSequence[ + cudf.core.column.categorical.CategoricalColumn + ], + objs, + ) ) - objs = [ - o.cat()._set_categories(o.cat().categories, cats, is_unique=True) - for o in objs - ] - # Map `objs` into a list of the codes until we port Categorical to - # use the libcudf++ Category data type. - objs = [o.cat().codes._column for o in objs] - head = head.cat().codes._column newsize = sum(map(len, objs)) if newsize > libcudf.MAX_COLUMN_SIZE: @@ -2305,14 +2299,4 @@ def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Filter out inputs that have 0 length, then concatenate. objs = [o for o in objs if len(o) > 0] col = libcudf.concat.concat_columns(objs) - - if is_categorical: - col = build_categorical_column( - categories=as_column(cats), - codes=as_column(col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - offset=col.offset, - ) - return col From 4338d576e923849bcc1205fb8c3f3ecee4f637b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 14:36:19 -0700 Subject: [PATCH 10/18] Remove unnecessary generic methods. 
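With the ColumnBase stubs gone, CategoricalColumn and StringColumn still define cat()/str() themselves, while other column types simply no longer have the attribute, so the failure mode becomes AttributeError rather than NotImplementedError. A quick sketch (illustrative only; assumes cudf is installed):

    import cudf

    col = cudf.Series([1, 2, 3])._column  # a NumericalColumn
    try:
        col.cat()
    except AttributeError as exc:
        print(exc)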
--- python/cudf/cudf/core/column/column.py | 8 -------- python/cudf/cudf/core/column/string.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e6dd8ac822a..a73836fb972 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -189,14 +189,6 @@ def __sizeof__(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def cat( - self, parent=None - ) -> "cudf.core.column.categorical.CategoricalAccessor": - raise NotImplementedError() - - def str(self, parent=None) -> "cudf.core.column.string.StringMethods": - raise NotImplementedError() - def dropna(self, drop_nan: bool = False) -> ColumnBase: if drop_nan: col = self.nans_to_nulls() diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 5cf32cc9561..4e3b7963f56 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4976,7 +4976,7 @@ def sum( result_col = self._process_for_reduction( skipna=skipna, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): + if isinstance(result_col, type(self)): return result_col.str().cat() else: return result_col From 45c0c04e96db4033e72451e6a15571c607261cc8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 10:59:47 -0700 Subject: [PATCH 11/18] Address PR comments. --- python/cudf/cudf/core/column/categorical.py | 6 +++--- python/cudf/cudf/core/column/column.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d9a02a2c981..6a70089e92e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -36,7 +36,6 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ( @@ -1343,11 +1342,11 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ return self.as_numerical.find_last_value(self._encode(value)) - @cached_property + @property def is_monotonic_increasing(self) -> bool: return bool(self.ordered) and self.as_numerical.is_monotonic_increasing - @cached_property + @property def is_monotonic_decreasing(self) -> bool: return bool(self.ordered) and self.as_numerical.is_monotonic_decreasing @@ -1470,6 +1469,7 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: # column._concat_columns, at least to the extent that all the # preprocessing in that function has already been done. That should be # improved as the concatenation API is solidified. 
+ # Find the first non-null column: head = next((obj for obj in objs if obj.valid_count), objs[0]) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a73836fb972..735a7b4645c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -56,7 +56,7 @@ min_unsigned_type, np_to_pa_dtype, ) -from cudf.utils.utils import cached_property, mask_dtype +from cudf.utils.utils import mask_dtype T = TypeVar("T", bound="ColumnBase") @@ -824,13 +824,13 @@ def is_unique(self) -> bool: def is_monotonic(self) -> bool: return self.is_monotonic_increasing - @cached_property + @property def is_monotonic_increasing(self) -> bool: return not self.has_nulls and self.as_frame()._is_sorted( ascending=None, null_position=None ) - @cached_property + @property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls and self.as_frame()._is_sorted( ascending=[False], null_position=None From 5a69ea483928dd6e3bc8bf4f93eedc4c79b25ec1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 13:51:58 -0700 Subject: [PATCH 12/18] Switch from collections.abc to typing for MutableSequence. --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 735a7b4645c..d0ed6c98d32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -5,13 +5,13 @@ import builtins import pickle import warnings -from collections.abc import MutableSequence from types import SimpleNamespace from typing import ( Any, Callable, Dict, List, + MutableSequence, Optional, Sequence, Tuple, From fb030a9a5197cc0868d22f06d8f0bce80fe4ba2d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 17:08:10 -0700 Subject: [PATCH 13/18] Avoid caching monotonic properties in MultiIndex. --- python/cudf/cudf/core/multiindex.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ca029198e52..6b551dc72c3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1386,11 +1386,7 @@ def is_monotonic_increasing(self): Return if the index is monotonic increasing (only equal or increasing) values. """ - if not hasattr(self, "_is_monotonic_increasing"): - self._is_monotonic_increasing = self._is_sorted( - ascending=None, null_position=None - ) - return self._is_monotonic_increasing + return self._is_sorted(ascending=None, null_position=None) @property def is_monotonic_decreasing(self): @@ -1398,11 +1394,9 @@ def is_monotonic_decreasing(self): Return if the index is monotonic decreasing (only equal or decreasing) values. """ - if not hasattr(self, "_is_monotonic_decreasing"): - self._is_monotonic_decreasing = self._is_sorted( - ascending=[False] * len(self.levels), null_position=None - ) - return self._is_monotonic_decreasing + return self._is_sorted( + ascending=[False] * len(self.levels), null_position=None + ) def argsort(self, ascending=True, **kwargs): indices = self._source_data.argsort(ascending=ascending, **kwargs) From 4d1239b54baa1759a874db2b9d7f36a1199214c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 5 May 2021 18:20:21 -0700 Subject: [PATCH 14/18] Handle empty list of objects to concatenate. 
--- python/cudf/cudf/core/column/column.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d0ed6c98d32..9e632a57383 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2232,11 +2232,9 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" - # TODO: This function currently assumes that len(objs) > 0. Concatenation - # is only accessible via cudf.concat (which calls through to this from - # methods like Index._concat or Series._concat), and that method - # pre-filters on zero objects being provided. We may need to add logic for - # handling an empty input sequence as more Frame logic becomes centralized. + if len(objs) == 0: + dtype = pd.api.types.pandas_dtype(None) + return column_empty(0, dtype=dtype, masked=True) # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. From 05c33921bbbc6bca34fa903ec261f56fab91ce6a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 11 May 2021 16:34:46 -0700 Subject: [PATCH 15/18] Apply suggestions from code review Co-authored-by: Keith Kraus --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6a70089e92e..97bc097b7d2 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1499,7 +1499,7 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: col = column.column_empty(0, head.dtype, masked=True) else: # Filter out inputs that have 0 length, then concatenate. - objs = [o for o in objs if len(o) > 0] + objs = [o for o in objs if len(o)] col = libcudf.concat.concat_columns(objs) return column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9e632a57383..31fbd33c990 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2239,7 +2239,7 @@ def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. # Notice, we can always cast pure null columns - not_null_col_dtypes = [o.dtype for o in objs if o.valid_count > 0] + not_null_col_dtypes = [o.dtype for o in objs if o.valid_count] if len(not_null_col_dtypes) and all( is_numerical_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) for dtyp in not_null_col_dtypes @@ -2287,6 +2287,6 @@ def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: col = column_empty(0, head.dtype, masked=True) else: # Filter out inputs that have 0 length, then concatenate. - objs = [o for o in objs if len(o) > 0] + objs = [o for o in objs if len(o)] col = libcudf.concat.concat_columns(objs) return col From 6d73c444ab8f6a87e52ead0418a7fe8c0f6108a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 14 May 2021 10:20:06 -0700 Subject: [PATCH 16/18] Use codes directly. 
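CategoricalColumn already exposes its integer codes as a column-level property, so the accessor round-trip (o.cat().codes._column) is unnecessary here. An illustrative sketch of the two paths (assumes cudf is installed):

    import cudf

    s = cudf.Series(["a", "b", "a"], dtype="category")

    # public accessor: materialises a Series of codes
    print(s.cat.codes)

    # column-level property used in _concat: the raw codes column
    print(type(s._column.codes))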
--- python/cudf/cudf/core/column/categorical.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 97bc097b7d2..67f50cedb1e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1484,30 +1484,27 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: o.cat()._set_categories(o.cat().categories, cats, is_unique=True) for o in objs ] - # Map `objs` into a list of the codes until we port Categorical to - # use the libcudf++ Category data type. - objs = [o.cat().codes._column for o in objs] - head = head.cat().codes._column + codes = [o.codes for o in objs] - newsize = sum(map(len, objs)) + newsize = sum(map(len, codes)) if newsize > libcudf.MAX_COLUMN_SIZE: raise MemoryError( f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - col = column.column_empty(0, head.dtype, masked=True) + codes_col = column.column_empty(0, head.codes.dtype, masked=True) else: # Filter out inputs that have 0 length, then concatenate. - objs = [o for o in objs if len(o)] - col = libcudf.concat.concat_columns(objs) + codes = [o for o in codes if len(o)] + codes_col = libcudf.concat.concat_columns(objs) return column.build_categorical_column( categories=column.as_column(cats), - codes=column.as_column(col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - offset=col.offset, + codes=column.as_column(codes_col.base_data, dtype=codes_col.dtype), + mask=codes_col.base_mask, + size=codes_col.size, + offset=codes_col.offset, ) From 565c99779b7b892f2b1aa3ca5cfbd2bcdc25b2f4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 14 May 2021 11:25:07 -0700 Subject: [PATCH 17/18] Use column.ColumnBase and remove import to try to appease flake8. --- python/cudf/cudf/core/series.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cc594474520..9dec934a0ae 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,7 +22,6 @@ from cudf._lib.transform import bools_to_mask from cudf.core.abc import Serializable from cudf.core.column import ( - ColumnBase, DatetimeColumn, TimeDeltaColumn, arange, @@ -2148,7 +2147,7 @@ def _normalize_binop_value(self, other): """Returns a *column* (not a Series) or scalar for performing binary operations with self._column. """ - if isinstance(other, ColumnBase): + if isinstance(other, column.ColumnBase): return other if isinstance(other, Series): return other._column From 49ca36a6cd49c27f0609307c3c3bb99cef72b4cc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 14 May 2021 14:10:27 -0700 Subject: [PATCH 18/18] Remove conversion to series. --- python/cudf/cudf/core/column/categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 67f50cedb1e..c199947d261 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1476,8 +1476,7 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: # Combine and de-dupe the categories cats = ( cudf.concat([o.cat().categories for o in objs]) - .to_series() - .drop_duplicates(ignore_index=True) + .drop_duplicates() ._column ) objs = [