diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 3cd1a599ddc..c199947d261 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import pickle
+from collections.abc import MutableSequence
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -819,7 +820,7 @@ def __contains__(self, item: ScalarLike) -> bool:
         return self._encode(item) in self.as_numerical
 
     def serialize(self) -> Tuple[dict, list]:
-        header = {}  # type: Dict[Any, Any]
+        header: Dict[Any, Any] = {}
         frames = []
         header["type-serialized"] = pickle.dumps(type(self))
         header["dtype"], dtype_frames = self.dtype.serialize()
@@ -1343,21 +1344,11 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
 
     @property
     def is_monotonic_increasing(self) -> bool:
-        if not hasattr(self, "_is_monotonic_increasing"):
-            self._is_monotonic_increasing = (
-                bool(self.ordered)
-                and self.as_numerical.is_monotonic_increasing
-            )
-        return self._is_monotonic_increasing
+        return bool(self.ordered) and self.as_numerical.is_monotonic_increasing
 
     @property
     def is_monotonic_decreasing(self) -> bool:
-        if not hasattr(self, "_is_monotonic_decreasing"):
-            self._is_monotonic_decreasing = (
-                bool(self.ordered)
-                and self.as_numerical.is_monotonic_decreasing
-            )
-        return self._is_monotonic_decreasing
+        return bool(self.ordered) and self.as_numerical.is_monotonic_decreasing
 
     def as_categorical_column(
         self, dtype: Dtype, **kwargs
@@ -1472,6 +1463,49 @@ def view(self, dtype: Dtype) -> ColumnBase:
             "Categorical column views are not currently supported"
         )
 
+    @staticmethod
+    def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
+        # TODO: This function currently assumes it is being called from
+        # column._concat_columns, at least to the extent that all the
+        # preprocessing in that function has already been done. That should be
+        # improved as the concatenation API is solidified.
+
+        # Find the first non-null column:
+        head = next((obj for obj in objs if obj.valid_count), objs[0])
+
+        # Combine and de-dupe the categories
+        cats = (
+            cudf.concat([o.cat().categories for o in objs])
+            .drop_duplicates()
+            ._column
+        )
+        objs = [
+            o.cat()._set_categories(o.cat().categories, cats, is_unique=True)
+            for o in objs
+        ]
+        codes = [o.codes for o in objs]
+
+        newsize = sum(map(len, codes))
+        if newsize > libcudf.MAX_COLUMN_SIZE:
+            raise MemoryError(
+                f"Result of concat cannot have "
+                f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+            )
+        elif newsize == 0:
+            codes_col = column.column_empty(0, head.codes.dtype, masked=True)
+        else:
+            # Filter out inputs that have 0 length, then concatenate.
+            objs = [o for o in objs if len(o)]
+            codes_col = libcudf.concat.concat_columns(objs)
+
+        return column.build_categorical_column(
+            categories=column.as_column(cats),
+            codes=column.as_column(codes_col.base_data, dtype=codes_col.dtype),
+            mask=codes_col.base_mask,
+            size=codes_col.size,
+            offset=codes_col.offset,
+        )
+
 
 def _create_empty_categorical_column(
     categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 42bfce0408c..20f302f7e59 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -5,13 +5,13 @@
 import builtins
 import pickle
 import warnings
-from collections.abc import MutableSequence
 from types import SimpleNamespace
 from typing import (
     Any,
     Callable,
     Dict,
     List,
+    MutableSequence,
     Optional,
     Sequence,
     Tuple,
@@ -189,114 +189,6 @@ def __sizeof__(self) -> int:
             n += bitmask_allocation_size_bytes(self.size)
         return n
 
-    def cat(
-        self, parent=None
-    ) -> "cudf.core.column.categorical.CategoricalAccessor":
-        raise NotImplementedError()
-
-    def str(self, parent=None) -> "cudf.core.column.string.StringMethods":
-        raise NotImplementedError()
-
-    @classmethod
-    def _concat(
-        cls, objs: "MutableSequence[ColumnBase]", dtype: Dtype = None
-    ) -> ColumnBase:
-        if len(objs) == 0:
-            dtype = pd.api.types.pandas_dtype(dtype)
-            if is_categorical_dtype(dtype):
-                dtype = CategoricalDtype()
-            return column_empty(0, dtype=dtype, masked=True)
-
-        # If all columns are `NumericalColumn` with different dtypes,
-        # we cast them to a common dtype.
-        # Notice, we can always cast pure null columns
-        not_null_cols = list(filter(lambda o: o.valid_count > 0, objs))
-        if len(not_null_cols) > 0 and (
-            len(
-                [
-                    o
-                    for o in not_null_cols
-                    if not is_numerical_dtype(o.dtype)
-                    or np.issubdtype(o.dtype, np.datetime64)
-                ]
-            )
-            == 0
-        ):
-            col_dtypes = [o.dtype for o in not_null_cols]
-            # Use NumPy to find a common dtype
-            common_dtype = np.find_common_type(col_dtypes, [])
-            # Cast all columns to the common dtype
-            for i in range(len(objs)):
-                objs[i] = objs[i].astype(common_dtype)
-
-        # Find the first non-null column:
-        head = objs[0]
-        for i, obj in enumerate(objs):
-            if obj.valid_count > 0:
-                head = obj
-                break
-
-        for i, obj in enumerate(objs):
-            # Check that all columns are the same type:
-            if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
-                # if all null, cast to appropriate dtype
-                if obj.valid_count == 0:
-                    objs[i] = column_empty_like(
-                        head, dtype=head.dtype, masked=True, newsize=len(obj)
-                    )
-                else:
-                    raise ValueError("All columns must be the same type")
-
-        cats = None
-        is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)
-
-        # Combine CategoricalColumn categories
-        if is_categorical:
-            # Combine and de-dupe the categories
-            cats = (
-                cudf.concat([o.cat().categories for o in objs])
-                .to_series()
-                .drop_duplicates(ignore_index=True)
-                ._column
-            )
-            objs = [
-                o.cat()._set_categories(
-                    o.cat().categories, cats, is_unique=True
-                )
-                for o in objs
-            ]
-            # Map `objs` into a list of the codes until we port Categorical to
-            # use the libcudf++ Category data type.
-            objs = [o.cat().codes._column for o in objs]
-            head = head.cat().codes._column
-
-        newsize = sum(map(len, objs))
-        if newsize > libcudf.MAX_COLUMN_SIZE:
-            raise MemoryError(
-                f"Result of concat cannot have "
-                f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
-            )
-
-        # Filter out inputs that have 0 length
-        objs = [o for o in objs if len(o) > 0]
-
-        # Perform the actual concatenation
-        if newsize > 0:
-            col = libcudf.concat.concat_columns(objs)
-        else:
-            col = column_empty(0, head.dtype, masked=True)
-
-        if is_categorical:
-            col = build_categorical_column(
-                categories=as_column(cats),
-                codes=as_column(col.base_data, dtype=col.dtype),
-                mask=col.base_mask,
-                size=col.size,
-                offset=col.offset,
-            )
-
-        return col
-
     def dropna(self, drop_nan: bool = False) -> ColumnBase:
         if drop_nan:
             col = self.nans_to_nulls()
@@ -796,7 +688,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
         return indices[-1]
 
     def append(self, other: ColumnBase) -> ColumnBase:
-        return self.__class__._concat([self, as_column(other)])
+        return _concat_columns([self, as_column(other)])
 
     def quantile(
         self,
@@ -934,25 +826,15 @@ def is_monotonic(self) -> bool:
 
     @property
     def is_monotonic_increasing(self) -> bool:
-        if not hasattr(self, "_is_monotonic_increasing"):
-            if self.has_nulls:
-                self._is_monotonic_increasing = False
-            else:
-                self._is_monotonic_increasing = self.as_frame()._is_sorted(
-                    ascending=None, null_position=None
-                )
-        return self._is_monotonic_increasing
+        return not self.has_nulls and self.as_frame()._is_sorted(
+            ascending=None, null_position=None
+        )
 
     @property
     def is_monotonic_decreasing(self) -> bool:
-        if not hasattr(self, "_is_monotonic_decreasing"):
-            if self.has_nulls:
-                self._is_monotonic_decreasing = False
-            else:
-                self._is_monotonic_decreasing = self.as_frame()._is_sorted(
-                    ascending=[False], null_position=None
-                )
-        return self._is_monotonic_decreasing
+        return not self.has_nulls and self.as_frame()._is_sorted(
+            ascending=[False], null_position=None
+        )
 
     def get_slice_bound(
         self, label: ScalarLike, side: builtins.str, kind: builtins.str
@@ -1211,7 +1093,7 @@ def unique(self) -> ColumnBase:
         )
 
     def serialize(self) -> Tuple[dict, list]:
-        header = {}  # type: Dict[Any, Any]
+        header: Dict[Any, Any] = {}
         frames = []
         header["type-serialized"] = pickle.dumps(type(self))
         header["dtype"] = self.dtype.str
@@ -2226,7 +2108,7 @@ def serialize_columns(columns) -> Tuple[List[dict], List]:
     frames : list
         list of frames
     """
-    headers = []  # type List[Dict[Any, Any], ...]
+    headers: List[Dict[Any, Any]] = []
     frames = []
 
     if len(columns) > 0:
@@ -2346,3 +2228,65 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
     dtype: int8
     """
     return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)
+
+
+def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
+    """Concatenate a sequence of columns."""
+    if len(objs) == 0:
+        dtype = pd.api.types.pandas_dtype(None)
+        return column_empty(0, dtype=dtype, masked=True)
+
+    # If all columns are `NumericalColumn` with different dtypes,
+    # we cast them to a common dtype.
+    # Notice, we can always cast pure null columns
+    not_null_col_dtypes = [o.dtype for o in objs if o.valid_count]
+    if len(not_null_col_dtypes) and all(
+        is_numerical_dtype(dtyp) and not np.issubdtype(dtyp, np.datetime64)
+        for dtyp in not_null_col_dtypes
+    ):
+        # Use NumPy to find a common dtype
+        common_dtype = np.find_common_type(not_null_col_dtypes, [])
+        # Cast all columns to the common dtype
+        objs = [obj.astype(common_dtype) for obj in objs]
+
+    # Find the first non-null column:
+    head = next((obj for obj in objs if obj.valid_count), objs[0])
+
+    for i, obj in enumerate(objs):
+        # Check that all columns are the same type:
+        if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
+            # if all null, cast to appropriate dtype
+            if obj.valid_count == 0:
+                objs[i] = column_empty_like(
+                    head, dtype=head.dtype, masked=True, newsize=len(obj)
+                )
+            else:
+                raise ValueError("All columns must be the same type")
+
+    # TODO: This logic should be generalized to a dispatch to
+    # ColumnBase._concat so that all subclasses can override necessary
+    # behavior. However, at the moment it's not clear what that API should look
+    # like, so CategoricalColumn simply implements a minimal working API.
+    if all(is_categorical_dtype(o.dtype) for o in objs):
+        return cudf.core.column.categorical.CategoricalColumn._concat(
+            cast(
+                MutableSequence[
+                    cudf.core.column.categorical.CategoricalColumn
+                ],
+                objs,
+            )
+        )
+
+    newsize = sum(map(len, objs))
+    if newsize > libcudf.MAX_COLUMN_SIZE:
+        raise MemoryError(
+            f"Result of concat cannot have "
+            f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+        )
+    elif newsize == 0:
+        col = column_empty(0, head.dtype, masked=True)
+    else:
+        # Filter out inputs that have 0 length, then concatenate.
+        objs = [o for o in objs if len(o)]
+        col = libcudf.concat.concat_columns(objs)
+    return col
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 14c82b5ff45..b96a49c2514 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import pandas as pd
-from nvtx import annotate
 
 import cudf
 from cudf import _lib as libcudf
@@ -307,7 +306,7 @@ def binary_operator(
     ) -> ColumnBase:
         if isinstance(rhs, cudf.DateOffset):
             return rhs._datetime_binop(self, op, reflect=reflect)
-        lhs, rhs = self, rhs
+        lhs: Union[ScalarLike, ColumnBase] = self
         if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"):
             out_dtype = np.dtype(np.bool_)  # type: Dtype
         elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
@@ -332,7 +331,10 @@ def binary_operator(
                 f"Series of dtype {self.dtype} cannot perform "
                 f" the operation {op}"
             )
-        return binop(lhs, rhs, op=op, out_dtype=out_dtype, reflect=reflect)
+
+        if reflect:
+            lhs, rhs = rhs, lhs
+        return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     def fillna(
         self, fill_value: Any = None, method: str = None, dtype: Dtype = None
@@ -422,20 +424,6 @@ def _make_copy_with_na_as_null(self):
         return out_col
 
 
-@annotate("BINARY_OP", color="orange", domain="cudf_python")
-def binop(
-    lhs: Union[ColumnBase, ScalarLike],
-    rhs: Union[ColumnBase, ScalarLike],
-    op: str,
-    out_dtype: Dtype,
-    reflect: bool,
-) -> ColumnBase:
-    if reflect:
-        lhs, rhs = rhs, lhs
-    out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
-    return out
-
-
 def binop_offset(lhs, rhs, op):
     if rhs._is_no_op:
         return lhs
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index d710129900a..39bbf10c235 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -11,7 +11,6 @@
 import numpy as np
 import pandas as pd
 from numba import cuda, njit
-from nvtx import annotate
 from pandas.api.types import is_integer_dtype
 
 import cudf
@@ -151,7 +150,7 @@ def binary_operator(
             msg = "{!r} operator not supported between {} and {}"
             raise TypeError(msg.format(binop, type(self), type(rhs)))
         if isinstance(rhs, cudf.core.column.DecimalColumn):
-            lhs = self.as_decimal_column(
+            lhs: Union[ScalarLike, ColumnBase] = self.as_decimal_column(
                 Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0)
            )
             return lhs.binary_operator(binop, rhs)
@@ -163,9 +162,11 @@ def binary_operator(
             or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
         ):
             out_dtype = np.dtype("float64")
-        return _numeric_column_binop(
-            lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect
-        )
+
+        if binop in {"lt", "gt", "le", "ge", "eq", "ne", "NULL_EQUALS"}:
+            out_dtype = "bool"
+        lhs, rhs = (self, rhs) if not reflect else (rhs, self)
+        return libcudf.binaryop.binaryop(lhs, rhs, binop, out_dtype)
 
     def _apply_scan_op(self, op: str) -> ColumnBase:
         return libcudf.reduce.scan(op, self, True)
@@ -761,35 +762,6 @@ def to_pandas(
         return pd_series
 
 
-@annotate("BINARY_OP", color="orange", domain="cudf_python")
-def _numeric_column_binop(
-    lhs: Union[ColumnBase, ScalarLike],
-    rhs: Union[ColumnBase, ScalarLike],
-    op: str,
-    out_dtype: Dtype,
-    reflect: bool = False,
-) -> ColumnBase:
-    if reflect:
-        lhs, rhs = rhs, lhs
-
-    is_op_comparison = op in [
-        "lt",
-        "gt",
-        "le",
-        "ge",
-        "eq",
-        "ne",
-        "NULL_EQUALS",
-    ]
-
-    if is_op_comparison:
-        out_dtype = "bool"
-
-    out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
-
-    return out
-
-
 def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase:
     if callable(op):
         return libcudf.transform.transform(operand, op)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 84dcad516df..940b38ef5ff 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -12,7 +12,6 @@
 import pandas as pd
 import pyarrow as pa
 from numba import cuda
-from nvtx import annotate
 
 import cudf
 from cudf import _lib as libcudf
@@ -5034,7 +5033,7 @@ def sum(
         result_col = self._process_for_reduction(
             skipna=skipna, min_count=min_count
         )
-        if isinstance(result_col, cudf.core.column.ColumnBase):
+        if isinstance(result_col, type(self)):
             return result_col.str().cat()
         else:
             return result_col
@@ -5061,13 +5060,6 @@ def __contains__(self, item: ScalarLike) -> bool:
     def str(self, parent: ParentType = None) -> StringMethods:
         return StringMethods(self, parent=parent)
 
-    @property
-    def _nbytes(self) -> int:
-        if self.size == 0:
-            return 0
-        else:
-            return self.children[1].size
-
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":
@@ -5184,7 +5176,7 @@ def to_array(self, fillna: bool = None) -> np.ndarray:
         return self.to_arrow().to_pandas().values
 
     def to_pandas(
-        self, index: ColumnLike = None, nullable: bool = False, **kwargs
+        self, index: pd.Index = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":
         if nullable:
             pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
@@ -5197,7 +5189,7 @@ def to_pandas(
         return pd_series
 
     def serialize(self) -> Tuple[dict, list]:
-        header = {"null_count": self.null_count}  # type: Dict[Any, Any]
+        header: Dict[Any, Any] = {"null_count": self.null_count}
         header["type-serialized"] = pickle.dumps(type(self))
         header["size"] = self.size
@@ -5359,7 +5351,9 @@ def binary_operator(
         if op == "add":
             return cast("column.ColumnBase", lhs.str().cat(others=rhs))
         elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"):
-            return _string_column_binop(self, rhs, op=op, out_dtype="bool")
+            return libcudf.binaryop.binaryop(
+                lhs=self, rhs=rhs, op=op, dtype="bool"
+            )
 
         raise TypeError(
             f"{op} operator not supported between {type(self)} and {type(rhs)}"
@@ -5392,17 +5386,6 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
         return to_view.view(dtype)
 
 
-@annotate("BINARY_OP", color="orange", domain="cudf_python")
-def _string_column_binop(
-    lhs: "column.ColumnBase",
-    rhs: "column.ColumnBase",
-    op: str,
-    out_dtype: Dtype,
-) -> "column.ColumnBase":
-    out = libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype)
-    return out
-
-
 def _get_cols_list(parent_obj, others):
 
     parent_index = (
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index d8ad11f41b3..b202838662c 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -9,7 +9,6 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from nvtx import annotate
 
 import cudf
 from cudf import _lib as libcudf
@@ -247,7 +246,7 @@ def binary_operator(
 
         if reflect:
             lhs, rhs = rhs, lhs  # type: ignore
-        return binop(lhs, rhs, op=op, out_dtype=out_dtype)
+        return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     def normalize_binop_value(self, other) -> BinaryOperand:
         if isinstance(other, cudf.Scalar):
@@ -575,17 +574,6 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
         )
 
 
-@annotate("BINARY_OP", color="orange", domain="cudf_python")
-def binop(
-    lhs: "column.ColumnBase",
-    rhs: "column.ColumnBase",
-    op: str,
-    out_dtype: DtypeObj,
-) -> "cudf.core.column.ColumnBase":
-    out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
-    return out
-
-
 def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype:
     if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)):
         return rhs_dtype
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 2846dc241db..1168ad015f1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -27,6 +27,7 @@
     arange,
     column,
 )
+from cudf.core.column.column import _concat_columns
 from cudf.core.column.string import StringMethods as StringMethods
 from cudf.core.dtypes import IntervalDtype
 from cudf.core.frame import SingleColumnFrame
@@ -649,7 +650,7 @@ def sum(self):
 
     @classmethod
     def _concat(cls, objs):
-        data = ColumnBase._concat([o._values for o in objs])
+        data = _concat_columns([o._values for o in objs])
         names = {obj.name for obj in objs}
         if len(names) == 1:
             [name] = names
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index ca029198e52..6b551dc72c3 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1386,11 +1386,7 @@ def is_monotonic_increasing(self):
         Return if the index is monotonic increasing (only equal or
         increasing) values.
         """
-        if not hasattr(self, "_is_monotonic_increasing"):
-            self._is_monotonic_increasing = self._is_sorted(
-                ascending=None, null_position=None
-            )
-        return self._is_monotonic_increasing
+        return self._is_sorted(ascending=None, null_position=None)
 
     @property
     def is_monotonic_decreasing(self):
@@ -1398,11 +1394,9 @@ def is_monotonic_decreasing(self):
         Return if the index is monotonic decreasing (only equal or
         decreasing) values.
""" - if not hasattr(self, "_is_monotonic_decreasing"): - self._is_monotonic_decreasing = self._is_sorted( - ascending=[False] * len(self.levels), null_position=None - ) - return self._is_monotonic_decreasing + return self._is_sorted( + ascending=[False] * len(self.levels), null_position=None + ) def argsort(self, ascending=True, **kwargs): indices = self._source_data.argsort(ascending=ascending, **kwargs) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7b1e6454394..aa99ac6c961 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -21,7 +21,6 @@ from cudf._lib.transform import bools_to_mask from cudf.core.abc import Serializable from cudf.core.column import ( - ColumnBase, DatetimeColumn, TimeDeltaColumn, arange, @@ -33,6 +32,7 @@ from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, ) +from cudf.core.column.column import _concat_columns from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods @@ -2407,7 +2407,7 @@ def _concat(cls, objs, axis=0, index=True): else: objs = numeric_normalize_types(*objs) - col = ColumnBase._concat([o._column for o in objs]) + col = _concat_columns([o._column for o in objs]) if isinstance(col, cudf.core.column.DecimalColumn): col = objs[0]._column._copy_type_metadata(col) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index c6a64824a86..054bfa15f9b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1318,7 +1318,6 @@ def test_string_no_children_properties(): assert empty_col.children == () assert empty_col.size == 0 - assert empty_col._nbytes == 0 assert getsizeof(empty_col) >= 0 # Accounts for Python GC overhead