diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1fe59d3dfd6..b2f3274faab 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -44,6 +44,11 @@ def _values(self) -> ColumnBase: def copy(self, deep: bool = True) -> BaseIndex: raise NotImplementedError + @property + def size(self): + # The size of an index is always its length irrespective of dimension. + return len(self) + @property def values(self): return self._values.values @@ -162,6 +167,38 @@ def _clean_nulls_from_index(self): else: return self + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. + + Returns + ------- + bool + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + """ + raise NotImplementedError + + @property + def is_monotonic_decreasing(self): + """Return boolean if values in the object are monotonically decreasing. + + Returns + ------- + bool + """ + raise NotImplementedError + @property def nlevels(self): """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c14cbd11714..c59081e4b59 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -352,7 +352,9 @@ def cat(self, others=None, sep=None, na_rep=None): if len(data) == 1 and data.null_count == 1: data = [""] - out = self._return_or_inplace(data) + # We only want to keep the index if we are adding something to each + # row, not if we are joining all the rows into a single string. + out = self._return_or_inplace(data, retain_index=others is not None) if len(out) == 1 and others is None: if isinstance(out, cudf.Series): out = out.iloc[0] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b8fe4fcaff6..a0811f33351 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,12 +594,6 @@ def dtypes(self): data=[x.dtype for x in self._data.columns], index=self._data.names, ) - @property - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame. - """ - return self._num_rows, self._num_columns - @property def ndim(self): """Dimension of the data. DataFrame ndim is always 2. @@ -938,12 +932,6 @@ def memory_usage(self, index=True, deep=False): sizes.append(self.index.memory_usage(deep=deep)) return Series(sizes, index=ind) - def __len__(self): - """ - Returns the number of rows - """ - return len(self.index) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): import cudf diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 64b96458218..0809e14a8a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -166,6 +166,11 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + @property def _is_homogeneous(self): # make sure that the dataframe has columns @@ -4458,6 +4463,12 @@ def to_string(self): def __str__(self): return self.to_string() + def __deepcopy__(self, memo): + return self.copy(deep=True) + + def __copy__(self): + return self.copy(deep=False) + def head(self, n=5): """ Return the first `n` rows. @@ -4726,9 +4737,6 @@ def __iter__(self): """ cudf.utils.utils.raise_iteration_error(obj=self) - def __len__(self): - return len(self._column) - def __bool__(self): raise TypeError( f"The truth value of a {type(self)} is ambiguous. Use " @@ -4916,7 +4924,7 @@ def is_unique(self): @property def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. This property is an alias for :attr:`is_monotonic_increasing`. @@ -4928,7 +4936,7 @@ def is_monotonic(self): @property def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. Returns ------- @@ -4938,7 +4946,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonic_decreasing. + """Return boolean if values in the object are monotonically decreasing. Returns ------- diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3ac30143463..6414d4a7e84 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -349,17 +349,6 @@ def dtype(self): """ return cudf.dtype(np.int64) - @property - def is_contiguous(self): - """ - Returns if the index is contiguous. - """ - return self._step == 1 - - @property - def size(self): - return len(self) - def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -417,18 +406,10 @@ def is_unique(self): @property def is_monotonic_increasing(self): - """ - Return if the index is monotonic increasing - (only equal or increasing) values. - """ return self._step > 0 or len(self) <= 1 @property def is_monotonic_decreasing(self): - """ - Return if the index is monotonic decreasing - (only equal or decreasing) values. - """ return self._step < 0 or len(self) <= 1 def get_slice_bound(self, label, side, kind=None): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 84566b4627c..3bf5f70be39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,6 +7,7 @@ import pickle import warnings from collections.abc import Sequence +from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -17,6 +18,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.types import is_integer, is_list_like from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame @@ -33,8 +35,6 @@ class MultiIndex(Frame, BaseIndex): ---------- levels : sequence of arrays The unique labels for each level. - labels : sequence of arrays - labels is depreciated, please use levels codes: sequence of arrays Integers for each level designating which label at each location. sortorder : optional int @@ -68,7 +68,6 @@ def __init__( levels=None, codes=None, sortorder=None, - labels=None, names=None, dtype=None, copy=False, @@ -78,13 +77,16 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") - if name is not None: raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - - super().__init__() + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame) and not isinstance( + codes[0], (Sequence, np.ndarray) + ): + raise TypeError("Codes is not a Sequence of sequences") if copy: if isinstance(codes, cudf.DataFrame): @@ -92,58 +94,57 @@ def __init__( if len(levels) > 0 and isinstance(levels[0], cudf.Series): levels = [level.copy(deep=True) for level in levels] - self._name = None - - if labels: - warnings.warn( - "the 'labels' keyword is deprecated, use 'codes' " "instead", - FutureWarning, - ) - if labels and not codes: - codes = labels - - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame): + if len(levels) == len(codes): + codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) + else: + raise ValueError( + "MultiIndex has unequal number of levels and " + "codes and is inconsistent!" + ) - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") + levels = [cudf.Series(level) for level in levels] - if isinstance(codes, cudf.DataFrame): - self._codes = codes - elif len(levels) == len(codes): - self._codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: + if len(levels) != len(codes.columns): raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" ) + if len(set(c.size for c in codes._data.columns)) != 1: + raise ValueError( + "MultiIndex length of codes does not match " + "and is inconsistent!" + ) + for level, code in zip(levels, codes._data.columns): + if code.max() > len(level) - 1: + raise ValueError( + "MultiIndex code %d contains value %d larger " + "than maximum level size at this position" + ) - self._levels = [cudf.Series(level) for level in levels] - self._validate_levels_and_codes(self._levels, self._codes) - - source_data = cudf.DataFrame() - for i, n in enumerate(self._codes.columns): - codes = as_index(self._codes[n]._column) - if -1 in self._codes[n].values: + source_data = {} + for i, (column_name, col) in enumerate(codes._data.items()): + if -1 in col.values: level = cudf.DataFrame( - {n: [None] + list(self._levels[i])}, - index=range(-1, len(self._levels[i])), + {column_name: [None] + list(levels[i])}, + index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({n: self._levels[i]}) + level = cudf.DataFrame({column_name: levels[i]}) - source_data[n] = libcudf.copying.gather( - level, codes._data.columns[0] - )[0][n] + source_data[column_name] = libcudf.copying.gather(level, col)[0][ + column_name + ] - self._data = source_data._data + super().__init__(source_data) + self._levels = levels + self._codes = codes + self._name = None self.names = names @property @@ -153,7 +154,6 @@ def names(self): @names.setter def names(self, value): value = [None] * self.nlevels if value is None else value - assert len(value) == self.nlevels if len(value) == len(set(value)): # IMPORTANT: if the provided names are unique, @@ -216,25 +216,20 @@ def rename(self, names, inplace=False): return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): - if ( - level is not None - and not cudf.api.types.is_list_like(level) - and cudf.api.types.is_list_like(names) - ): + names_is_list_like = is_list_like(names) + level_is_list_like = is_list_like(level) + + if level is not None and not level_is_list_like and names_is_list_like: raise TypeError( "Names must be a string when a single level is provided." ) - if ( - not cudf.api.types.is_list_like(names) - and level is None - and self.nlevels > 1 - ): + if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") - if not cudf.api.types.is_list_like(names): + if not names_is_list_like: names = [names] - if level is not None and not cudf.api.types.is_list_like(level): + if level is not None and not level_is_list_like: level = [level] if level is not None and len(names) != len(level): @@ -269,10 +264,6 @@ def _from_data( obj.name = name return obj - @property - def shape(self): - return (self._data.nrows, len(self._data.names)) - @property def name(self): return self._name @@ -281,26 +272,6 @@ def name(self): def name(self, value): self._name = value - def _validate_levels_and_codes(self, levels, codes): - if len(levels) != len(codes.columns): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - code_length = len(codes[codes.columns[0]]) - for index, code in enumerate(codes): - if code_length != len(codes[code]): - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" - ) - for index, code in enumerate(codes): - if codes[code].max() > len(levels[index]) - 1: - raise ValueError( - "MultiIndex code %d contains value %d larger " - "than maximum level size at this position" - ) - def copy( self, names=None, @@ -396,36 +367,9 @@ def copy( return mi - def deepcopy(self): - return self.copy(deep=True) - - def __copy__(self): - return self.copy(deep=True) - def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. - - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. - """ cudf.utils.utils.raise_iteration_error(obj=self) - def _popn(self, n): - """ Returns a copy of this index without the left-most n values. - - Removes n names, labels, and codes in order to build a new index - for results. - """ - result = MultiIndex( - levels=self.levels[n:], - codes=self.codes.iloc[:, n:], - names=self.names[n:], - ) - if self.names is not None: - result.names = self.names[n:] - return result - def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -534,9 +478,7 @@ def codes(self): @property def nlevels(self): - """ - Integer number of levels in this MultiIndex. - """ + """Integer number of levels in this MultiIndex.""" return len(self._data) @property @@ -576,23 +518,13 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property - def labels(self): - warnings.warn( - "This feature is deprecated in pandas and will be" - "dropped from cudf as well.", - FutureWarning, - ) - return self.codes - @property def ndim(self): - """Dimension of the data. For MultiIndex ndim is always 2. - """ + """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 def _get_level_label(self, level): - """ Get name of the level. + """Get name of the level. Parameters ---------- @@ -658,8 +590,6 @@ def isin(self, values, level=None): >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) """ - from cudf.api.types import is_list_like - if level is None: if isinstance(values, cudf.MultiIndex): values_idx = values @@ -708,11 +638,6 @@ def isin(self, values, level=None): return result - def mask(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".mask is not supported for MultiIndex operations" - ) - def where(self, cond, other=None, inplace=False): raise NotImplementedError( ".where is not supported for MultiIndex operations" @@ -795,9 +720,7 @@ def _index_and_downcast(self, result, index, index_key): ) or isinstance(index_key[0], slice): index_key = index_key[0] - slice_access = False - if isinstance(index_key, slice): - slice_access = True + slice_access = isinstance(index_key, slice) out_index = cudf.DataFrame() # Select the last n-k columns where n is the number of columns and k is # the length of the indexing tuple @@ -805,30 +728,24 @@ def _index_and_downcast(self, result, index, index_key): if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) for k in range(size, len(index._data)): - if index.names is None: - name = k - else: - name = index.names[k] out_index.insert( - len(out_index.columns), - name, + out_index._num_columns, + k if index.names is None else index.names[k], cudf.Series._from_data({None: index._data.columns[k]}), ) - if len(result) == 1 and size == 0 and slice_access is False: + if len(result) == 1 and size == 0 and not slice_access: # If the final result is one row and it was not mapped into # directly, return a Series with a tuple as name. result = result.T result = result[result._data.names[0]] - elif len(result) == 0 and slice_access is False: + elif len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column - series_name = [] - for col in index._data.columns: - series_name.append(col[0]) - result = cudf.Series([]) - result.name = tuple(series_name) - elif len(out_index.columns) == 1: + result = cudf.Series._from_data( + {}, name=tuple((col[0] for col in index._data.columns)) + ) + elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. @@ -836,11 +753,18 @@ def _index_and_downcast(self, result, index, index_key): out_index = as_index(last_column) out_index.name = index.names[-1] index = out_index - elif len(out_index.columns) > 1: + elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) - index = index._popn(size) + if index.names is not None: + result.names = index.names[size:] + index = MultiIndex( + levels=index.levels[size:], + codes=index.codes.iloc[:, size:], + names=index.names[size:], + ) + if isinstance(index_key, tuple): result = result.set_index(index) return result @@ -896,24 +820,6 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - def _split_tuples(self, tuples): - if len(tuples) == 1: - return tuples, slice(None) - elif isinstance(tuples[0], tuple): - row = tuples[0] - if len(tuples) == 1: - column = slice(None) - else: - column = tuples[1] - return row, column - elif isinstance(tuples[0], slice): - return tuples - else: - return tuples, slice(None) - - def __len__(self): - return self._data.nrows - def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -924,24 +830,16 @@ def __eq__(self, other): return self.names == other.names return NotImplemented - @property - def is_contiguous(self): - return True - @property def size(self): - return len(self) + # The size of a MultiIndex is only dependent on the number of rows. + return self._num_rows def take(self, indices): - from collections.abc import Sequence - from numbers import Integral - if isinstance(indices, (Integral, Sequence)): indices = np.array(indices) - elif isinstance(indices, cudf.Series): - if indices.has_nulls: - raise ValueError("Column must have no nulls.") - indices = indices + elif isinstance(indices, cudf.Series) and indices.has_nulls: + raise ValueError("Column must have no nulls.") elif isinstance(indices, slice): start, stop, step = indices.indices(len(self)) indices = column.arange(start, stop, step) @@ -983,15 +881,11 @@ def deserialize(cls, header, frames): return cls._from_data(dict(zip(names, columns))) def __getitem__(self, index): - match = self.take(index) - if isinstance(index, slice): - return match if isinstance(index, int): # we are indexing into a single row of the MultiIndex, # return that row as a tuple: - return match.to_pandas()[0] - else: - return match + return self.take(index).to_pandas()[0] + return self.take(index) def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -1003,7 +897,7 @@ def to_frame(self, index=True, name=None): if name is not None: if len(name) != len(self.levels): raise ValueError( - "'name' should have th same length as " + "'name' should have the same length as " "number of levels on index." ) df.columns = name @@ -1095,8 +989,7 @@ def from_tuples(cls, tuples, names=None): """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) - result = cls.from_pandas(pdi) - return result + return cls.from_pandas(pdi) @property def values_host(self): @@ -1426,18 +1319,6 @@ def from_pandas(cls, multiindex, nan_as_null=None): def is_unique(self): return len(self) == len(self.unique()) - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """ @@ -1539,13 +1420,9 @@ def memory_usage(self, deep=False): return n def difference(self, other, sort=None): - temp_self = self - temp_other = other - if hasattr(self, "to_pandas"): - temp_self = self.to_pandas() if hasattr(other, "to_pandas"): - temp_other = self.to_pandas() - return temp_self.difference(temp_other, sort) + other = other.to_pandas() + return self.to_pandas().difference(other, sort) def append(self, other): """ @@ -1609,12 +1486,6 @@ def append(self, other): return MultiIndex._concat(to_concat) - def nan_to_num(*args, **kwargs): - return args[0] - - def array_equal(*args, **kwargs): - return args[0] == args[1] - def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1650,8 +1521,8 @@ def _level_index_from_level(self, level): try: return self.names.index(level) except ValueError: - if not pd.api.types.is_integer(level): - raise KeyError(f"Level {level} not found") from None + if not is_integer(level): + raise KeyError(f"Level {level} not found") if level < 0: level += self.nlevels if level >= self.nlevels: @@ -1661,9 +1532,6 @@ def _level_index_from_level(self, level): ) from None return level - def _level_name_from_level(self, level): - return self.names[self._level_index_from_level(level)] - def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 465cf36e1f3..40bbdc4a865 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1553,3 +1553,20 @@ def test_multiIndex_duplicate_names(): ) assert_eq(gi, pi) + + +def test_difference(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + midx2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], + names=["x", "y"], + ) + + expected = midx2.to_pandas().difference(midx.to_pandas()) + actual = midx2.difference(midx) + assert_eq(expected, actual)