From 57de0041cd0ef671090f7cd4860ed1daa20b11a1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 10:02:05 -0700 Subject: [PATCH 01/19] Remove some unused functions and standardize others in Frame. --- python/cudf/cudf/core/_base_index.py | 37 +++++++++++++++++ python/cudf/cudf/core/dataframe.py | 12 ------ python/cudf/cudf/core/frame.py | 26 ++++++------ python/cudf/cudf/core/index.py | 19 --------- python/cudf/cudf/core/multiindex.py | 60 ---------------------------- 5 files changed, 51 insertions(+), 103 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1fe59d3dfd6..b2f3274faab 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -44,6 +44,11 @@ def _values(self) -> ColumnBase: def copy(self, deep: bool = True) -> BaseIndex: raise NotImplementedError + @property + def size(self): + # The size of an index is always its length irrespective of dimension. + return len(self) + @property def values(self): return self._values.values @@ -162,6 +167,38 @@ def _clean_nulls_from_index(self): else: return self + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. + + Returns + ------- + bool + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + """ + raise NotImplementedError + + @property + def is_monotonic_decreasing(self): + """Return boolean if values in the object are monotonically decreasing. + + Returns + ------- + bool + """ + raise NotImplementedError + @property def nlevels(self): """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b8fe4fcaff6..a0811f33351 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,12 +594,6 @@ def dtypes(self): data=[x.dtype for x in self._data.columns], index=self._data.names, ) - @property - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame. - """ - return self._num_rows, self._num_columns - @property def ndim(self): """Dimension of the data. DataFrame ndim is always 2. @@ -938,12 +932,6 @@ def memory_usage(self, index=True, deep=False): sizes.append(self.index.memory_usage(deep=deep)) return Series(sizes, index=ind) - def __len__(self): - """ - Returns the number of rows - """ - return len(self.index) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): import cudf diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 64b96458218..a1efe764f24 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -166,6 +166,11 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + @property def _is_homogeneous(self): # make sure that the dataframe has columns @@ -4458,6 +4463,12 @@ def to_string(self): def __str__(self): return self.to_string() + def __deepcopy__(self): + return self.copy(deep=True) + + def __copy__(self): + return self.copy(deep=False) + def head(self, n=5): """ Return the first `n` rows. @@ -4718,17 +4729,8 @@ def shape(self): return (len(self),) def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. - - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. - """ cudf.utils.utils.raise_iteration_error(obj=self) - def __len__(self): - return len(self._column) - def __bool__(self): raise TypeError( f"The truth value of a {type(self)} is ambiguous. Use " @@ -4916,7 +4918,7 @@ def is_unique(self): @property def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. This property is an alias for :attr:`is_monotonic_increasing`. @@ -4928,7 +4930,7 @@ def is_monotonic(self): @property def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. Returns ------- @@ -4938,7 +4940,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonic_decreasing. + """Return boolean if values in the object are monotonically decreasing. Returns ------- diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3ac30143463..6414d4a7e84 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -349,17 +349,6 @@ def dtype(self): """ return cudf.dtype(np.int64) - @property - def is_contiguous(self): - """ - Returns if the index is contiguous. - """ - return self._step == 1 - - @property - def size(self): - return len(self) - def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -417,18 +406,10 @@ def is_unique(self): @property def is_monotonic_increasing(self): - """ - Return if the index is monotonic increasing - (only equal or increasing) values. - """ return self._step > 0 or len(self) <= 1 @property def is_monotonic_decreasing(self): - """ - Return if the index is monotonic decreasing - (only equal or decreasing) values. - """ return self._step < 0 or len(self) <= 1 def get_slice_bound(self, label, side, kind=None): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 84566b4627c..cb661295123 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -269,10 +269,6 @@ def _from_data( obj.name = name return obj - @property - def shape(self): - return (self._data.nrows, len(self._data.names)) - @property def name(self): return self._name @@ -396,19 +392,7 @@ def copy( return mi - def deepcopy(self): - return self.copy(deep=True) - - def __copy__(self): - return self.copy(deep=True) - def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. - - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. - """ cudf.utils.utils.raise_iteration_error(obj=self) def _popn(self, n): @@ -896,24 +880,6 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - def _split_tuples(self, tuples): - if len(tuples) == 1: - return tuples, slice(None) - elif isinstance(tuples[0], tuple): - row = tuples[0] - if len(tuples) == 1: - column = slice(None) - else: - column = tuples[1] - return row, column - elif isinstance(tuples[0], slice): - return tuples - else: - return tuples, slice(None) - - def __len__(self): - return self._data.nrows - def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -924,14 +890,6 @@ def __eq__(self, other): return self.names == other.names return NotImplemented - @property - def is_contiguous(self): - return True - - @property - def size(self): - return len(self) - def take(self, indices): from collections.abc import Sequence from numbers import Integral @@ -1426,18 +1384,6 @@ def from_pandas(cls, multiindex, nan_as_null=None): def is_unique(self): return len(self) == len(self.unique()) - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """ @@ -1609,12 +1555,6 @@ def append(self, other): return MultiIndex._concat(to_concat) - def nan_to_num(*args, **kwargs): - return args[0] - - def array_equal(*args, **kwargs): - return args[0] == args[1] - def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex From 4df6abbd7762328d3ed19646d3a9e7d92ef542b3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 10:43:05 -0700 Subject: [PATCH 02/19] Inline _popn and delete unused _level_name_from_level. --- python/cudf/cudf/core/multiindex.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index cb661295123..bbcfef49d80 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -395,21 +395,6 @@ def copy( def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) - def _popn(self, n): - """ Returns a copy of this index without the left-most n values. - - Removes n names, labels, and codes in order to build a new index - for results. - """ - result = MultiIndex( - levels=self.levels[n:], - codes=self.codes.iloc[:, n:], - names=self.names[n:], - ) - if self.names is not None: - result.names = self.names[n:] - return result - def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -824,7 +809,14 @@ def _index_and_downcast(self, result, index, index_key): # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) - index = index._popn(size) + if index.names is not None: + result.names = index.names[size:] + index = MultiIndex( + levels=index.levels[size:], + codes=index.codes.iloc[:, size:], + names=index.names[size:], + ) + if isinstance(index_key, tuple): result = result.set_index(index) return result @@ -1601,9 +1593,6 @@ def _level_index_from_level(self, level): ) from None return level - def _level_name_from_level(self, level): - return self.names[self._level_index_from_level(level)] - def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. From cab286f3188d05caf7f55482c3ac596ae199cb2a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 10:47:21 -0700 Subject: [PATCH 03/19] Rewrite difference and fix bug. --- python/cudf/cudf/core/multiindex.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bbcfef49d80..c155d54a019 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1477,13 +1477,9 @@ def memory_usage(self, deep=False): return n def difference(self, other, sort=None): - temp_self = self - temp_other = other - if hasattr(self, "to_pandas"): - temp_self = self.to_pandas() if hasattr(other, "to_pandas"): - temp_other = self.to_pandas() - return temp_self.difference(temp_other, sort) + other = other.to_pandas() + return self.to_pandas().difference(other, sort) def append(self, other): """ From 59a15727c056a53d709f80dc0fe31d03eec61f3a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 10:55:25 -0700 Subject: [PATCH 04/19] Simplify getitem. --- python/cudf/cudf/core/multiindex.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c155d54a019..f511799402a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -933,15 +933,11 @@ def deserialize(cls, header, frames): return cls._from_data(dict(zip(names, columns))) def __getitem__(self, index): - match = self.take(index) - if isinstance(index, slice): - return match if isinstance(index, int): # we are indexing into a single row of the MultiIndex, # return that row as a tuple: - return match.to_pandas()[0] - else: - return match + return self.take(index).to_pandas()[0] + return self.take(index) def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -953,7 +949,7 @@ def to_frame(self, index=True, name=None): if name is not None: if len(name) != len(self.levels): raise ValueError( - "'name' should have th same length as " + "'name' should have the same length as " "number of levels on index." ) df.columns = name From beb2fcf9f95e92912f03ae9a70a2916a77c0c36a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 11:14:56 -0700 Subject: [PATCH 05/19] Some minor simplifications to the internals of _index_and_downcast. --- python/cudf/cudf/core/multiindex.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f511799402a..f8ebe8fe08d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -764,9 +764,7 @@ def _index_and_downcast(self, result, index, index_key): ) or isinstance(index_key[0], slice): index_key = index_key[0] - slice_access = False - if isinstance(index_key, slice): - slice_access = True + slice_access = isinstance(index_key, slice) out_index = cudf.DataFrame() # Select the last n-k columns where n is the number of columns and k is # the length of the indexing tuple @@ -774,30 +772,24 @@ def _index_and_downcast(self, result, index, index_key): if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) for k in range(size, len(index._data)): - if index.names is None: - name = k - else: - name = index.names[k] out_index.insert( - len(out_index.columns), - name, + out_index._num_columns, + k if index.names is None else index.names[k], cudf.Series._from_data({None: index._data.columns[k]}), ) - if len(result) == 1 and size == 0 and slice_access is False: + if len(result) == 1 and size == 0 and not slice_access: # If the final result is one row and it was not mapped into # directly, return a Series with a tuple as name. result = result.T result = result[result._data.names[0]] - elif len(result) == 0 and slice_access is False: + elif len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column - series_name = [] - for col in index._data.columns: - series_name.append(col[0]) - result = cudf.Series([]) - result.name = tuple(series_name) - elif len(out_index.columns) == 1: + result = cudf.Series._from_data( + {}, name=tuple((col[0] for col in index._data.columns)) + ) + elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. @@ -805,7 +797,7 @@ def _index_and_downcast(self, result, index, index_key): out_index = as_index(last_column) out_index.name = index.names[-1] index = out_index - elif len(out_index.columns) > 1: + elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) From 6c5d60e435b73824c84568b95808e661fbfeaa21 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 15:04:14 -0700 Subject: [PATCH 06/19] Add back accidentally removed MultiIndex.size computation. --- python/cudf/cudf/core/multiindex.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f8ebe8fe08d..e288bc12755 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -874,6 +874,11 @@ def __eq__(self, other): return self.names == other.names return NotImplemented + @property + def size(self): + # The size of a MultiIndex is only dependent on the number of rows. + return self._num_rows + def take(self, indices): from collections.abc import Sequence from numbers import Integral From 596d589f3e4496309dc5c65c383d102051ea9c69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 15:18:12 -0700 Subject: [PATCH 07/19] Remove long deprecated labels parameter to constructor. --- python/cudf/cudf/core/multiindex.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e288bc12755..ecbb78b8b58 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -33,8 +33,6 @@ class MultiIndex(Frame, BaseIndex): ---------- levels : sequence of arrays The unique labels for each level. - labels : sequence of arrays - labels is depreciated, please use levels codes: sequence of arrays Integers for each level designating which label at each location. sortorder : optional int @@ -68,7 +66,6 @@ def __init__( levels=None, codes=None, sortorder=None, - labels=None, names=None, dtype=None, copy=False, @@ -94,14 +91,6 @@ def __init__( self._name = None - if labels: - warnings.warn( - "the 'labels' keyword is deprecated, use 'codes' " "instead", - FutureWarning, - ) - if labels and not codes: - codes = labels - if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") @@ -545,15 +534,6 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property - def labels(self): - warnings.warn( - "This feature is deprecated in pandas and will be" - "dropped from cudf as well.", - FutureWarning, - ) - return self.codes - @property def ndim(self): """Dimension of the data. For MultiIndex ndim is always 2. From 8a1ac8ffa7f82611e4edddb87a4f9aade710aab4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 15:34:16 -0700 Subject: [PATCH 08/19] Rewrite constructor to use dict, import is_list_like at module level, and remove unnecessary mask method. --- python/cudf/cudf/core/multiindex.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ecbb78b8b58..0a3b1b95950 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,6 +17,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.types import is_list_like from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame @@ -81,8 +82,6 @@ def __init__( "Use `names`, `name` is not yet supported" ) - super().__init__() - if copy: if isinstance(codes, cudf.DataFrame): codes = codes.copy(deep=True) @@ -117,7 +116,7 @@ def __init__( self._levels = [cudf.Series(level) for level in levels] self._validate_levels_and_codes(self._levels, self._codes) - source_data = cudf.DataFrame() + source_data = {} for i, n in enumerate(self._codes.columns): codes = as_index(self._codes[n]._column) if -1 in self._codes[n].values: @@ -132,7 +131,7 @@ def __init__( level, codes._data.columns[0] )[0][n] - self._data = source_data._data + super().__init__(source_data) self.names = names @property @@ -207,23 +206,19 @@ def rename(self, names, inplace=False): def set_names(self, names, level=None, inplace=False): if ( level is not None - and not cudf.api.types.is_list_like(level) - and cudf.api.types.is_list_like(names) + and not is_list_like(level) + and is_list_like(names) ): raise TypeError( "Names must be a string when a single level is provided." ) - if ( - not cudf.api.types.is_list_like(names) - and level is None - and self.nlevels > 1 - ): + if not is_list_like(names) and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") - if not cudf.api.types.is_list_like(names): + if not is_list_like(names): names = [names] - if level is not None and not cudf.api.types.is_list_like(level): + if level is not None and not is_list_like(level): level = [level] if level is not None and len(names) != len(level): @@ -607,8 +602,6 @@ def isin(self, values, level=None): >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) """ - from cudf.api.types import is_list_like - if level is None: if isinstance(values, cudf.MultiIndex): values_idx = values @@ -657,11 +650,6 @@ def isin(self, values, level=None): return result - def mask(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".mask is not supported for MultiIndex operations" - ) - def where(self, cond, other=None, inplace=False): raise NotImplementedError( ".where is not supported for MultiIndex operations" From 81ae743c3c3263de3811b84a58b57a2cb86e4447 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 16:10:42 -0700 Subject: [PATCH 09/19] Various minor improvements. --- python/cudf/cudf/core/multiindex.py | 34 ++++++++++++----------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0a3b1b95950..7d24a1cd342 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,6 +7,7 @@ import pickle import warnings from collections.abc import Sequence +from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -17,7 +18,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries -from cudf.api.types import is_list_like +from cudf.api.types import is_integer, is_list_like from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame @@ -204,21 +205,20 @@ def rename(self, names, inplace=False): return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): - if ( - level is not None - and not is_list_like(level) - and is_list_like(names) - ): + names_is_list_like = is_list_like(names) + level_is_list_like = is_list_like(level) + + if level is not None and not level_is_list_like and names_is_list_like: raise TypeError( "Names must be a string when a single level is provided." ) - if not is_list_like(names) and level is None and self.nlevels > 1: + if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") - if not is_list_like(names): + if not names_is_list_like: names = [names] - if level is not None and not is_list_like(level): + if level is not None and not level_is_list_like: level = [level] if level is not None and len(names) != len(level): @@ -848,15 +848,10 @@ def size(self): return self._num_rows def take(self, indices): - from collections.abc import Sequence - from numbers import Integral - if isinstance(indices, (Integral, Sequence)): indices = np.array(indices) - elif isinstance(indices, cudf.Series): - if indices.has_nulls: - raise ValueError("Column must have no nulls.") - indices = indices + elif isinstance(indices, cudf.Series) and indices.has_nulls: + raise ValueError("Column must have no nulls.") elif isinstance(indices, slice): start, stop, step = indices.indices(len(self)) indices = column.arange(start, stop, step) @@ -1006,8 +1001,7 @@ def from_tuples(cls, tuples, names=None): """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) - result = cls.from_pandas(pdi) - return result + return cls.from_pandas(pdi) @property def values_host(self): @@ -1539,8 +1533,8 @@ def _level_index_from_level(self, level): try: return self.names.index(level) except ValueError: - if not pd.api.types.is_integer(level): - raise KeyError(f"Level {level} not found") from None + if not is_integer(level): + raise KeyError(f"Level {level} not found") if level < 0: level += self.nlevels if level >= self.nlevels: From 1cbf58f85c262b563b4d79ef720651d1e8ec90ef Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 16:49:13 -0700 Subject: [PATCH 10/19] Inline _validate_levels_and_codes and simplify constructor. --- python/cudf/cudf/core/multiindex.py | 80 ++++++++++++++--------------- 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7d24a1cd342..1b282a67901 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -77,32 +77,27 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") - if name is not None: raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - - if copy: - if isinstance(codes, cudf.DataFrame): - codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance(levels[0], cudf.Series): - levels = [level.copy(deep=True) for level in levels] - - self._name = None - if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") - if not isinstance(codes, cudf.DataFrame) and not isinstance( codes[0], (Sequence, np.ndarray) ): raise TypeError("Codes is not a Sequence of sequences") + if copy: + if isinstance(codes, cudf.DataFrame): + codes = codes.copy(deep=True) + if len(levels) > 0 and isinstance(levels[0], cudf.Series): + levels = [level.copy(deep=True) for level in levels] + if isinstance(codes, cudf.DataFrame): - self._codes = codes + codes = codes elif len(levels) == len(codes): - self._codes = cudf.DataFrame._from_data( + codes = cudf.DataFrame._from_data( { i: column.as_column(code).astype(np.int64) for i, code in enumerate(codes) @@ -114,25 +109,46 @@ def __init__( "codes and is inconsistent!" ) - self._levels = [cudf.Series(level) for level in levels] - self._validate_levels_and_codes(self._levels, self._codes) + levels = [cudf.Series(level) for level in levels] + + if len(levels) != len(codes.columns): + raise ValueError( + "MultiIndex has unequal number of levels and " + "codes and is inconsistent!" + ) + code_length = len(codes[codes.columns[0]]) + for index, code in enumerate(codes): + if code_length != len(codes[code]): + raise ValueError( + "MultiIndex length of codes does not match " + "and is inconsistent!" + ) + for index, code in enumerate(codes): + if codes[code].max() > len(levels[index]) - 1: + raise ValueError( + "MultiIndex code %d contains value %d larger " + "than maximum level size at this position" + ) source_data = {} - for i, n in enumerate(self._codes.columns): - codes = as_index(self._codes[n]._column) - if -1 in self._codes[n].values: + for i, n in enumerate(codes.columns): + tmp_codes = as_index(codes[n]._column) + if -1 in codes[n].values: level = cudf.DataFrame( - {n: [None] + list(self._levels[i])}, - index=range(-1, len(self._levels[i])), + {n: [None] + list(levels[i])}, + index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({n: self._levels[i]}) + level = cudf.DataFrame({n: levels[i]}) source_data[n] = libcudf.copying.gather( - level, codes._data.columns[0] + level, tmp_codes._data.columns[0] )[0][n] super().__init__(source_data) + self._levels = levels + self._codes = codes + self._name = None self.names = names @property @@ -261,26 +277,6 @@ def name(self): def name(self, value): self._name = value - def _validate_levels_and_codes(self, levels, codes): - if len(levels) != len(codes.columns): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - code_length = len(codes[codes.columns[0]]) - for index, code in enumerate(codes): - if code_length != len(codes[code]): - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" - ) - for index, code in enumerate(codes): - if codes[code].max() > len(levels[index]) - 1: - raise ValueError( - "MultiIndex code %d contains value %d larger " - "than maximum level size at this position" - ) - def copy( self, names=None, From 3c03aa3aa7fea6675792fcd963529dc1177057a6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 16:58:24 -0700 Subject: [PATCH 11/19] More constructor simplifications. --- python/cudf/cudf/core/multiindex.py | 40 +++++++++++++---------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1b282a67901..917ed234cfa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -94,20 +94,19 @@ def __init__( if len(levels) > 0 and isinstance(levels[0], cudf.Series): levels = [level.copy(deep=True) for level in levels] - if isinstance(codes, cudf.DataFrame): - codes = codes - elif len(levels) == len(codes): - codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) + if not isinstance(codes, cudf.DataFrame): + if len(levels) == len(codes): + codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) + else: + raise ValueError( + "MultiIndex has unequal number of levels and " + "codes and is inconsistent!" + ) levels = [cudf.Series(level) for level in levels] @@ -131,19 +130,16 @@ def __init__( ) source_data = {} - for i, n in enumerate(codes.columns): - tmp_codes = as_index(codes[n]._column) - if -1 in codes[n].values: + for i, (cname, col) in enumerate(codes._data.items()): + if -1 in col.values: level = cudf.DataFrame( - {n: [None] + list(levels[i])}, + {cname: [None] + list(levels[i])}, index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({n: levels[i]}) + level = cudf.DataFrame({cname: levels[i]}) - source_data[n] = libcudf.copying.gather( - level, tmp_codes._data.columns[0] - )[0][n] + source_data[cname] = libcudf.copying.gather(level, col)[0][cname] super().__init__(source_data) self._levels = levels From 3ce7a5d3e4931d16ce58de0cd5d58d1da3f88313 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 17:07:29 -0700 Subject: [PATCH 12/19] Simplify some input checking. --- python/cudf/cudf/core/multiindex.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 917ed234cfa..6e6df0e4301 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -115,15 +115,13 @@ def __init__( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" ) - code_length = len(codes[codes.columns[0]]) - for index, code in enumerate(codes): - if code_length != len(codes[code]): - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" - ) - for index, code in enumerate(codes): - if codes[code].max() > len(levels[index]) - 1: + if len(set(c.size for c in codes._data.columns)) != 1: + raise ValueError( + "MultiIndex length of codes does not match " + "and is inconsistent!" + ) + for level, code in zip(levels, codes._data.columns): + if code.max() > len(level) - 1: raise ValueError( "MultiIndex code %d contains value %d larger " "than maximum level size at this position" @@ -154,7 +152,6 @@ def names(self): @names.setter def names(self, value): value = [None] * self.nlevels if value is None else value - assert len(value) == self.nlevels if len(value) == len(set(value)): # IMPORTANT: if the provided names are unique, From 0c74e1e69ed5100e96e953c4fac90f39b15e686e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Sep 2021 17:14:05 -0700 Subject: [PATCH 13/19] Minor docstring improvements. --- python/cudf/cudf/core/multiindex.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6e6df0e4301..c1a31049e50 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -476,9 +476,7 @@ def codes(self): @property def nlevels(self): - """ - Integer number of levels in this MultiIndex. - """ + """Integer number of levels in this MultiIndex.""" return len(self._data) @property @@ -520,12 +518,11 @@ def levels(self): @property def ndim(self): - """Dimension of the data. For MultiIndex ndim is always 2. - """ + """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 def _get_level_label(self, level): - """ Get name of the level. + """Get name of the level. Parameters ---------- From a0657a9f127f901d6d9b4ba0d3a8de8bace385b3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Sep 2021 10:05:32 -0700 Subject: [PATCH 14/19] Fix signature of __deepcopy__. --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a1efe764f24..97e37ef56b6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4463,7 +4463,7 @@ def to_string(self): def __str__(self): return self.to_string() - def __deepcopy__(self): + def __deepcopy__(self, memo): return self.copy(deep=True) def __copy__(self): From bb88f6523ce4578a3c1258cd1b3876f42e59103c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Sep 2021 10:43:12 -0700 Subject: [PATCH 15/19] Fix behavior of string cat index retention. --- python/cudf/cudf/core/column/string.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c14cbd11714..c59081e4b59 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -352,7 +352,9 @@ def cat(self, others=None, sep=None, na_rep=None): if len(data) == 1 and data.null_count == 1: data = [""] - out = self._return_or_inplace(data) + # We only want to keep the index if we are adding something to each + # row, not if we are joining all the rows into a single string. + out = self._return_or_inplace(data, retain_index=others is not None) if len(out) == 1 and others is None: if isinstance(out, cudf.Series): out = out.iloc[0] From 84dee69d9874e39ebc20722f7f3cdaa4094e1182 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Sep 2021 10:43:45 -0700 Subject: [PATCH 16/19] Add back comment for iter. --- python/cudf/cudf/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 97e37ef56b6..0809e14a8a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4729,6 +4729,12 @@ def shape(self): return (len(self),) def __iter__(self): + """ + Iterating over a GPU object is not effecient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ cudf.utils.utils.raise_iteration_error(obj=self) def __bool__(self): From 1095d7d25cfaeac09604ae4027e5a1de271797c4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Sep 2021 12:21:38 -0700 Subject: [PATCH 17/19] Add tests of difference. --- python/cudf/cudf/tests/test_multiindex.py | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 465cf36e1f3..80f44065e9f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -5,7 +5,9 @@ """ import itertools import operator +import pickle import re +from io import BytesIO import cupy as cp import numpy as np @@ -1553,3 +1555,49 @@ def test_multiIndex_duplicate_names(): ) assert_eq(gi, pi) + + +@pytest.mark.parametrize( + "names", + [ + ["a", "b", "c"], + [None, None, None], + ["aa", "aa", "aa"], + ["bb", "aa", "aa"], + None, + ], +) +def test_pickle_roundtrip_multiIndex(names): + df = cudf.DataFrame( + { + "one": [1, 2, 3], + "two": [True, False, True], + "three": ["ab", "cd", "ef"], + "four": [0.2, 0.1, -10.2], + } + ) + expected_df = df.set_index(["one", "two", "three"]) + expected_df.index.names = names + local_file = BytesIO() + + pickle.dump(expected_df, local_file) + local_file.seek(0) + actual_df = pickle.load(local_file) + assert_eq(expected_df, actual_df) + + +def test_difference(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + midx2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], + names=["x", "y"], + ) + + expected = midx2.to_pandas().difference(midx.to_pandas()) + actual = midx2.difference(midx) + assert_eq(expected, actual) From be05fa1bb64b03797f703bd281b30ac5d0a2ce99 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Sep 2021 12:39:22 -0700 Subject: [PATCH 18/19] Address PR comments. --- python/cudf/cudf/core/multiindex.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c1a31049e50..3bf5f70be39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -128,16 +128,18 @@ def __init__( ) source_data = {} - for i, (cname, col) in enumerate(codes._data.items()): + for i, (column_name, col) in enumerate(codes._data.items()): if -1 in col.values: level = cudf.DataFrame( - {cname: [None] + list(levels[i])}, + {column_name: [None] + list(levels[i])}, index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({cname: levels[i]}) + level = cudf.DataFrame({column_name: levels[i]}) - source_data[cname] = libcudf.copying.gather(level, col)[0][cname] + source_data[column_name] = libcudf.copying.gather(level, col)[0][ + column_name + ] super().__init__(source_data) self._levels = levels From 7cdbe12d42ab35ec8baf93f523ed44169a6b72e2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Sep 2021 13:14:08 -0700 Subject: [PATCH 19/19] Remove accidentally added test from 21.10 merge that will conflict with version from 21.10 to prevent forward merge issues. --- python/cudf/cudf/tests/test_multiindex.py | 31 ----------------------- 1 file changed, 31 deletions(-) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 80f44065e9f..40bbdc4a865 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -5,9 +5,7 @@ """ import itertools import operator -import pickle import re -from io import BytesIO import cupy as cp import numpy as np @@ -1557,35 +1555,6 @@ def test_multiIndex_duplicate_names(): assert_eq(gi, pi) -@pytest.mark.parametrize( - "names", - [ - ["a", "b", "c"], - [None, None, None], - ["aa", "aa", "aa"], - ["bb", "aa", "aa"], - None, - ], -) -def test_pickle_roundtrip_multiIndex(names): - df = cudf.DataFrame( - { - "one": [1, 2, 3], - "two": [True, False, True], - "three": ["ab", "cd", "ef"], - "four": [0.2, 0.1, -10.2], - } - ) - expected_df = df.set_index(["one", "two", "three"]) - expected_df.index.names = names - local_file = BytesIO() - - pickle.dump(expected_df, local_file) - local_file.seek(0) - actual_df = pickle.load(local_file) - assert_eq(expected_df, actual_df) - - def test_difference(): midx = cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]],