From 698b654a0e77990bf4566c5ce9b43418b29f1048 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 28 Feb 2022 17:32:01 -0800 Subject: [PATCH 1/6] Unify and deprecate append. --- python/cudf/cudf/core/dataframe.py | 33 +++++++------------------- python/cudf/cudf/core/indexed_frame.py | 20 ++++++++++++++++ python/cudf/cudf/core/series.py | 14 +---------- 3 files changed, 29 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 287fd3796f4..d943df68045 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6126,19 +6126,10 @@ def append( 3 3 4 4 """ - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." - ) - if isinstance(other, dict): if not ignore_index: raise TypeError("Can only append a dict if ignore_index=True") other = DataFrame(other) - result = cudf.concat( - [self, other], ignore_index=ignore_index, sort=sort - ) - return result elif isinstance(other, Series): if other.name is None and not ignore_index: raise TypeError( @@ -6169,23 +6160,15 @@ def append( other = other.reindex(combined_columns, copy=False).to_frame().T if not current_cols.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list): - if not other: - pass - elif not isinstance(other[0], DataFrame): - other = DataFrame(other) - cols = self._data.to_pandas_index() - if ( - cols.get_indexer(other._data.to_pandas_index()) >= 0 - ).all(): - other = other.reindex(columns=cols) - - if is_list_like(other): - to_concat = [self, *other] - else: - to_concat = [self, other] + elif isinstance(other, list) and not isinstance(other[0], DataFrame): + other = DataFrame(other) + cols = self._data.to_pandas_index() + if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all(): + other = other.reindex(columns=cols) - return cudf.concat(to_concat, ignore_index=ignore_index, 
sort=sort) + return super(DataFrame, self)._append( + other, ignore_index, verify_integrity, sort + ) @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3ae0a838873..27a3e5b0746 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1778,6 +1778,26 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + def _append( + self, other, ignore_index=False, verify_integrity=False, sort=None + ): + warnings.warn( + "append is deprecated and will be removed in a future version. " + "Use concat instead.", + FutureWarning, + ) + if verify_integrity not in (None, False): + raise NotImplementedError( + "verify_integrity parameter is not supported yet." + ) + + if is_list_like(other): + to_concat = [self, *other] + else: + to_concat = [self, other] + + return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ec87fcdb066..8ae83ddd8c2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -33,7 +33,6 @@ is_integer_dtype, is_interval_dtype, is_list_dtype, - is_list_like, is_scalar, is_struct_dtype, ) @@ -785,18 +784,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): 5 6 dtype: int64 """ - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." 
- ) - - if is_list_like(to_append): - to_concat = [self] - to_concat.extend(to_append) - else: - to_concat = [self, to_append] - - return cudf.concat(to_concat, ignore_index=ignore_index) + return super()._append(to_append, ignore_index, verify_integrity) def reindex(self, index=None, copy=True): """Return a Series that conforms to a new index From 6f14341029f5821147f607c361b788da7fa3e79f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Mar 2022 14:39:01 -0800 Subject: [PATCH 2/6] Unify astype for Series and DataFrame. --- python/cudf/cudf/core/dataframe.py | 95 ---------------- python/cudf/cudf/core/indexed_frame.py | 150 +++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 113 ++----------------- 3 files changed, 157 insertions(+), 201 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d943df68045..facbd848c79 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1638,101 +1638,6 @@ def _concat( return out - def astype(self, dtype, copy=False, errors="raise", **kwargs): - """ - Cast the DataFrame to the given dtype - - Parameters - ---------- - - dtype : data type, or dict of column name -> data type - Use a numpy.dtype or Python type to cast entire DataFrame object to - the same type. Alternatively, use ``{col: dtype, ...}``, where col - is a column label and dtype is a numpy.dtype or Python type - to cast one or more of the DataFrame's columns to - column-specific types. - copy : bool, default False - Return a deep-copy when ``copy=True``. Note by default - ``copy=False`` setting is used and hence changes to - values then may propagate to other cudf objects. - errors : {'raise', 'ignore', 'warn'}, default 'raise' - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. 
- - ``warn`` : prints last exceptions as warnings and - return original object. - **kwargs : extra arguments to pass on to the constructor - - Returns - ------- - casted : DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) - >>> df - a b - 0 10 1 - 1 20 2 - 2 30 3 - >>> df.dtypes - a int64 - b int64 - dtype: object - - Cast all columns to `int32`: - - >>> df.astype('int32').dtypes - a int32 - b int32 - dtype: object - - Cast `a` to `float32` using a dictionary: - - >>> df.astype({'a': 'float32'}).dtypes - a float32 - b int64 - dtype: object - >>> df.astype({'a': 'float32'}) - a b - 0 10.0 1 - 1 20.0 2 - 2 30.0 3 - """ - result = DataFrame(index=self.index) - - if is_dict_like(dtype): - current_cols = self._data.names - if len(set(dtype.keys()) - set(current_cols)) > 0: - raise KeyError( - "Only a column name can be used for the " - "key in a dtype mappings argument." - ) - for col_name in current_cols: - if col_name in dtype: - result._data[col_name] = self._data[col_name].astype( - dtype=dtype[col_name], - errors=errors, - copy=copy, - **kwargs, - ) - else: - result._data[col_name] = ( - self._data[col_name].copy(deep=True) - if copy - else self._data[col_name] - ) - else: - for col in self._data: - result._data[col] = self._data[col].astype( - dtype=dtype, **kwargs - ) - - return result - def _clean_renderable_dataframe(self, output): """ This method takes in partial/preprocessed dataframe diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 27a3e5b0746..7bcc78219d0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -22,6 +22,8 @@ _is_non_decimal_numeric_dtype, is_bool_dtype, is_categorical_dtype, + is_dict_like, + is_dtype_equal, is_integer_dtype, is_list_like, ) @@ -1798,6 +1800,154 @@ def _append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + def astype(self, dtype, copy=False, 
errors="raise", **kwargs): + """Cast the object to the given dtype. + + Parameters + ---------- + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire DataFrame object to + the same type. Alternatively, use ``{col: dtype, ...}``, where col + is a column label and dtype is a numpy.dtype or Python type + to cast one or more of the DataFrame's columns to + column-specific types. + copy : bool, default False + Return a deep-copy when ``copy=True``. Note by default + ``copy=False`` setting is used and hence changes to + values then may propagate to other cudf objects. + errors : {'raise', 'ignore', 'warn'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original + object. + - ``warn`` : prints last exceptions as warnings and + return original object. + - **kwargs : extra arguments to pass on to the constructor + + Returns + ------- + DataFrame/Series + + Examples + -------- + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) + >>> df + a b + 0 10 1 + 1 20 2 + 2 30 3 + >>> df.dtypes + a int64 + b int64 + dtype: object + + Cast all columns to `int32`: + + >>> df.astype('int32').dtypes + a int32 + b int32 + dtype: object + + Cast `a` to `float32` using a dictionary: + + >>> df.astype({'a': 'float32'}).dtypes + a float32 + b int64 + dtype: object + >>> df.astype({'a': 'float32'}) + a b + 0 10.0 1 + 1 20.0 2 + 2 30.0 3 + + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2], dtype='int32') + >>> series + 0 1 + 1 2 + dtype: int32 + >>> series.astype('int64') + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> series.astype('category') + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 
1], ordered=True) + >>> series.astype(cat_dtype) + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using ``copy=False`` (enabled by default) + and changing data on a new Series will + propagate changes: + + >>> s1 = cudf.Series([1, 2]) + >>> s1 + 0 1 + 1 2 + dtype: int64 + >>> s2 = s1.astype('int64', copy=False) + >>> s2[0] = 10 + >>> s1 + 0 10 + 1 2 + dtype: int64 + """ + if errors not in ("ignore", "warn", "raise"): + raise ValueError("invalid error value specified") + elif errors == "warn": + warnings.warn( + "Specifying errors='warn' is deprecated and will be removed " + "in a future release.", + FutureWarning, + ) + + if not is_dict_like(dtype): + dtype = {cc: dtype for cc in self._data.names} + else: + if len(set(dtype.keys()) - set(self._data.names)) > 0: + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." + ) + + try: + result = {} + for col_name, col in self._data.items(): + dt = dtype.get(col_name, col.dtype) + if not is_dtype_equal(dt, col.dtype): + result[col_name] = col.astype(dt, copy=copy, **kwargs) + else: + result[col_name] = col.copy() if copy else col + except Exception as e: + if errors == "raise": + raise e + elif errors == "warn": + import traceback + + tb = traceback.format_exc() + warnings.warn(tb) + return self + + return self._from_data(result, index=self._index) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8ae83ddd8c2..72a6ca05f34 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -28,7 +28,6 @@ is_categorical_dtype, is_decimal_dtype, is_dict_like, - is_dtype_equal, is_integer, is_integer_dtype, is_interval_dtype, @@ -1678,111 +1677,13 @@ def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def astype(self, dtype, 
copy=False, errors="raise"): - """ - Cast the Series to the given dtype - - Parameters - ---------- - - dtype : data type, or dict of column name -> data type - Use a numpy.dtype or Python type to cast Series object to - the same type. Alternatively, use {col: dtype, ...}, where col is a - series name and dtype is a numpy.dtype or Python type to cast to. - copy : bool, default False - Return a deep-copy when ``copy=True``. Note by default - ``copy=False`` setting is used and hence changes to - values then may propagate to other cudf objects. - errors : {'raise', 'ignore', 'warn'}, default 'raise' - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. - - ``warn`` : prints last exceptions as warnings and - return original object. - - Returns - ------- - out : Series - Returns ``self.copy(deep=copy)`` if ``dtype`` is the same - as ``self.dtype``. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2], dtype='int32') - >>> series - 0 1 - 1 2 - dtype: int32 - >>> series.astype('int64') - 0 1 - 1 2 - dtype: int64 - - Convert to categorical type: - - >>> series.astype('category') - 0 1 - 1 2 - dtype: category - Categories (2, int64): [1, 2] - - Convert to ordered categorical type with custom ordering: - - >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True) - >>> series.astype(cat_dtype) - 0 1 - 1 2 - dtype: category - Categories (2, int64): [2 < 1] - - Note that using ``copy=False`` (enabled by default) - and changing data on a new Series will - propagate changes: - - >>> s1 = cudf.Series([1, 2]) - >>> s1 - 0 1 - 1 2 - dtype: int64 - >>> s2 = s1.astype('int64', copy=False) - >>> s2[0] = 10 - >>> s1 - 0 10 - 1 2 - dtype: int64 - """ - if errors not in ("ignore", "raise", "warn"): - raise ValueError("invalid error value specified") - - if is_dict_like(dtype): - if len(dtype) > 1 or self.name not in 
dtype: - raise KeyError( - "Only the Series name can be used for " - "the key in Series dtype mappings." - ) - dtype = dtype[self.name] - - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - try: - data = self._column.astype(dtype) - - return self._from_data({self.name: data}, index=self._index) - - except Exception as e: - if errors == "raise": - raise e - elif errors == "warn": - import traceback - - tb = traceback.format_exc() - warnings.warn(tb) - elif errors == "ignore": - pass - return self + def astype(self, dtype, copy=False, errors="raise", **kwargs): + if is_dict_like(dtype) and (len(dtype) > 1 or self.name not in dtype): + raise KeyError( + "Only the Series name can be used for " + "the key in Series dtype mappings." + ) + return super().astype(dtype, copy, errors, **kwargs) def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): From c6bbd125d8eb2c67e565b1368e86cbfdd2d1ae8e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Mar 2022 15:16:55 -0800 Subject: [PATCH 3/6] Match pandas astype behavior across index types as well and reuse as much logic as possible. 
--- python/cudf/cudf/core/_base_index.py | 71 ++++++++++++-------------- python/cudf/cudf/core/dataframe.py | 11 ++++ python/cudf/cudf/core/frame.py | 12 +++++ python/cudf/cudf/core/index.py | 15 +++++- python/cudf/cudf/core/indexed_frame.py | 21 +------- python/cudf/cudf/core/multiindex.py | 19 ++++++- python/cudf/cudf/core/series.py | 13 +++-- 7 files changed, 95 insertions(+), 67 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b7b61e4d332..0910a1bc2e0 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -18,7 +18,6 @@ from cudf._typing import DtypeObj from cudf.api.types import ( is_bool_dtype, - is_dtype_equal, is_integer, is_integer_dtype, is_list_like, @@ -33,6 +32,37 @@ numeric_normalize_types, ) +_index_astype_docstring = """\ +Create an Index with values cast to dtypes. + +The class of a new Index is determined by dtype. When conversion is +impossible, a ValueError exception is raised. + +Parameters +---------- +dtype : numpy dtype + Use a numpy.dtype to cast entire Index object to. +copy : bool, default False + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + +Returns +------- +Index + Index with values cast to specified dtype. + +Examples +-------- +>>> import cudf +>>> index = cudf.Index([1, 2, 3]) +>>> index +Int64Index([1, 2, 3], dtype='int64') +>>> index.astype('float64') +Float64Index([1.0, 2.0, 3.0], dtype='float64') +""" + class BaseIndex(Serializable): """Base class for all cudf Index types.""" @@ -1199,43 +1229,6 @@ def rename(self, name, inplace=False): out.name = name return out - def astype(self, dtype, copy=False): - """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. 
When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype - Use a numpy.dtype to cast entire Index object to. - copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - Returns - ------- - Index - Index with values cast to specified dtype. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3]) - >>> index - Int64Index([1, 2, 3], dtype='int64') - >>> index.astype('float64') - Float64Index([1.0, 2.0, 3.0], dtype='float64') - """ - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - - return cudf.Index( - self.copy(deep=copy)._values.astype(dtype), name=self.name - ) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1277,7 +1270,7 @@ def get_slice_bound(self, label, side, kind=None): int Index of label. """ - raise (NotImplementedError) + raise NotImplementedError def __array_function__(self, func, types, args, kwargs): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index facbd848c79..93abe6c4419 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1638,6 +1638,17 @@ def _concat( return out + def astype(self, dtype, copy=False, errors="raise", **kwargs): + if is_dict_like(dtype): + if len(set(dtype.keys()) - set(self._data.names)) > 0: + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." 
+ ) + else: + dtype = {cc: dtype for cc in self._data.names} + return super().astype(dtype, copy, errors, **kwargs) + def _clean_renderable_dataframe(self, output): """ This method takes in partial/preprocessed dataframe diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0fd7848c7d1..5c45e852e44 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -33,6 +33,7 @@ _is_non_decimal_numeric_dtype, is_decimal_dtype, is_dict_like, + is_dtype_equal, is_scalar, issubdtype, ) @@ -494,6 +495,17 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + def astype(self, dtype, copy=False, **kwargs): + result = {} + for col_name, col in self._data.items(): + dt = dtype.get(col_name, col.dtype) + if not is_dtype_equal(dt, col.dtype): + result[col_name] = col.astype(dt, copy=copy, **kwargs) + else: + result[col_name] = col.copy() if copy else col + + return result + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5aab834d452..8055da3054f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -33,10 +33,11 @@ _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_categorical_dtype, + is_dtype_equal, is_interval_dtype, is_string_dtype, ) -from cudf.core._base_index import BaseIndex +from cudf.core._base_index import BaseIndex, _index_astype_docstring from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -53,7 +54,7 @@ from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame from cudf.core.single_column_frame import SingleColumnFrame -from cudf.utils.docutils import copy_docstring +from cudf.utils.docutils import copy_docstring, doc_apply from cudf.utils.dtypes import find_common_type from cudf.utils.utils import search_range @@ -298,6 +299,12 @@ def copy(self, name=None, deep=False, dtype=None, names=None): 
start=self._start, stop=self._stop, step=self._step, name=name ) + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + if is_dtype_equal(dtype, np.int64): + return self + return self._as_int64().astype(dtype, copy=copy) + def drop_duplicates(self, keep="first"): return self @@ -932,6 +939,10 @@ def copy(self, name=None, deep=False, dtype=None, names=None): col = self._values.astype(dtype) return _index_from_data({name: col.copy(True) if deep else col}) + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + return _index_from_data(super().astype({self.name: dtype}, copy)) + def get_loc(self, key, method=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7bcc78219d0..114fb7d0e28 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -22,8 +22,6 @@ _is_non_decimal_numeric_dtype, is_bool_dtype, is_categorical_dtype, - is_dict_like, - is_dtype_equal, is_integer_dtype, is_list_like, ) @@ -1919,23 +1917,8 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): FutureWarning, ) - if not is_dict_like(dtype): - dtype = {cc: dtype for cc in self._data.names} - else: - if len(set(dtype.keys()) - set(self._data.names)) > 0: - raise KeyError( - "Only a column name can be used for the " - "key in a dtype mappings argument." 
- ) - try: - result = {} - for col_name, col in self._data.items(): - dt = dtype.get(col_name, col.dtype) - if not is_dtype_equal(dt, col.dtype): - result[col_name] = col.astype(dt, copy=copy, **kwargs) - else: - result[col_name] = col.copy() if copy else col + data = super().astype(dtype, copy, **kwargs) except Exception as e: if errors == "raise": raise e @@ -1946,7 +1929,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): warnings.warn(tb) return self - return self._from_data(result, index=self._index) + return self._from_data(data, index=self._index) def _check_duplicate_level_names(specified, level_names): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b09a2d39c14..1ae50206288 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -18,11 +18,17 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries -from cudf.api.types import is_integer, is_list_like +from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame -from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index +from cudf.core.index import ( + BaseIndex, + _index_astype_docstring, + _lexsorted_equal_range, + as_index, +) +from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _maybe_indices_to_slice @@ -169,6 +175,15 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + if not is_object_dtype(dtype): + raise TypeError( + "Setting a MultiIndex dtype to anything other than object is " + "not supported" + ) + return self + def rename(self, names, inplace=False): """ Alter MultiIndex level names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 72a6ca05f34..bded4f97c5c 
100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1678,11 +1678,14 @@ def nullmask(self): return cudf.Series(self._column.nullmask) def astype(self, dtype, copy=False, errors="raise", **kwargs): - if is_dict_like(dtype) and (len(dtype) > 1 or self.name not in dtype): - raise KeyError( - "Only the Series name can be used for " - "the key in Series dtype mappings." - ) + if is_dict_like(dtype): + if len(dtype) > 1 or self.name not in dtype: + raise KeyError( + "Only the Series name can be used for the key in Series " + "dtype mappings." + ) + else: + dtype = {self.name: dtype} return super().astype(dtype, copy, errors, **kwargs) def sort_index(self, axis=0, *args, **kwargs): From 8b966c0a9c733902546728a98502ef6e4708520c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Mar 2022 16:28:09 -0800 Subject: [PATCH 4/6] Unify drop implementations. --- python/cudf/cudf/core/dataframe.py | 206 +-------------------- python/cudf/cudf/core/frame.py | 7 + python/cudf/cudf/core/indexed_frame.py | 242 ++++++++++++++++++++++++- python/cudf/cudf/core/series.py | 123 +------------ python/cudf/cudf/tests/test_series.py | 2 +- 5 files changed, 255 insertions(+), 325 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 93abe6c4419..d9eb938f0a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -62,7 +62,7 @@ concat_columns, ) from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, _drop_rows_by_labels +from cudf.core.frame import Frame from cudf.core.groupby.groupby import DataFrameGroupBy from cudf.core.index import BaseIndex, Index, RangeIndex, as_index from cudf.core.indexed_frame import ( @@ -1221,9 +1221,6 @@ def __setitem__(self, arg, value): ) def __delitem__(self, name): - """ - Drop the given column by *name*. 
- """ self._drop_column(name) @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") @@ -2690,187 +2687,6 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) - @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - """ - Drop specified labels from rows or columns. - - Remove rows or columns by specifying label names and corresponding - axis, or by specifying directly index or column names. When using a - multi-index, labels on different levels can be removed by specifying - the level. - - Parameters - ---------- - labels : single label or list-like - Index or column labels to drop. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Whether to drop labels from the index (0 or 'index') or - columns (1 or 'columns'). - index : single label or list-like - Alternative to specifying axis (``labels, axis=0`` - is equivalent to ``index=labels``). - columns : single label or list-like - Alternative to specifying axis (``labels, axis=1`` - is equivalent to ``columns=labels``). - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - DataFrame - DataFrame without the removed index or column labels. - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis. - - See Also - -------- - DataFrame.loc : Label-location based indexer for selection by label. - DataFrame.dropna : Return DataFrame with labels on given axis omitted - where (all or any) data are missing. - DataFrame.drop_duplicates : Return DataFrame with duplicate rows - removed, optionally only considering certain columns. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A": [1, 2, 3, 4], - ... "B": [5, 6, 7, 8], - ... "C": [10, 11, 12, 13], - ... "D": [20, 30, 40, 50]}) - >>> df - A B C D - 0 1 5 10 20 - 1 2 6 11 30 - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns - - >>> df.drop(['B', 'C'], axis=1) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - >>> df.drop(columns=['B', 'C']) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - - Drop a row by index - - >>> df.drop([0, 1]) - A B C D - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns and/or rows of MultiIndex DataFrame - - >>> midx = cudf.MultiIndex(levels=[['lama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = cudf.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... [1, 0.8], [0.3, 0.2]]) - >>> df - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon speed 320.0 250.0 - weight 1.0 0.8 - length 0.3 0.2 - >>> df.drop(index='cow', columns='small') - big - lama speed 45.0 - weight 200.0 - length 1.5 - falcon speed 320.0 - weight 1.0 - length 0.3 - >>> df.drop(index='length', level=1) - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - falcon speed 320.0 250.0 - weight 1.0 0.8 - """ - - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - target = labels - elif index is not None: - target = index - axis = 0 - elif columns is not None: - target = columns - axis = 1 - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - out = self - else: - out = self.copy() - - if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, 
target, errors) - elif axis in (0, "index"): - dropped = _drop_rows_by_labels(out, target, level, errors) - - if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) - - out._data = dropped._data - out._index = dropped._index - - if not inplace: - return out - - @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] - @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False @@ -6611,26 +6427,6 @@ def _get_union_of_series_names(series_list): return names_list -def _get_host_unique(array): - if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] - else: - return set(array) - - -def _drop_columns(df: DataFrame, columns: Iterable, errors: str): - for c in columns: - try: - df._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e - - # Create a dictionary of the common, non-null columns def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): # A mapping of {idx: np.dtype} diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5c45e852e44..7a9bc4625be 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1350,6 +1350,13 @@ def fillna( inplace=inplace, ) + @annotate("FRAME_DROP_COLUMN", color="green", domain="cudf_python") + def _drop_column(self, name): + """Drop a column by *name*""" + if name not in self._data: + raise KeyError(f"column '{name}' does not exist") + del self._data[name] + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ diff 
--git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 114fb7d0e28..c617f4b53f5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import numbers import operator import warnings from collections import Counter, abc @@ -25,8 +26,9 @@ is_integer_dtype, is_list_like, ) +from cudf.core.column import ColumnBase from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame @@ -57,6 +59,26 @@ """ +def _get_host_unique(array): + if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): + return array.unique.to_pandas() + elif isinstance(array, (str, numbers.Number)): + return [array] + else: + return set(array) + + +def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): + for c in columns: + try: + f._drop_column(c) + except KeyError as e: + if errors == "ignore": + pass + else: + raise e + + def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): @@ -1726,10 +1748,7 @@ def _make_operands_and_index_for_binop( **kwargs, ) -> Tuple[ Union[ - Dict[ - Optional[str], - Tuple[cudf.core.column.ColumnBase, Any, bool, Any], - ], + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], Type[NotImplemented], ], Optional[cudf.BaseIndex], @@ -1931,6 +1950,219 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): return self._from_data(data, index=self._index) + @annotate("INDEXED_FRAME_DROP", color="green", domain="cudf_python") + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """Drop specified labels from rows or columns. 
+ + Remove rows or columns by specifying label names and corresponding + axis, or by directly specifying index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If False, return a copy. Otherwise, do operation + inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + DataFrame, Series, or None + Result without the dropped labels, or None if ``inplace=True``. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns.
+ Series.reindex + Return only specified index labels of Series + Series.dropna + Return series without null values + Series.drop_duplicates + Return series with duplicate values removed + + Examples + -------- + **Series** + + >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) + >>> s + x 1 + y 2 + z 3 + dtype: int64 + + Drop labels x and z + + >>> s.drop(labels=['x', 'z']) + y 2 + dtype: int64 + + Drop a label from the second level in MultiIndex Series. + + >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) + >>> s = cudf.Series(range(6), index=midx) + >>> s + 0 x 0 + y 1 + 1 x 2 + y 3 + 2 x 4 + y 5 + dtype: int64 + >>> s.drop(labels='y', level=1) + 0 x 0 + 1 x 2 + 2 x 4 + Name: 2, dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({"A": [1, 2, 3, 4], + ... "B": [5, 6, 7, 8], + ... "C": [10, 11, 12, 13], + ... "D": [20, 30, 40, 50]}) + >>> df + A B C D + 0 1 5 10 20 + 1 2 6 11 30 + 2 3 7 12 40 + 3 4 8 13 50 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 1 20 + 1 2 30 + 2 3 40 + 3 4 50 + >>> df.drop(columns=['B', 'C']) + A D + 0 1 20 + 1 2 30 + 2 3 40 + 3 4 50 + + Drop a row by index + + >>> df.drop([0, 1]) + A B C D + 2 3 7 12 40 + 3 4 8 13 50 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = cudf.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = cudf.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... 
[1, 0.8], [0.3, 0.2]]) + >>> df + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + >>> df.drop(index='cow', columns='small') + big + lama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + >>> df.drop(index='length', level=1) + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + target = labels + elif index is not None: + target = index + axis = 0 + elif columns is not None: + target = columns + axis = 1 + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) + + if inplace: + out = self + else: + out = self.copy() + + if axis in (1, "columns"): + target = _get_host_unique(target) + + _drop_columns(out, target, errors) + elif axis in (0, "index"): + dropped = _drop_rows_by_labels(out, target, level, errors) + + if columns is not None: + columns = _get_host_unique(columns) + _drop_columns(dropped, columns, errors) + + out._data = dropped._data + out._index = dropped._index + + if not inplace: + return out + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bded4f97c5c..bb3d8a4e221 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -54,7 +54,7 @@ from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, _drop_rows_by_labels +from cudf.core.frame import Frame from 
cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexed_frame import ( @@ -596,119 +596,14 @@ def drop( inplace=False, errors="raise", ): - """ - Return Series with specified index labels removed. - - Remove elements of a Series based on specifying the index labels. - When using a multi-index, labels on different levels can be removed by - specifying the level. - - Parameters - ---------- - labels : single label or list-like - Index labels to drop. - axis : 0, default 0 - Redundant for application on Series. - index : single label or list-like - Redundant for application on Series. But ``index`` can be used - instead of ``labels`` - columns : single label or list-like - This parameter is ignored. Use ``index`` or ``labels`` to specify. - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - Series or None - Series with specified index labels removed or None if - ``inplace=True`` - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis and - ``error='raise'`` - - See Also - -------- - Series.reindex - Return only specified index labels of Series - Series.dropna - Return series without null values - Series.drop_duplicates - Return series with duplicate values removed - cudf.DataFrame.drop - Drop specified labels from rows or columns in dataframe - - Examples - -------- - >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) - >>> s - x 1 - y 2 - z 3 - dtype: int64 - - Drop labels x and z - - >>> s.drop(labels=['x', 'z']) - y 2 - dtype: int64 - - Drop a label from the second level in MultiIndex Series. 
- - >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) - >>> s = cudf.Series(range(6), index=midx) - >>> s - 0 x 0 - y 1 - 1 x 2 - y 3 - 2 x 4 - y 5 - dtype: int64 - >>> s.drop(labels='y', level=1) - 0 x 0 - 1 x 2 - 2 x 4 - Name: 2, dtype: int64 - """ - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - if axis == 1: - raise ValueError("No axis named 1 for object type Series") - target = labels - elif index is not None: - target = index - elif columns is not None: - target = [] # Ignore parameter columns - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - out = self - else: - out = self.copy() - - dropped = _drop_rows_by_labels(out, target, level, errors) - - out._data = dropped._data - out._index = dropped._index - - if not inplace: - return out + if axis == 1: + raise ValueError("No axis named 1 for object type Series") + # Ignore columns for Series + if columns is not None: + columns = [] + return super().drop( + labels, axis, index, columns, level, inplace, errors + ) def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3e3c5d1b053..011b39c86a2 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1109,7 +1109,7 @@ def test_series_drop_edge_inputs(): rfunc=gs.drop, lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - expected_error_message="Cannot specify both", + compare_error_message=False, ) assert_exceptions_equal( From 119ca616fab4e0c4ca14d27b55a05b45ed000ae5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 1 Mar 2022 16:56:13 -0800 Subject: [PATCH 5/6] Unify explode implementations. --- python/cudf/cudf/core/dataframe.py | 7 +----- python/cudf/cudf/core/frame.py | 24 ------------------- python/cudf/cudf/core/indexed_frame.py | 32 ++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 9 ++------ 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d9eb938f0a2..c726fbfcace 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5925,7 +5925,7 @@ def explode(self, column, ignore_index=False): Parameters ---------- - column : str or tuple + column : str Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. 
@@ -5960,11 +5960,6 @@ def explode(self, column, ignore_index=False): if column not in self._column_names: raise KeyError(column) - if not is_list_dtype(self._data[column].dtype): - data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) - return self.__class__._from_data(data, index=idx) - return super()._explode(column, ignore_index) def pct_change( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7a9bc4625be..d54503318c6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -589,30 +589,6 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") - def _explode(self, explode_column: Any, ignore_index: bool): - """Helper function for `explode` in `Series` and `Dataframe`, explodes - a specified nested column. Other columns' corresponding rows are - duplicated. If ignore_index is set, the original index is not exploded - and will be replaced with a `RangeIndex`. 
- """ - explode_column_num = self._column_names.index(explode_column) - if not ignore_index and self._index is not None: - explode_column_num += self._index.nlevels - - res = self.__class__._from_data( # type: ignore - *libcudf.lists.explode_outer( - self, explode_column_num, ignore_index - ) - ) - - res._data.multiindex = self._data.multiindex - res._data._level_names = self._data._level_names - - if not ignore_index and self._index is not None: - res.index.names = self._index.names - return res - @annotate( "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c617f4b53f5..cdb4e2c824d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,6 +24,7 @@ is_bool_dtype, is_categorical_dtype, is_integer_dtype, + is_list_dtype, is_list_like, ) from cudf.core.column import ColumnBase @@ -2163,6 +2164,37 @@ def drop( if not inplace: return out + @annotate("INDEXED_FRAME_EXPLODE", color="green", domain="cudf_python") + def _explode(self, explode_column: Any, ignore_index: bool): + # Helper function for `explode` in `Series` and `Dataframe`, explodes a + # specified nested column. Other columns' corresponding rows are + # duplicated. If ignore_index is set, the original index is not + # exploded and will be replaced with a `RangeIndex`. 
+ if not is_list_dtype(self._data[explode_column].dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) + + explode_column_num = self._column_names.index(explode_column) + if not ignore_index and self._index is not None: + explode_column_num += self._index.nlevels + + data, index = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) + res = self.__class__._from_data( + ColumnAccessor( + data, + multiindex=self._data.multiindex, + level_names=self._data._level_names, + ), + index=index, + ) + + if not ignore_index and self._index is not None: + res.index.names = self._index.names + return res + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bb3d8a4e221..2a809aeb30b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3150,7 +3150,7 @@ def explode(self, ignore_index=False): Returns ------- - DataFrame + Series Examples -------- @@ -3172,12 +3172,7 @@ def explode(self, ignore_index=False): 3 5 dtype: int64 """ - if not is_list_dtype(self._column.dtype): - data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) - return self.__class__._from_data(data, index=idx) - - return super()._explode(self._column_names[0], ignore_index) + return super()._explode(self.name, ignore_index) def pct_change( self, periods=1, fill_method="ffill", limit=None, freq=None From e35b82b326e703eb92448207b5ef2fc100b028d2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 2 Mar 2022 16:52:45 -0800 Subject: [PATCH 6/6] Fix empty list case. 
--- python/cudf/cudf/core/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c726fbfcace..59e8959e295 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5892,7 +5892,11 @@ def append( other = other.reindex(combined_columns, copy=False).to_frame().T if not current_cols.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): + elif ( + isinstance(other, list) + and other + and not isinstance(other[0], DataFrame) + ): other = DataFrame(other) cols = self._data.to_pandas_index() if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all():