From 2bd7320c0097aa08033a68bbca41632315a5e58c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Jan 2022 05:21:57 -0800 Subject: [PATCH] Add `_from_columns_like_self` factory (#10022) Follow up to #9558 On a return trip from libcudf, it is a common pattern for cudf frame to apply its own metadata to the columns. This PR generalizes this procedure as a new factory function `_from_columns_like_self`. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Paul Taylor (https://github.com/trxcllnt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10022 --- python/cudf/cudf/core/_base_index.py | 18 +++++------------- python/cudf/cudf/core/frame.py | 16 ++++++++++++++++ python/cudf/cudf/core/indexed_frame.py | 17 ++++------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index be5a1e7cc93..b1335c7c076 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1444,7 +1444,8 @@ def drop_duplicates( """ # This utilizes the fact that all `Index` is also a `Frame`. - result = self.__class__._from_columns( + # Except RangeIndex. 
+ return self._from_columns_like_self( drop_duplicates( list(self._columns), keys=range(len(self._data)), @@ -1453,8 +1454,6 @@ def drop_duplicates( ), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def dropna(self, how="any"): """ @@ -1476,12 +1475,10 @@ def dropna(self, how="any"): for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( drop_nulls(data_columns, how=how, keys=range(len(data_columns)),), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def _gather(self, gather_map, nullify=False, check_bounds=True): """Gather rows of index specified by indices in `gather_map`. @@ -1501,14 +1498,11 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( gather(list(self._columns), gather_map, nullify=nullify), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result - def take(self, indices, axis=0, allow_fill=True, fill_value=None): """Return a new index containing the rows specified by *indices* @@ -1561,12 +1555,10 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( apply_boolean_mask(list(self._columns), boolean_mask), column_names=self._column_names, ) - result._copy_type_metadata(self) - return result def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1d59d9f3b1a..69dc5389e7a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -163,6 +163,22 @@ def _from_columns( return cls._from_data(data, index) + def 
_from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` from a list of columns with metadata from self. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + frame = self.__class__._from_columns( + columns, column_names, index_names + ) + return frame._copy_type_metadata(self, include_index=bool(index_names)) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9458057894a..e9f2de1cb1c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -556,7 +556,7 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.copying.gather( list(self._index._columns + self._columns) if keep_index @@ -568,9 +568,6 @@ def _gather( self._index.names if keep_index else None, ) - result._copy_type_metadata(self, include_index=keep_index) - return result - def _positions_from_column_names( self, column_names, offset_by_index_columns=False ): @@ -628,7 +625,7 @@ def drop_duplicates( keys = self._positions_from_column_names( subset, offset_by_index_columns=not ignore_index ) - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_duplicates( list(self._columns) if ignore_index @@ -640,8 +637,6 @@ def drop_duplicates( self._column_names, self._index.names if not ignore_index else None, ) - result._copy_type_metadata(self) - return result def add_prefix(self, prefix): """ @@ -1354,7 +1349,7 @@ def _drop_na_rows( for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( list(self._index._data.columns) 
+ data_columns, how=how, @@ -1366,8 +1361,6 @@ def _drop_na_rows( self._column_names, self._index.names, ) - result._copy_type_metadata(self) - return result def _apply_boolean_mask(self, boolean_mask): """Apply boolean mask to each row of `self`. @@ -1378,15 +1371,13 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask ), column_names=self._column_names, index_names=self._index.names, ) - result._copy_type_metadata(self) - return result def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*.