From fa26ce7cbf45f417c23e517ce4b97e377a56a354 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 11 Jan 2022 15:38:43 -0800 Subject: [PATCH 1/5] Adds from_column_like_self --- python/cudf/cudf/core/frame.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0345966d6bd..746fa80cd01 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -165,6 +165,16 @@ def _from_columns( return cls._from_data(data, index) + def _from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + col = self.__class__._from_columns(columns, column_names, index_names) + col._copy_type_metadata(self, include_index=bool(index_names)) + return col + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: @@ -554,14 +564,12 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + result = self._from_columns_like_self( libcudf.copying.gather( list(self._columns), gather_map, nullify=nullify, ), self._column_names, ) - - result._copy_type_metadata(self) return result def _as_column(self): @@ -1419,7 +1427,7 @@ def _drop_na_rows( else: frame._data[name] = col - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( list(self._index._data.columns + frame._columns), how=how, @@ -1431,8 +1439,6 @@ def _drop_na_rows( self._column_names, self._index.names, ) - result._copy_type_metadata(frame) - return result def _drop_na_columns(self, how="any", subset=None, thresh=None): """ @@ -2154,8 +2160,7 @@ def drop_duplicates( nulls_are_equal: bool, default True Null elements are considered equal to other null elements. """ - - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_duplicates( list(self._columns), keys=range(len(self._columns)), @@ -2164,11 +2169,6 @@ def drop_duplicates( ), self._column_names, ) - # TODO: _copy_type_metadata is a common pattern to apply after the - # roundtrip from libcudf. We should build this into a factory function - # to increase reusability. - result._copy_type_metadata(self) - return result def _positions_from_column_names(self, column_names): """Map each column name into their positions in the frame. From 9e792f766ba2e9414c6ed4213608025c8ca3bdac Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 11 Jan 2022 15:44:25 -0800 Subject: [PATCH 2/5] use in _apply_boolean_mask too --- python/cudf/cudf/core/frame.py | 3 +-- python/cudf/cudf/core/indexed_frame.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cec17d5164a..a6b3c498001 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -564,13 +564,12 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self._from_columns_like_self( + return self._from_columns_like_self( libcudf.copying.gather( list(self._columns), gather_map, nullify=nullify, ), self._column_names, ) - return result def _as_column(self): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7c5783bf637..59cd21f788a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1207,15 +1207,13 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask ), column_names=self._column_names, index_names=self._index.names, ) - result._copy_type_metadata(self) - return result def _reset_index(self, level, drop, col_level=0, col_fill=""): """Shared path for DataFrame.reset_index and Series.reset_index.""" From b875aee1f1965a8a886effd2a5131f645dd85e79 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 Jan 2022 11:14:19 -0800 Subject: [PATCH 3/5] Use _from_columns_like_self in recently moved methods --- python/cudf/cudf/core/_base_index.py | 18 +++++------------- python/cudf/cudf/core/indexed_frame.py | 13 +++---------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index be5a1e7cc93..b1335c7c076 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1444,7 +1444,8 @@ def drop_duplicates( """ # This utilizes the fact that all `Index` is also a `Frame`. - result = self.__class__._from_columns( + # Except RangeIndex. + return self._from_columns_like_self( drop_duplicates( list(self._columns), keys=range(len(self._data)), @@ -1453,8 +1454,6 @@ def drop_duplicates( ), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def dropna(self, how="any"): """ @@ -1476,12 +1475,10 @@ def dropna(self, how="any"): for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( drop_nulls(data_columns, how=how, keys=range(len(data_columns)),), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def _gather(self, gather_map, nullify=False, check_bounds=True): """Gather rows of index specified by indices in `gather_map`. @@ -1501,14 +1498,11 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( gather(list(self._columns), gather_map, nullify=nullify), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result - def take(self, indices, axis=0, allow_fill=True, fill_value=None): """Return a new index containing the rows specified by *indices* @@ -1561,12 +1555,10 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( apply_boolean_mask(list(self._columns), boolean_mask), column_names=self._column_names, ) - result._copy_type_metadata(self) - return result def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cbcb586c81a..e9f2de1cb1c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -556,7 +556,7 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.copying.gather( list(self._index._columns + self._columns) if keep_index @@ -568,9 +568,6 @@ def _gather( self._index.names if keep_index else None, ) - result._copy_type_metadata(self, include_index=keep_index) - return result - def _positions_from_column_names( self, column_names, offset_by_index_columns=False ): @@ -628,7 +625,7 @@ def drop_duplicates( keys = self._positions_from_column_names( subset, offset_by_index_columns=not ignore_index ) - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_duplicates( list(self._columns) if ignore_index @@ -640,8 +637,6 @@ def drop_duplicates( self._column_names, self._index.names if not ignore_index else None, ) - result._copy_type_metadata(self) - return result def add_prefix(self, prefix): """ @@ -1354,7 +1349,7 @@ def _drop_na_rows( for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( list(self._index._data.columns) + data_columns, how=how, @@ -1366,8 +1361,6 @@ def _drop_na_rows( self._column_names, self._index.names, ) - result._copy_type_metadata(self) - return result def _apply_boolean_mask(self, boolean_mask): """Apply boolean mask to each row of `self`. From 8921ad8ac245d44f21ed2e0daa49ab9cb527a6a3 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 Jan 2022 12:35:36 -0800 Subject: [PATCH 4/5] Update python/cudf/cudf/core/frame.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/frame.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ea5b56c9945..0e24e241fd7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -169,9 +169,8 @@ def _from_columns_like_self( column_names: List[str], index_names: Optional[List[str]] = None, ): - col = self.__class__._from_columns(columns, column_names, index_names) - col._copy_type_metadata(self, include_index=bool(index_names)) - return col + frame = self.__class__._from_columns(columns, column_names, index_names) + return frame._copy_type_metadata(self, include_index=bool(index_names)) def _mimic_inplace( self: T, result: Frame, inplace: bool = False From 4b60dc2c87ad85ecfb65b39a4f4585c0b8313a11 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 Jan 2022 12:41:29 -0800 Subject: [PATCH 5/5] docstring --- python/cudf/cudf/core/frame.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ea5b56c9945..853cdc1b725 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -169,6 +169,11 @@ def _from_columns_like_self( column_names: List[str], index_names: Optional[List[str]] = None, ): + """Construct a `Frame` from a list of columns with metadata from self. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ col = self.__class__._from_columns(columns, column_names, index_names) col._copy_type_metadata(self, include_index=bool(index_names)) return col