Refactor filling.repeat API (#10371)

Part of #10153. This PR refactors the `filling.repeat` Cython API to accept a list of columns instead of a Frame object. In this PR I'm also trying out a possibly better pattern for index and indexed_frame to share logic, which might become a solution for #10068. Authors: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #10371
rapidsai · Mar 18, 2022 · 47d16cb · 47d16cb
1 parent 621d26f
commit 47d16cb
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 107 deletions.
diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import numpy as np
 
@@ -15,7 +15,11 @@ from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport (
+    columns_from_unique_ptr,
+    data_from_unique_ptr,
+    table_view_from_columns,
+)
 
 
 def fill_in_place(Column destination, int begin, int end, DeviceScalar value):
@@ -50,15 +54,15 @@ def fill(Column destination, int begin, int end, DeviceScalar value):
     return Column.from_unique_ptr(move(c_result))
 
 
-def repeat(inp, object count, bool check_count=False):
+def repeat(list inp, object count, bool check_count=False):
     if isinstance(count, Column):
         return _repeat_via_column(inp, count, check_count)
     else:
         return _repeat_via_size_type(inp, count)
 
 
-def _repeat_via_column(inp, Column count, bool check_count):
-    cdef table_view c_inp = table_view_from_table(inp)
+def _repeat_via_column(list inp, Column count, bool check_count):
+    cdef table_view c_inp = table_view_from_columns(inp)
     cdef column_view c_count = count.view()
     cdef bool c_check_count = check_count
     cdef unique_ptr[table] c_result
@@ -70,15 +74,11 @@ def _repeat_via_column(inp, Column count, bool check_count):
             c_check_count
         ))
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=inp._column_names,
-        index_names=inp._index_names
-    )
+    return columns_from_unique_ptr(move(c_result))
 
 
-def _repeat_via_size_type(inp, size_type count):
-    cdef table_view c_inp = table_view_from_table(inp)
+def _repeat_via_size_type(list inp, size_type count):
+    cdef table_view c_inp = table_view_from_columns(inp)
     cdef unique_ptr[table] c_result
 
     with nogil:
@@ -87,11 +87,7 @@ def _repeat_via_size_type(inp, size_type count):
             count
         ))
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=inp._column_names,
-        index_names=inp._index_names
-    )
+    return columns_from_unique_ptr(move(c_result))
 
 
 def sequence(int size, DeviceScalar init, DeviceScalar step):

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -1512,6 +1512,37 @@ def _apply_boolean_mask(self, boolean_mask):
             column_names=self._column_names,
         )
 
+    def repeat(self, repeats, axis=None):
+        """Repeat elements of an Index.
+
+        Returns a new Index where each element of the current Index is repeated
+        consecutively a given number of times.
+
+        Parameters
+        ----------
+        repeats : int, or array of ints
+            The number of repetitions for each element. This should
+            be a non-negative integer. Repeating 0 times will return
+            an empty object.
+
+        Returns
+        -------
+        Index
+            A newly created object of same type as caller with repeated
+            elements.
+
+        Examples
+        --------
+        >>> index = cudf.Index([10, 22, 33, 55])
+        >>> index
+        Int64Index([10, 22, 33, 55], dtype='int64')
+        >>> index.repeat(5)
+        Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
+                    33, 33, 33, 33, 55, 55, 55, 55, 55],
+                dtype='int64')
+        """
+        raise NotImplementedError
+
     def _split_columns_by_levels(self, levels):
         if isinstance(levels, int) and levels > 0:
             raise ValueError(f"Out of bound level: {levels}")

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -1547,96 +1547,6 @@ def rank(
 
         return self._from_data(data, index).astype(np.float64)
 
-    @_cudf_nvtx_annotate
-    def repeat(self, repeats, axis=None):
-        """Repeats elements consecutively.
-
-        Returns a new object of caller type(DataFrame/Series/Index) where each
-        element of the current object is repeated consecutively a given
-        number of times.
-
-        Parameters
-        ----------
-        repeats : int, or array of ints
-            The number of repetitions for each element. This should
-            be a non-negative integer. Repeating 0 times will return
-            an empty object.
-
-        Returns
-        -------
-        Series/DataFrame/Index
-            A newly created object of same type as caller
-            with repeated elements.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]})
-        >>> df
-           a   b
-        0  1  10
-        1  2  20
-        2  3  30
-        >>> df.repeat(3)
-           a   b
-        0  1  10
-        0  1  10
-        0  1  10
-        1  2  20
-        1  2  20
-        1  2  20
-        2  3  30
-        2  3  30
-        2  3  30
-
-        Repeat on Series
-
-        >>> s = cudf.Series([0, 2])
-        >>> s
-        0    0
-        1    2
-        dtype: int64
-        >>> s.repeat([3, 4])
-        0    0
-        0    0
-        0    0
-        1    2
-        1    2
-        1    2
-        1    2
-        dtype: int64
-        >>> s.repeat(2)
-        0    0
-        0    0
-        1    2
-        1    2
-        dtype: int64
-
-        Repeat on Index
-
-        >>> index = cudf.Index([10, 22, 33, 55])
-        >>> index
-        Int64Index([10, 22, 33, 55], dtype='int64')
-        >>> index.repeat(5)
-        Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
-                    33, 33, 33, 33, 55, 55, 55, 55, 55],
-                dtype='int64')
-        """
-        if axis is not None:
-            raise NotImplementedError(
-                "Only axis=`None` supported at this time."
-            )
-
-        if not is_scalar(repeats):
-            repeats = as_column(repeats)
-
-        result = self.__class__._from_data(
-            *libcudf.filling.repeat(self, repeats)
-        )
-
-        result._copy_type_metadata(self)
-        return result
-
     @_cudf_nvtx_annotate
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """Shift values by `periods` positions."""
@@ -6260,6 +6170,20 @@ def nunique(self, dropna: bool = True):
             for name, col in self._data.items()
         }
 
+    @staticmethod
+    def _repeat(
+        columns: List[ColumnBase], repeats, axis=None
+    ) -> List[ColumnBase]:
+        if axis is not None:
+            raise NotImplementedError(
+                "Only axis=`None` supported at this time."
+            )
+
+        if not is_scalar(repeats):
+            repeats = as_column(repeats)
+
+        return libcudf.filling.repeat(columns, repeats)
+
 
 @_cudf_nvtx_annotate
 def _get_replacement_values_for_columns(

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -743,6 +743,9 @@ def _apply_boolean_mask(self, boolean_mask):
             [self._values.apply_boolean_mask(boolean_mask)], [self.name]
         )
 
+    def repeat(self, repeats, axis=None):
+        return self._as_int64().repeat(repeats, axis)
+
     def _split(self, splits):
         return Int64Index._from_columns(
             [self._values.columns_split(splits)], [self.name]
@@ -1264,6 +1267,11 @@ def argsort(
             na_position=na_position,
         )
 
+    def repeat(self, repeats, axis=None):
+        return self._from_columns_like_self(
+            Frame._repeat([*self._columns], repeats, axis), self._column_names
+        )
+
 
 class NumericIndex(GenericIndex):
     """Immutable, ordered and sliceable sequence of labels.

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -2027,6 +2027,79 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         return NotImplemented
 
+    @_cudf_nvtx_annotate
+    def repeat(self, repeats, axis=None):
+        """Repeats elements consecutively.
+
+        Returns a new object of caller type (DataFrame/Series) where each
+        element of the current object is repeated consecutively a given
+        number of times.
+
+        Parameters
+        ----------
+        repeats : int, or array of ints
+            The number of repetitions for each element. This should
+            be a non-negative integer. Repeating 0 times will return
+            an empty object.
+
+        Returns
+        -------
+        Series/DataFrame
+            A newly created object of same type as caller
+            with repeated elements.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]})
+        >>> df
+           a   b
+        0  1  10
+        1  2  20
+        2  3  30
+        >>> df.repeat(3)
+           a   b
+        0  1  10
+        0  1  10
+        0  1  10
+        1  2  20
+        1  2  20
+        1  2  20
+        2  3  30
+        2  3  30
+        2  3  30
+
+        Repeat on Series
+
+        >>> s = cudf.Series([0, 2])
+        >>> s
+        0    0
+        1    2
+        dtype: int64
+        >>> s.repeat([3, 4])
+        0    0
+        0    0
+        0    0
+        1    2
+        1    2
+        1    2
+        1    2
+        dtype: int64
+        >>> s.repeat(2)
+        0    0
+        0    0
+        1    2
+        1    2
+        dtype: int64
+        """
+        return self._from_columns_like_self(
+            Frame._repeat(
+                [*self._index._data.columns, *self._columns], repeats, axis
+            ),
+            self._column_names,
+            self._index_names,
+        )
+
     def _append(
         self, other, ignore_index=False, verify_integrity=False, sort=None
     ):

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -1830,3 +1830,8 @@ def _split_columns_by_levels(self, levels):
                 index_columns.append(col)
                 index_names.append(name)
         return data_columns, index_columns, data_names, index_names
+
+    def repeat(self, repeats, axis=None):
+        return self._from_columns_like_self(
+            Frame._repeat([*self._columns], repeats, axis), self._column_names
+        )