Skip to content

Commit

Permalink
Refactor filling.repeat API (rapidsai#10371)
Browse files Browse the repository at this point in the history
Part of rapidsai#10153 

This PR refactors the `filling.repeat` Cython API to accept a list of columns instead of a `Frame` object. In this PR I'm also trying out a possibly better pattern for sharing logic between `index` and `indexed_frame`, which might become a solution for rapidsai#10068.

Authors:
  - Michael Wang (https://github.com/isVoid)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#10371
  • Loading branch information
isVoid authored Mar 18, 2022
1 parent 621d26f commit 47d16cb
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 107 deletions.
30 changes: 13 additions & 17 deletions python/cudf/cudf/_lib/filling.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import numpy as np

Expand All @@ -15,7 +15,11 @@ from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport (
columns_from_unique_ptr,
data_from_unique_ptr,
table_view_from_columns,
)


def fill_in_place(Column destination, int begin, int end, DeviceScalar value):
Expand Down Expand Up @@ -50,15 +54,15 @@ def fill(Column destination, int begin, int end, DeviceScalar value):
return Column.from_unique_ptr(move(c_result))


def repeat(inp, object count, bool check_count=False):
def repeat(list inp, object count, bool check_count=False):
    """Repeat the columns in ``inp``, dispatching on the type of ``count``.

    Parameters
    ----------
    inp : list
        Columns to repeat; handed unchanged to the selected helper.
    count : Column or object
        A ``Column`` is routed to ``_repeat_via_column``; any other value
        is routed to ``_repeat_via_size_type``.
    check_count : bool, default False
        Forwarded to ``_repeat_via_column`` only. The ``size_type`` path
        does not receive it.

    Returns
    -------
    Whatever the selected helper returns.
    """
    # Guard clause: everything that is not a Column takes the scalar path.
    if not isinstance(count, Column):
        return _repeat_via_size_type(inp, count)
    return _repeat_via_column(inp, count, check_count)


def _repeat_via_column(inp, Column count, bool check_count):
cdef table_view c_inp = table_view_from_table(inp)
def _repeat_via_column(list inp, Column count, bool check_count):
cdef table_view c_inp = table_view_from_columns(inp)
cdef column_view c_count = count.view()
cdef bool c_check_count = check_count
cdef unique_ptr[table] c_result
Expand All @@ -70,15 +74,11 @@ def _repeat_via_column(inp, Column count, bool check_count):
c_check_count
))

return data_from_unique_ptr(
move(c_result),
column_names=inp._column_names,
index_names=inp._index_names
)
return columns_from_unique_ptr(move(c_result))


def _repeat_via_size_type(inp, size_type count):
cdef table_view c_inp = table_view_from_table(inp)
def _repeat_via_size_type(list inp, size_type count):
cdef table_view c_inp = table_view_from_columns(inp)
cdef unique_ptr[table] c_result

with nogil:
Expand All @@ -87,11 +87,7 @@ def _repeat_via_size_type(inp, size_type count):
count
))

return data_from_unique_ptr(
move(c_result),
column_names=inp._column_names,
index_names=inp._index_names
)
return columns_from_unique_ptr(move(c_result))


def sequence(int size, DeviceScalar init, DeviceScalar step):
Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1512,6 +1512,37 @@ def _apply_boolean_mask(self, boolean_mask):
column_names=self._column_names,
)

def repeat(self, repeats, axis=None):
    """Repeat elements of an Index.

    Returns a new Index where each element of the current Index is repeated
    consecutively a given number of times.

    Parameters
    ----------
    repeats : int, or array of ints
        The number of repetitions for each element. This should
        be a non-negative integer. Repeating 0 times will return
        an empty object.
    axis : optional
        Not used by this base implementation.

    Returns
    -------
    Index
        A newly created object of same type as caller with repeated
        elements.

    Raises
    ------
    NotImplementedError
        Always; this base implementation provides only the interface and
        documentation.

    Examples
    --------
    >>> index = cudf.Index([10, 22, 33, 55])
    >>> index
    Int64Index([10, 22, 33, 55], dtype='int64')
    >>> index.repeat(5)
    Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
                33, 33, 33, 33, 55, 55, 55, 55, 55],
               dtype='int64')
    """
    raise NotImplementedError

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
Expand Down
104 changes: 14 additions & 90 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1547,96 +1547,6 @@ def rank(

return self._from_data(data, index).astype(np.float64)

@_cudf_nvtx_annotate
def repeat(self, repeats, axis=None):
    """Repeat each element consecutively.

    Builds a new object of the caller's type (DataFrame/Series/Index) in
    which every element of the current object appears the requested
    number of times in a row.

    Parameters
    ----------
    repeats : int, or array of ints
        How many times to repeat each element. Must be non-negative;
        a count of 0 yields an empty object.
    axis : None
        Only ``None`` is accepted.

    Returns
    -------
    Series/DataFrame/Index
        A new object of the same type as the caller, holding the
        repeated elements.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``None``.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]})
    >>> df
       a   b
    0  1  10
    1  2  20
    2  3  30
    >>> df.repeat(3)
       a   b
    0  1  10
    0  1  10
    0  1  10
    1  2  20
    1  2  20
    1  2  20
    2  3  30
    2  3  30
    2  3  30

    Repeat on Series

    >>> s = cudf.Series([0, 2])
    >>> s
    0    0
    1    2
    dtype: int64
    >>> s.repeat([3, 4])
    0    0
    0    0
    0    0
    1    2
    1    2
    1    2
    1    2
    dtype: int64
    >>> s.repeat(2)
    0    0
    0    0
    1    2
    1    2
    dtype: int64

    Repeat on Index

    >>> index = cudf.Index([10, 22, 33, 55])
    >>> index
    Int64Index([10, 22, 33, 55], dtype='int64')
    >>> index.repeat(5)
    Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
                33, 33, 33, 33, 55, 55, 55, 55, 55],
               dtype='int64')
    """
    if axis is not None:
        raise NotImplementedError(
            "Only axis=`None` supported at this time."
        )

    # Scalars are passed through as-is; anything else becomes a column.
    counts = repeats if is_scalar(repeats) else as_column(repeats)

    repeated = libcudf.filling.repeat(self, counts)
    result = self.__class__._from_data(*repeated)

    # Carry the caller's type metadata over to the freshly built object.
    result._copy_type_metadata(self)
    return result

@_cudf_nvtx_annotate
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
"""Shift values by `periods` positions."""
Expand Down Expand Up @@ -6260,6 +6170,20 @@ def nunique(self, dropna: bool = True):
for name, col in self._data.items()
}

@staticmethod
def _repeat(
    columns: List[ColumnBase], repeats, axis=None
) -> List[ColumnBase]:
    """Repeat the given columns via ``libcudf.filling.repeat``.

    Parameters
    ----------
    columns : List[ColumnBase]
        Columns to repeat; passed through unchanged.
    repeats : scalar or array-like
        Repetition count(s). A scalar is forwarded as-is; any other
        value is first converted with ``as_column``.
    axis : None
        Only ``None`` is accepted.

    Returns
    -------
    List[ColumnBase]
        The result of ``libcudf.filling.repeat``.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``None``.
    """
    if axis is not None:
        raise NotImplementedError(
            "Only axis=`None` supported at this time."
        )

    counts = repeats if is_scalar(repeats) else as_column(repeats)
    return libcudf.filling.repeat(columns, counts)


@_cudf_nvtx_annotate
def _get_replacement_values_for_columns(
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,9 @@ def _apply_boolean_mask(self, boolean_mask):
[self._values.apply_boolean_mask(boolean_mask)], [self.name]
)

def repeat(self, repeats, axis=None):
    """Repeat elements by delegating to the ``_as_int64()`` form of ``self``.

    ``repeats`` and ``axis`` are passed through unchanged to that object's
    ``repeat`` method, and its result is returned as-is.
    """
    int64_form = self._as_int64()
    return int64_form.repeat(repeats, axis)

def _split(self, splits):
return Int64Index._from_columns(
[self._values.columns_split(splits)], [self.name]
Expand Down Expand Up @@ -1264,6 +1267,11 @@ def argsort(
na_position=na_position,
)

def repeat(self, repeats, axis=None):
    """Repeat this object's columns and rebuild an object like ``self``.

    The columns are copied into a new list, repeated through
    ``Frame._repeat`` (which receives ``repeats`` and ``axis`` unchanged),
    and handed to ``_from_columns_like_self`` together with the current
    column names.
    """
    repeated_columns = Frame._repeat([*self._columns], repeats, axis)
    return self._from_columns_like_self(
        repeated_columns, self._column_names
    )


class NumericIndex(GenericIndex):
"""Immutable, ordered and sliceable sequence of labels.
Expand Down
73 changes: 73 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2027,6 +2027,79 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):

return NotImplemented

@_cudf_nvtx_annotate
def repeat(self, repeats, axis=None):
    """Repeat each element consecutively.

    Builds a new object of the caller's type (DataFrame/Series) in which
    every element of the current object appears the requested number of
    times in a row. The index columns are placed ahead of the data
    columns and repeated in the same call.

    Parameters
    ----------
    repeats : int, or array of ints
        How many times to repeat each element. Must be non-negative;
        a count of 0 yields an empty object.
    axis : optional
        Forwarded unchanged to ``Frame._repeat``.

    Returns
    -------
    Series/DataFrame
        A new object of the same type as the caller, holding the
        repeated elements.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]})
    >>> df
       a   b
    0  1  10
    1  2  20
    2  3  30
    >>> df.repeat(3)
       a   b
    0  1  10
    0  1  10
    0  1  10
    1  2  20
    1  2  20
    1  2  20
    2  3  30
    2  3  30
    2  3  30

    Repeat on Series

    >>> s = cudf.Series([0, 2])
    >>> s
    0    0
    1    2
    dtype: int64
    >>> s.repeat([3, 4])
    0    0
    0    0
    0    0
    1    2
    1    2
    1    2
    1    2
    dtype: int64
    >>> s.repeat(2)
    0    0
    0    0
    1    2
    1    2
    dtype: int64
    """
    # Index columns first, then data columns, all repeated together.
    index_columns = self._index._data.columns
    source_columns = [*index_columns, *self._columns]
    repeated_columns = Frame._repeat(source_columns, repeats, axis)

    return self._from_columns_like_self(
        repeated_columns,
        self._column_names,
        self._index_names,
    )

def _append(
self, other, ignore_index=False, verify_integrity=False, sort=None
):
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1830,3 +1830,8 @@ def _split_columns_by_levels(self, levels):
index_columns.append(col)
index_names.append(name)
return data_columns, index_columns, data_names, index_names

def repeat(self, repeats, axis=None):
    """Repeat every column of this object and rebuild one like ``self``.

    ``repeats`` and ``axis`` are forwarded unchanged to ``Frame._repeat``;
    the resulting columns are combined with the existing column names by
    ``_from_columns_like_self``.
    """
    source_columns = [*self._columns]
    repeated_columns = Frame._repeat(source_columns, repeats, axis)
    return self._from_columns_like_self(
        repeated_columns, self._column_names
    )

0 comments on commit 47d16cb

Please sign in to comment.