Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrites column.__setitem__, Use boolean_mask_scatter #10202

Merged
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
685009f
Refactor of column.__setitem__
isVoid Feb 3, 2022
570b0c4
Remove comments
isVoid Feb 3, 2022
07f3bd4
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 10, 2022
3fc4846
Update copyright years
isVoid Feb 10, 2022
5838607
Use a ternary operator and check key type with isinstance
isVoid Feb 14, 2022
f85cddb
Update err message
isVoid Feb 14, 2022
ed6a4fa
Remove wrong param style
isVoid Feb 14, 2022
9ef7f14
Merge branch 'improvement/column_setitem_refactor' of github.com:isVo…
isVoid Feb 14, 2022
f50c57e
Improving index handling
isVoid Feb 14, 2022
b13eb69
Update python/cudf/cudf/core/column/column.py
isVoid Feb 14, 2022
6576af8
Use `as_device_scalar` factory function; Better handling of nelem in
isVoid Feb 15, 2022
62b8552
Change `arange` type annotation
isVoid Feb 15, 2022
6667be2
Revert to scalar constructor.
isVoid Feb 15, 2022
cdbabc4
use `scalar.isvalid`
isVoid Feb 15, 2022
6cb2992
Add comment
isVoid Feb 15, 2022
6f092d6
Apply suggestions from code review
isVoid Feb 16, 2022
b5c2db1
Apply suggestions from code review
isVoid Feb 16, 2022
e6b2833
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 16, 2022
77e6693
Merge branch 'improvement/column_setitem_refactor' of github.com:isVo…
isVoid Feb 16, 2022
5d5714e
update boolean mask scatter docstring
isVoid Feb 16, 2022
a111f7d
style
isVoid Feb 16, 2022
2d3486f
Addressing multiple reviews
isVoid Feb 17, 2022
d5d209c
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 17, 2022
1754580
simple things to get ci unblocked
isVoid Feb 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 30 additions & 26 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import pickle

Expand Down Expand Up @@ -537,11 +537,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask):
as_device_scalar(lhs), as_device_scalar(rhs), boolean_mask)


def _boolean_mask_scatter_table(input_table, target_table,
Column boolean_mask):
def _boolean_mask_scatter_columns(list input_columns, list target_columns,
Column boolean_mask):

cdef table_view input_table_view = table_view_from_columns(input_table)
cdef table_view target_table_view = table_view_from_columns(target_table)
cdef table_view input_table_view = table_view_from_columns(input_columns)
cdef table_view target_table_view = table_view_from_columns(target_columns)
cdef column_view boolean_mask_view = boolean_mask.view()

cdef unique_ptr[table] c_result
Expand All @@ -555,14 +555,10 @@ def _boolean_mask_scatter_table(input_table, target_table,
)
)

return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
)
return columns_from_unique_ptr(move(c_result))


def _boolean_mask_scatter_scalar(list input_scalars, target_table,
def _boolean_mask_scatter_scalar(list input_scalars, list target_columns,
Column boolean_mask):

cdef vector[reference_wrapper[constscalar]] input_scalar_vector
Expand All @@ -571,7 +567,7 @@ def _boolean_mask_scatter_scalar(list input_scalars, target_table,
for scl in input_scalars:
input_scalar_vector.push_back(reference_wrapper[constscalar](
scl.get_raw_ptr()[0]))
cdef table_view target_table_view = table_view_from_columns(target_table)
cdef table_view target_table_view = table_view_from_columns(target_columns)
cdef column_view boolean_mask_view = boolean_mask.view()

cdef unique_ptr[table] c_result
Expand All @@ -585,29 +581,37 @@ def _boolean_mask_scatter_scalar(list input_scalars, target_table,
)
)

return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
)
return columns_from_unique_ptr(move(c_result))


# TODO: This function is currently unused but should be used in
# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667.
def boolean_mask_scatter(object input, target_table,
def boolean_mask_scatter(list input_, list target_columns,
Column boolean_mask):
"""Copy the target columns, replacing masked rows with input data.

The ``input_`` data can be a list of columns or as a list of scalars.
A list of input columns will be used to replace corresponding rows in the
target columns for which the boolean mask is ``True``. For the nth ``True``
in the boolean mask, the nth row in ``input_`` is used to replace. A list
of input scalars will replace all rows in the target columns for which the
boolean mask is ``True``.
"""
if len(input_) != len(target_columns):
raise ValueError("Mismatched number of input and target columns.")

if len(input_) == 0:
return []

if isinstance(input, cudf.core.frame.Frame):
return _boolean_mask_scatter_table(
input,
target_table,
if isinstance(input_[0], Column):
return _boolean_mask_scatter_columns(
input_,
target_columns,
boolean_mask
)
else:
scalar_list = [as_device_scalar(i) for i in input]
scalar_list = [as_device_scalar(i) for i in input_]
return _boolean_mask_scatter_scalar(
scalar_list,
target_table,
target_columns,
boolean_mask
)

Expand Down
178 changes: 111 additions & 67 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
from cudf.utils.utils import NotIterable, mask_dtype

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, NotIterable):
Expand Down Expand Up @@ -315,22 +318,24 @@ def _fill(
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

fill_scalar = as_device_scalar(fill_value, self.dtype)
# Constructing a cuDF scalar can cut unnecessary DtoH copy if
# the scalar is None when calling `is_valid`.
slr = cudf.Scalar(fill_value, dtype=self.dtype)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

if not inplace:
return libcudf.filling.fill(self, begin, end, fill_scalar)
return libcudf.filling.fill(self, begin, end, slr.device_value)

if is_string_dtype(self.dtype):
return self._mimic_inplace(
libcudf.filling.fill(self, begin, end, fill_scalar),
libcudf.filling.fill(self, begin, end, slr.device_value),
inplace=True,
)

if fill_value is None and not self.nullable:
if not slr.is_valid() and not self.nullable:
mask = create_null_mask(self.size, state=MaskState.ALL_VALID)
self.set_base_mask(mask)

libcudf.filling.fill_in_place(self, begin, end, fill_scalar)
libcudf.filling.fill_in_place(self, begin, end, slr.device_value)

return self

Expand Down Expand Up @@ -481,82 +486,121 @@ def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]:

def __setitem__(self, key: Any, value: Any):
"""
Set the value of self[key] to value.
Set the value of ``self[key]`` to ``value``.

If value and self are of different types,
value is coerced to self.dtype
If ``value`` and ``self`` are of different types, ``value`` is coerced
to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned.
"""

# Normalize value to scalar/column
value_normalized = (
cudf.Scalar(value, dtype=self.dtype)
if is_scalar(value)
else as_column(value, dtype=self.dtype)
)

out: Optional[ColumnBase] # If None, no need to perform mimic inplace.
if isinstance(key, slice):
key_start, key_stop, key_stride = key.indices(len(self))
if key_start < 0:
key_start = key_start + len(self)
if key_stop < 0:
key_stop = key_stop + len(self)
if key_start >= key_stop:
return self.copy()
if (key_stride is None or key_stride == 1) and is_scalar(value):
return self._fill(value, key_start, key_stop, inplace=True)
if key_stride != 1 or key_stride is not None or is_scalar(value):
key = arange(
start=key_start,
stop=key_stop,
step=key_stride,
dtype=cudf.dtype(np.int32),
)
nelem = len(key)
else:
nelem = abs(key_stop - key_start)
out = self._scatter_by_slice(key, value_normalized)
else:
key = as_column(key)
if not isinstance(key, cudf.core.column.NumericalColumn):
raise ValueError(f"Invalid scatter map type {key.dtype}.")
out = self._scatter_by_column(key, value_normalized)

if out:
self._mimic_inplace(out, inplace=True)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

def _scatter_by_slice(
self, key: Slice, value: Union[cudf.core.scalar.Scalar, ColumnBase]
) -> Optional[ColumnBase]:
"""If this function returns None, it's either a no-op (slice is empty),
or the inplace replacement is already performed (fill-in-place).
vyasr marked this conversation as resolved.
Show resolved Hide resolved
"""
nelem: int # the number of elements to scatter
isVoid marked this conversation as resolved.
Show resolved Hide resolved

start, stop, step = key.indices(len(self))
if start >= stop:
return None
nelem = (stop - start) // step

self._check_scatter_key_length(nelem, value)

if step == 1:
if isinstance(value, cudf.core.scalar.Scalar):
return self._fill(value, start, stop, inplace=True)
else:
return libcudf.copying.copy_range(
value, self, 0, nelem, start, stop, False
)

# step != 1, create a scatter map with arange
scatter_map = arange(
start=start, stop=stop, step=step, dtype=cudf.dtype(np.int32),
)

return self._scatter_by_column(scatter_map, value)

def _scatter_by_column(
self,
key: "cudf.core.column.NumericalColumn",
value: Union[cudf.core.scalar.Scalar, ColumnBase],
) -> ColumnBase:
nelem = len(key) # the number of elements to scatter
vyasr marked this conversation as resolved.
Show resolved Hide resolved

if is_bool_dtype(key.dtype):
if len(key) != len(self):
raise ValueError(
"Boolean mask must be of same length as column"
)
skip_reducing_key = False
if isinstance(value, ColumnBase):
if len(self) == len(value):
isVoid marked this conversation as resolved.
Show resolved Hide resolved
# Both value and key are aligned to self. Thus, the values
# corresponding to the false values in key should be
# ignored.
value = value.apply_boolean_mask(key)
# After applying boolean mask, the length of value equals
# the number of elements to scatter, we can skip computing
# the sum of ``key`` below.
nelem = len(value)
skip_reducing_key = True
# Compute the number of element to scatter by summing all `True`s
# in the boolean mask.
nelem = key.sum() if not skip_reducing_key else nelem

self._check_scatter_key_length(nelem, value)

try:
if is_bool_dtype(key.dtype):
if not len(key) == len(self):
raise ValueError(
"Boolean mask must be of same length as column"
)
key = arange(len(self))[key]
if hasattr(value, "__len__") and len(value) == len(self):
value = as_column(value)[key]
nelem = len(key)
return libcudf.copying.boolean_mask_scatter(
[value], [self], key
)[0]._with_type_metadata(self.dtype)
else:
return libcudf.copying.scatter(
value, key, self
)._with_type_metadata(self.dtype)
except RuntimeError as e:
if "out of bounds" in str(e):
raise IndexError(
f"index out of bounds for column of size {len(self)}"
) from e
raise

if is_scalar(value):
value = cudf.Scalar(value, dtype=self.dtype)
else:
def _check_scatter_key_length(
self, nelem: int, value: Union[cudf.core.scalar.Scalar, ColumnBase]
):
"""`nelem` is the number of keys to scatter. Should equal to the
number of rows in ``value`` if ``value`` is a column.
"""
if isinstance(value, ColumnBase):
if len(value) != nelem:
msg = (
f"Size mismatch: cannot set value "
f"of size {len(value)} to indexing result of size "
f"{nelem}"
)
raise ValueError(msg)
value = as_column(value).astype(self.dtype)

if (
isinstance(key, slice)
and (key_stride == 1 or key_stride is None)
and not is_scalar(value)
):

out = libcudf.copying.copy_range(
value, self, 0, nelem, key_start, key_stop, False
)
else:
try:
if not isinstance(key, Column):
key = as_column(key)
if not is_scalar(value) and not isinstance(value, Column):
value = as_column(value)
out = libcudf.copying.scatter(
value, key, self
)._with_type_metadata(self.dtype)
except RuntimeError as e:
if "out of bounds" in str(e):
raise IndexError(
f"index out of bounds for column of size {len(self)}"
) from e
raise

self._mimic_inplace(out, inplace=True)

def fillna(
self: T,
Expand Down Expand Up @@ -2237,7 +2281,7 @@ def arange(
stop: Union[int, float] = None,
step: Union[int, float] = 1,
dtype=None,
) -> ColumnBase:
) -> "cudf.core.column.NumericalColumn":
isVoid marked this conversation as resolved.
Show resolved Hide resolved
"""
Returns a column with evenly spaced values within a given interval.

Expand Down
29 changes: 28 additions & 1 deletion python/cudf/cudf/tests/test_setitem.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -187,6 +187,33 @@ def test_column_set_equal_length_object_by_mask():
assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500]))


def test_column_set_unequal_length_object_by_mask():
data = [1, 2, 3, 4, 5]
replace_data_1 = [8, 9]
replace_data_2 = [8, 9, 10, 11]
mask = [True, True, False, True, False]

psr = pd.Series(data)
gsr = cudf.Series(data)
assert_exceptions_equal(
psr.__setitem__,
gsr.__setitem__,
([mask, replace_data_1], {}),
([mask, replace_data_1], {}),
compare_error_message=False,
)

psr = pd.Series(data)
gsr = cudf.Series(data)
assert_exceptions_equal(
psr.__setitem__,
gsr.__setitem__,
([mask, replace_data_2], {}),
([mask, replace_data_2], {}),
compare_error_message=False,
)


def test_categorical_setitem_invalid():
ps = pd.Series([1, 2, 3], dtype="category")
gs = cudf.Series([1, 2, 3], dtype="category")
Expand Down