Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrites column.__setitem__, Use boolean_mask_scatter #10202

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
685009f
Refactor of column.__setitem__
isVoid Feb 3, 2022
570b0c4
Remove comments
isVoid Feb 3, 2022
07f3bd4
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 10, 2022
3fc4846
Update copyright years
isVoid Feb 10, 2022
5838607
Use a ternary operator and check key type with isinstance
isVoid Feb 14, 2022
f85cddb
Update err message
isVoid Feb 14, 2022
ed6a4fa
Remove wrong param style
isVoid Feb 14, 2022
9ef7f14
Merge branch 'improvement/column_setitem_refactor' of github.com:isVo…
isVoid Feb 14, 2022
f50c57e
Improving index handling
isVoid Feb 14, 2022
b13eb69
Update python/cudf/cudf/core/column/column.py
isVoid Feb 14, 2022
6576af8
Use `as_device_scalar` factory function; Better handling of nelem in
isVoid Feb 15, 2022
62b8552
Change `arange` type annotation
isVoid Feb 15, 2022
6667be2
Revert to scalar constructor.
isVoid Feb 15, 2022
cdbabc4
use `scalar.isvalid`
isVoid Feb 15, 2022
6cb2992
Add comment
isVoid Feb 15, 2022
6f092d6
Apply suggestions from code review
isVoid Feb 16, 2022
b5c2db1
Apply suggestions from code review
isVoid Feb 16, 2022
e6b2833
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 16, 2022
77e6693
Merge branch 'improvement/column_setitem_refactor' of github.com:isVo…
isVoid Feb 16, 2022
5d5714e
update boolean mask scatter docstring
isVoid Feb 16, 2022
a111f7d
style
isVoid Feb 16, 2022
2d3486f
Addressing multiple reviews
isVoid Feb 17, 2022
d5d209c
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 17, 2022
1754580
simple things to get ci unblocked
isVoid Feb 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 30 additions & 26 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import pickle

Expand Down Expand Up @@ -537,11 +537,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask):
as_device_scalar(lhs), as_device_scalar(rhs), boolean_mask)


def _boolean_mask_scatter_table(input_table, target_table,
Column boolean_mask):
def _boolean_mask_scatter_columns(list input_columns, list target_columns,
Column boolean_mask):

cdef table_view input_table_view = table_view_from_columns(input_table)
cdef table_view target_table_view = table_view_from_columns(target_table)
cdef table_view input_table_view = table_view_from_columns(input_columns)
cdef table_view target_table_view = table_view_from_columns(target_columns)
cdef column_view boolean_mask_view = boolean_mask.view()

cdef unique_ptr[table] c_result
Expand All @@ -555,14 +555,10 @@ def _boolean_mask_scatter_table(input_table, target_table,
)
)

return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
)
return columns_from_unique_ptr(move(c_result))


def _boolean_mask_scatter_scalar(list input_scalars, target_table,
def _boolean_mask_scatter_scalar(list input_scalars, list target_columns,
Column boolean_mask):

cdef vector[reference_wrapper[constscalar]] input_scalar_vector
Expand All @@ -571,7 +567,7 @@ def _boolean_mask_scatter_scalar(list input_scalars, target_table,
for scl in input_scalars:
input_scalar_vector.push_back(reference_wrapper[constscalar](
scl.get_raw_ptr()[0]))
cdef table_view target_table_view = table_view_from_columns(target_table)
cdef table_view target_table_view = table_view_from_columns(target_columns)
cdef column_view boolean_mask_view = boolean_mask.view()

cdef unique_ptr[table] c_result
Expand All @@ -585,29 +581,37 @@ def _boolean_mask_scatter_scalar(list input_scalars, target_table,
)
)

return data_from_unique_ptr(
move(c_result),
column_names=target_table._column_names,
index_names=target_table._index._column_names
)
return columns_from_unique_ptr(move(c_result))


# TODO: This function is currently unused but should be used in
# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667.
def boolean_mask_scatter(object input, target_table,
def boolean_mask_scatter(list input_, list target_columns,
Column boolean_mask):
"""Copy the target columns, replacing masked rows with input data.

The ``input_`` data can be a list of columns or as a list of scalars.
A list of input columns will be used to replace corresponding rows in the
target columns for which the boolean mask is ``True``. For the nth ``True``
in the boolean mask, the nth row in ``input_`` is used to replace. A list
of input scalars will replace all rows in the target columns for which the
boolean mask is ``True``.
"""
if len(input_) != len(target_columns):
raise ValueError("Mismatched number of input and target columns.")

if len(input_) == 0:
return []

if isinstance(input, cudf.core.frame.Frame):
return _boolean_mask_scatter_table(
input,
target_table,
if isinstance(input_[0], Column):
return _boolean_mask_scatter_columns(
input_,
target_columns,
boolean_mask
)
else:
scalar_list = [as_device_scalar(i) for i in input]
scalar_list = [as_device_scalar(i) for i in input_]
return _boolean_mask_scatter_scalar(
scalar_list,
target_table,
target_columns,
boolean_mask
)

Expand Down
178 changes: 110 additions & 68 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
from cudf.utils.utils import NotIterable, mask_dtype

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, NotIterable):
Expand Down Expand Up @@ -315,22 +318,24 @@ def _fill(
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

fill_scalar = as_device_scalar(fill_value, self.dtype)
# Constructing a cuDF scalar can cut unnecessary DtoH copy if
# the scalar is None when calling `is_valid`.
slr = cudf.Scalar(fill_value, dtype=self.dtype)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

if not inplace:
return libcudf.filling.fill(self, begin, end, fill_scalar)
return libcudf.filling.fill(self, begin, end, slr.device_value)

if is_string_dtype(self.dtype):
return self._mimic_inplace(
libcudf.filling.fill(self, begin, end, fill_scalar),
libcudf.filling.fill(self, begin, end, slr.device_value),
inplace=True,
)

if fill_value is None and not self.nullable:
if not slr.is_valid() and not self.nullable:
mask = create_null_mask(self.size, state=MaskState.ALL_VALID)
self.set_base_mask(mask)

libcudf.filling.fill_in_place(self, begin, end, fill_scalar)
libcudf.filling.fill_in_place(self, begin, end, slr.device_value)

return self

Expand Down Expand Up @@ -489,82 +494,119 @@ def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]:

def __setitem__(self, key: Any, value: Any):
"""
Set the value of self[key] to value.
Set the value of ``self[key]`` to ``value``.

If value and self are of different types,
value is coerced to self.dtype
If ``value`` and ``self`` are of different types, ``value`` is coerced
to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned.
"""

# Normalize value to scalar/column
value_normalized = (
cudf.Scalar(value, dtype=self.dtype)
if is_scalar(value)
else as_column(value, dtype=self.dtype)
)

out: Optional[ColumnBase] # If None, no need to perform mimic inplace.
if isinstance(key, slice):
key_start, key_stop, key_stride = key.indices(len(self))
if key_start < 0:
key_start = key_start + len(self)
if key_stop < 0:
key_stop = key_stop + len(self)
if key_start >= key_stop:
return self.copy()
if (key_stride is None or key_stride == 1) and is_scalar(value):
return self._fill(value, key_start, key_stop, inplace=True)
if key_stride != 1 or key_stride is not None or is_scalar(value):
key = arange(
start=key_start,
stop=key_stop,
step=key_stride,
dtype=cudf.dtype(np.int32),
)
nelem = len(key)
else:
nelem = abs(key_stop - key_start)
out = self._scatter_by_slice(key, value_normalized)
else:
key = as_column(key)
if is_bool_dtype(key.dtype):
if not len(key) == len(self):
raise ValueError(
"Boolean mask must be of same length as column"
)
key = arange(len(self))[key]
if hasattr(value, "__len__") and len(value) == len(self):
value = as_column(value)[key]
nelem = len(key)
if not isinstance(key, cudf.core.column.NumericalColumn):
raise ValueError(f"Invalid scatter map type {key.dtype}.")
out = self._scatter_by_column(key, value_normalized)

if is_scalar(value):
value = cudf.Scalar(value, dtype=self.dtype)
else:
if len(value) != nelem:
msg = (
f"Size mismatch: cannot set value "
f"of size {len(value)} to indexing result of size "
f"{nelem}"
if out:
self._mimic_inplace(out, inplace=True)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

def _scatter_by_slice(
self, key: Slice, value: Union[cudf.core.scalar.Scalar, ColumnBase]
) -> Optional[ColumnBase]:
"""If this function returns None, it's either a no-op (slice is empty),
or the inplace replacement is already performed (fill-in-place).
vyasr marked this conversation as resolved.
Show resolved Hide resolved
"""
start, stop, step = key.indices(len(self))
if start >= stop:
return None
num_keys = (stop - start) // step

self._check_scatter_key_length(num_keys, value)

if step == 1:
if isinstance(value, cudf.core.scalar.Scalar):
return self._fill(value, start, stop, inplace=True)
else:
return libcudf.copying.copy_range(
value, self, 0, num_keys, start, stop, False
)
raise ValueError(msg)
value = as_column(value).astype(self.dtype)

if (
isinstance(key, slice)
and (key_stride == 1 or key_stride is None)
and not is_scalar(value)
):
# step != 1, create a scatter map with arange
scatter_map = arange(
start=start, stop=stop, step=step, dtype=cudf.dtype(np.int32),
)

out = libcudf.copying.copy_range(
value, self, 0, nelem, key_start, key_stop, False
)
return self._scatter_by_column(scatter_map, value)

def _scatter_by_column(
self,
key: cudf.core.column.NumericalColumn,
value: Union[cudf.core.scalar.Scalar, ColumnBase],
) -> ColumnBase:
if is_bool_dtype(key.dtype):
# `key` is boolean mask
if len(key) != len(self):
raise ValueError(
"Boolean mask must be of same length as column"
)
if isinstance(value, ColumnBase) and len(self) == len(value):
# Both value and key are aligned to self. Thus, the values
# corresponding to the false values in key should be
# ignored.
value = value.apply_boolean_mask(key)
# After applying boolean mask, the length of value equals
# the number of elements to scatter, we can skip computing
# the sum of ``key`` below.
num_keys = len(value)
else:
# Compute the number of element to scatter by summing all
# `True`s in the boolean mask.
num_keys = key.sum()
else:
try:
if not isinstance(key, Column):
key = as_column(key)
if not is_scalar(value) and not isinstance(value, Column):
value = as_column(value)
out = libcudf.copying.scatter(
# `key` is integer scatter map
num_keys = len(key)

self._check_scatter_key_length(num_keys, value)

try:
if is_bool_dtype(key.dtype):
return libcudf.copying.boolean_mask_scatter(
[value], [self], key
)[0]._with_type_metadata(self.dtype)
else:
return libcudf.copying.scatter(
value, key, self
)._with_type_metadata(self.dtype)
except RuntimeError as e:
if "out of bounds" in str(e):
raise IndexError(
f"index out of bounds for column of size {len(self)}"
) from e
raise
except RuntimeError as e:
if "out of bounds" in str(e):
raise IndexError(
f"index out of bounds for column of size {len(self)}"
) from e
raise

self._mimic_inplace(out, inplace=True)
def _check_scatter_key_length(
self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase]
):
"""`num_keys` is the number of keys to scatter. Should equal to the
number of rows in ``value`` if ``value`` is a column.
"""
if isinstance(value, ColumnBase):
if len(value) != num_keys:
msg = (
f"Size mismatch: cannot set value "
f"of size {len(value)} to indexing result of size "
f"{num_keys}"
)
raise ValueError(msg)

def fillna(
self: T,
Expand Down Expand Up @@ -2245,7 +2287,7 @@ def arange(
stop: Union[int, float] = None,
step: Union[int, float] = 1,
dtype=None,
) -> ColumnBase:
) -> cudf.core.column.NumericalColumn:
"""
Returns a column with evenly spaced values within a given interval.

Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/df_protocol.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

import collections
import enum
from typing import (
Expand Down
16 changes: 12 additions & 4 deletions python/cudf/cudf/tests/test_df_protocol.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from typing import Any, Tuple

import cupy as cp
Expand Down Expand Up @@ -27,13 +29,19 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
)
# check that non null values are the equals as nulls are represented
# by sentinel values in the buffer.
non_null_idxs = cudf.Series(cudfcol) != cudf.NA
# FIXME: In gh-10202 some minimal fixes were added to unblock CI. But
# currently only non-null values are compared, null positions are
# unchecked.
non_null_idxs = ~cudf.Series(cudfcol).isna()
assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])

if dtype[0] != _DtypeKind.BOOL:
array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
col_array = cp.asarray(cudfcol.data_array_view)
assert_eq(array_from_dlpack.flatten(), col_array.flatten())
array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get()
col_array = cp.asarray(cudfcol.data_array_view).get()
assert_eq(
array_from_dlpack[non_null_idxs.to_numpy()].flatten(),
col_array[non_null_idxs.to_numpy()].flatten(),
)
else:
pytest.raises(TypeError, buf.__dlpack__)

Expand Down
Loading