From 40cad3868ae6902cb0fe3dcf2fea16cc5a52fab2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 26 Aug 2021 04:19:31 -0700 Subject: [PATCH] Refactor implementation of column setitem (#9110) This small PR reworks the behavior of `ColumnBase.__setitem__` when it is provided something other than a slice as input, for instance an array. This code path requires scattering the new values into the column, which previously involved converting the Column to a Frame in order to call Frame._scatter. Since that method was only used for this one purpose, the underlying libcudf scatter implementation has been rewritten to accept and return Columns, allowing us to inline the call and also get rid of a round trip from Column to Frame and back. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Marlene (https://github.com/marlenezw) URL: https://github.com/rapidsai/cudf/pull/9110 --- python/cudf/cudf/_lib/copying.pyx | 105 +++++++++---------------- python/cudf/cudf/core/column/column.py | 18 ++--- python/cudf/cudf/core/frame.py | 8 -- 3 files changed, 43 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ed31574b4a5..88f54632000 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -19,7 +19,7 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.reduce import minmax from cudf.core.abc import Serializable @@ -192,92 +192,59 @@ def gather( ) -def _scatter_table(Table source_table, Column scatter_map, - Table target_table, bool bounds_check=True): +def scatter(object source, Column scatter_map, Column target_column, + bool bounds_check=True): + """ + Scattering input into target as per the scatter map, + input can be a list of scalars or can be a table + """ - cdef table_view source_table_view = source_table.data_view() cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = target_table.data_view() + cdef table_view target_table_view = make_table_view((target_column,)) cdef bool c_bounds_check = bounds_check - cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - - data, _ = data_from_unique_ptr( - move(c_result), - column_names=target_table._column_names, - index_names=None - ) - - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) - - -def _scatter_scalar(scalars, Column scatter_map, - Table target_table, bool bounds_check=True): + # Needed for the table branch + cdef table_view source_table_view + # Needed for the scalar branch cdef vector[reference_wrapper[constscalar]] source_scalars - source_scalars.reserve(len(scalars)) - cdef bool c_bounds_check = bounds_check cdef DeviceScalar slr - for val, col in zip(scalars, target_table._columns): - slr = as_device_scalar(val, col.dtype) + + if isinstance(source, Column): + source_table_view = make_table_view(( source,)) + + with nogil: + c_result = move( + cpp_copying.scatter( + source_table_view, + scatter_map_view, + target_table_view, + c_bounds_check + ) + ) + else: + slr = as_device_scalar(source, target_column.dtype) source_scalars.push_back(reference_wrapper[constscalar]( slr.get_raw_ptr()[0])) - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = target_table.data_view() - - cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check + with nogil: + c_result = move( + cpp_copying.scatter( + source_scalars, + scatter_map_view, + target_table_view, + c_bounds_check + ) ) - ) data, _ = data_from_unique_ptr( move(c_result), - column_names=target_table._column_names, + column_names=(None,), index_names=None ) - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) - - -def scatter(object input, object scatter_map, Table target, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or can be a table - """ - - from cudf.core.column.column import as_column - - if not isinstance(scatter_map, Column): - scatter_map = as_column(scatter_map) - - if isinstance(input, Table): - return _scatter_table(input, scatter_map, target, bounds_check) - else: - return _scatter_scalar(input, scatter_map, target, bounds_check) + return next(iter(data.values())) def _reverse_column(Column source_column): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d52f63a79f5..c834efec9fb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -599,17 +599,13 @@ def __setitem__(self, key: Any, value: Any): ) else: try: - if is_scalar(value): - input = self - out = input.as_frame()._scatter(key, [value])._as_column() - else: - if not isinstance(value, Column): - value = as_column(value) - out = ( - self.as_frame() - ._scatter(key, value.as_frame()) - ._as_column() - ) + if not isinstance(key, Column): + key = as_column(key) + if not is_scalar(value) and not isinstance(value, Column): + value = as_column(value) + out = libcudf.copying.scatter( + value, key, self + )._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f743cd8c85..4f46794aa3f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -692,14 +692,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - def _scatter(self, key, value): - result = self.__class__._from_data( - *libcudf.copying.scatter(value, key, self) - ) - - result._copy_type_metadata(self) - return result - def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index)