From 0e5088b58aefa15035c1dcbac24ae53a6323044e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 14:30:38 -0700 Subject: [PATCH 1/8] Inline _scatter_scalars into _scatter_table. --- python/cudf/cudf/_lib/copying.pyx | 77 ++++++++++---------------- python/cudf/cudf/core/column/column.py | 2 +- 2 files changed, 31 insertions(+), 48 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ed31574b4a5..8a652344243 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -192,63 +192,49 @@ def gather( ) -def _scatter_table(Table source_table, Column scatter_map, +def _scatter_table(object source, Column scatter_map, Table target_table, bool bounds_check=True): - cdef table_view source_table_view = source_table.data_view() cdef column_view scatter_map_view = scatter_map.view() + # cdef table_view target_table_view = target_column.table_view() cdef table_view target_table_view = target_table.data_view() cdef bool c_bounds_check = bounds_check - cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - - data, _ = data_from_unique_ptr( - move(c_result), - column_names=target_table._column_names, - index_names=None - ) - - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) - - -def _scatter_scalar(scalars, Column scatter_map, - Table target_table, bool bounds_check=True): + # Needed for the table branch + cdef table_view source_table_view + # Needed for the scalar branch cdef vector[reference_wrapper[constscalar]] source_scalars - source_scalars.reserve(len(scalars)) - cdef bool c_bounds_check = bounds_check cdef DeviceScalar slr - for val, col in zip(scalars, target_table._columns): - slr = as_device_scalar(val, col.dtype) + + if isinstance(source, Table): + source_table_view = ( source).data_view() + + with nogil: + c_result = move( + cpp_copying.scatter( + source_table_view, + scatter_map_view, + target_table_view, + c_bounds_check + ) + ) + else: + target_column = next(iter(target_table._columns)) + slr = as_device_scalar(source, target_column.dtype) source_scalars.push_back(reference_wrapper[constscalar]( slr.get_raw_ptr()[0])) - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = target_table.data_view() - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check + with nogil: + c_result = move( + cpp_copying.scatter( + source_scalars, + scatter_map_view, + target_table_view, + c_bounds_check + ) ) - ) data, _ = data_from_unique_ptr( move(c_result), @@ -274,10 +260,7 @@ def scatter(object input, object scatter_map, Table target, if not isinstance(scatter_map, Column): scatter_map = as_column(scatter_map) - if isinstance(input, Table): - return _scatter_table(input, scatter_map, target, bounds_check) - else: - return _scatter_scalar(input, scatter_map, target, bounds_check) + return _scatter_table(input, scatter_map, target, bounds_check) def _reverse_column(Column source_column): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d52f63a79f5..cea49912440 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -601,7 +601,7 @@ def __setitem__(self, key: Any, value: Any): try: if is_scalar(value): input = self - out = input.as_frame()._scatter(key, [value])._as_column() + out = input.as_frame()._scatter(key, value)._as_column() else: if not isinstance(value, Column): value = as_column(value) From 93c933b9db77c8b533f61646c86f249d0423d1de Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 14:40:28 -0700 Subject: [PATCH 2/8] Inline _scatter_table into scatter. --- python/cudf/cudf/_lib/copying.pyx | 25 ++++++++----------------- python/cudf/cudf/core/column/column.py | 2 ++ 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 8a652344243..e9065c59d38 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -192,8 +192,14 @@ def gather( ) -def _scatter_table(object source, Column scatter_map, - Table target_table, bool bounds_check=True): +def scatter(object source, Column scatter_map, Table target_table, + bool bounds_check=True): + """ + Scattering input into target as per the scatter map, + input can be a list of scalars or can be a table + """ + + from cudf.core.column.column import as_column cdef column_view scatter_map_view = scatter_map.view() # cdef table_view target_table_view = target_column.table_view() @@ -248,21 +254,6 @@ def _scatter_table(object source, Column scatter_map, ) -def scatter(object input, object scatter_map, Table target, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or can be a table - """ - - from cudf.core.column.column import as_column - - if not isinstance(scatter_map, Column): - scatter_map = as_column(scatter_map) - - return _scatter_table(input, scatter_map, target, bounds_check) - - def _reverse_column(Column source_column): cdef column_view reverse_column_view = source_column.view() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cea49912440..1d6017f1b0e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -599,6 +599,8 @@ def __setitem__(self, key: Any, value: Any): ) else: try: + if not isinstance(key, Column): + key = as_column(key) if is_scalar(value): input = self out = input.as_frame()._scatter(key, value)._as_column() From 6d1966f82ad35a035d2b07447b1da832b4e68c22 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 15:06:30 -0700 Subject: [PATCH 3/8] Inline call to _scatter so that it can be removed. --- python/cudf/cudf/core/column/column.py | 16 ++++++++++------ python/cudf/cudf/core/frame.py | 8 -------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1d6017f1b0e..468a431038c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -599,19 +599,23 @@ def __setitem__(self, key: Any, value: Any): ) else: try: + tmp = self.as_frame() if not isinstance(key, Column): key = as_column(key) if is_scalar(value): - input = self - out = input.as_frame()._scatter(key, value)._as_column() + result = tmp.__class__._from_data( + *libcudf.copying.scatter(value, key, tmp) + ) + result._copy_type_metadata(tmp) + out = result._as_column() else: if not isinstance(value, Column): value = as_column(value) - out = ( - self.as_frame() - ._scatter(key, value.as_frame()) - ._as_column() + result = tmp.__class__._from_data( + *libcudf.copying.scatter(value.as_frame(), key, tmp) ) + result._copy_type_metadata(tmp) + out = result._as_column() except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f743cd8c85..4f46794aa3f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -692,14 +692,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - def _scatter(self, key, value): - result = self.__class__._from_data( - *libcudf.copying.scatter(value, key, self) - ) - - result._copy_type_metadata(self) - return result - def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) From b7f1547ac43827eb5efea95e30e6e268f9450c27 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 15:14:04 -0700 Subject: [PATCH 4/8] Unify scalar and table paths as much as possible. --- python/cudf/cudf/core/column/column.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 468a431038c..0034e33154a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -600,22 +600,19 @@ def __setitem__(self, key: Any, value: Any): else: try: tmp = self.as_frame() + if not isinstance(key, Column): key = as_column(key) - if is_scalar(value): - result = tmp.__class__._from_data( - *libcudf.copying.scatter(value, key, tmp) - ) - result._copy_type_metadata(tmp) - out = result._as_column() - else: + + if not is_scalar(value): if not isinstance(value, Column): value = as_column(value) - result = tmp.__class__._from_data( - *libcudf.copying.scatter(value.as_frame(), key, tmp) - ) - result._copy_type_metadata(tmp) - out = result._as_column() + value = value.as_frame() + result = tmp.__class__._from_data( + *libcudf.copying.scatter(value, key, tmp) + ) + result._copy_type_metadata(tmp) + out = result._as_column() except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( From b2bb3a60cbed243f345a5e4f8b5e3be137a22616 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 15:57:08 -0700 Subject: [PATCH 5/8] Pass source as a column rather than a table. --- python/cudf/cudf/_lib/copying.pyx | 6 +++--- python/cudf/cudf/core/column/column.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index e9065c59d38..c718d4ee9f9 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -19,7 +19,7 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.reduce import minmax from cudf.core.abc import Serializable @@ -214,8 +214,8 @@ def scatter(object source, Column scatter_map, Table target_table, cdef vector[reference_wrapper[constscalar]] source_scalars cdef DeviceScalar slr - if isinstance(source, Table): - source_table_view = (
source).data_view() + if isinstance(source, Column): + source_table_view = make_table_view(( source,)) with nogil: c_result = move( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0034e33154a..a4665a1a95c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -604,10 +604,8 @@ def __setitem__(self, key: Any, value: Any): if not isinstance(key, Column): key = as_column(key) - if not is_scalar(value): - if not isinstance(value, Column): - value = as_column(value) - value = value.as_frame() + if not is_scalar(value) and not isinstance(value, Column): + value = as_column(value) result = tmp.__class__._from_data( *libcudf.copying.scatter(value, key, tmp) ) From aab6897d72f0984a2aa3a28567d21e360d32e654 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 16:27:36 -0700 Subject: [PATCH 6/8] Change scatter to return the column. --- python/cudf/cudf/_lib/copying.pyx | 7 +------ python/cudf/cudf/core/column/column.py | 7 ++----- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index c718d4ee9f9..ef63f7c8e95 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -199,8 +199,6 @@ def scatter(object source, Column scatter_map, Table target_table, input can be a list of scalars or can be a table """ - from cudf.core.column.column import as_column - cdef column_view scatter_map_view = scatter_map.view() # cdef table_view target_table_view = target_column.table_view() cdef table_view target_table_view = target_table.data_view() @@ -248,10 +246,7 @@ def scatter(object source, Column scatter_map, Table target_table, index_names=None ) - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) + return next(iter(data.values())) def _reverse_column(Column source_column): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a4665a1a95c..a93678ad789 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -606,11 +606,8 @@ def __setitem__(self, key: Any, value: Any): if not is_scalar(value) and not isinstance(value, Column): value = as_column(value) - result = tmp.__class__._from_data( - *libcudf.copying.scatter(value, key, tmp) - ) - result._copy_type_metadata(tmp) - out = result._as_column() + out = libcudf.copying.scatter(value, key, tmp) + out._with_type_metadata(self) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( From 1dd090d1c2d31d0df3b295b5fee18c1d62dcbbdb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Aug 2021 16:30:30 -0700 Subject: [PATCH 7/8] Pass column rather than table as argument. --- python/cudf/cudf/_lib/copying.pyx | 8 +++----- python/cudf/cudf/core/column/column.py | 5 +---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ef63f7c8e95..88f54632000 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -192,7 +192,7 @@ def gather( ) -def scatter(object source, Column scatter_map, Table target_table, +def scatter(object source, Column scatter_map, Column target_column, bool bounds_check=True): """ Scattering input into target as per the scatter map, @@ -200,8 +200,7 @@ def scatter(object source, Column scatter_map, Table target_table, """ cdef column_view scatter_map_view = scatter_map.view() - # cdef table_view target_table_view = target_column.table_view() - cdef table_view target_table_view = target_table.data_view() + cdef table_view target_table_view = make_table_view((target_column,)) cdef bool c_bounds_check = bounds_check cdef unique_ptr[table] c_result @@ -225,7 +224,6 @@ def scatter(object source, Column scatter_map, Table target_table, ) ) else: - target_column = next(iter(target_table._columns)) slr = as_device_scalar(source, target_column.dtype) source_scalars.push_back(reference_wrapper[constscalar]( slr.get_raw_ptr()[0])) @@ -242,7 +240,7 @@ def scatter(object source, Column scatter_map, Table target_table, data, _ = data_from_unique_ptr( move(c_result), - column_names=target_table._column_names, + column_names=(None,), index_names=None ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a93678ad789..a13160f02b1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -599,14 +599,11 @@ def __setitem__(self, key: Any, value: Any): ) else: try: - tmp = self.as_frame() - if not isinstance(key, Column): key = as_column(key) - if not is_scalar(value) and not isinstance(value, Column): value = as_column(value) - out = libcudf.copying.scatter(value, key, tmp) + out = libcudf.copying.scatter(value, key, self) out._with_type_metadata(self) except RuntimeError as e: if "out of bounds" in str(e): From 4334941d58a70cd87abae1a92fbe33c5f6bce7b1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 25 Aug 2021 10:27:21 -0700 Subject: [PATCH 8/8] Fix metadata reassignment. --- python/cudf/cudf/core/column/column.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a13160f02b1..c834efec9fb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -603,8 +603,9 @@ def __setitem__(self, key: Any, value: Any): key = as_column(key) if not is_scalar(value) and not isinstance(value, Column): value = as_column(value) - out = libcudf.copying.scatter(value, key, self) - out._with_type_metadata(self) + out = libcudf.copying.scatter( + value, key, self + )._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError(