From a0b0f8c9c348403e5ea7fdebccf118a7403df2cb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 31 Jan 2022 15:20:02 -0800 Subject: [PATCH 01/14] Refactoring table_slice --- python/cudf/cudf/_lib/copying.pyx | 37 ++++++--------- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 18 ++++++++ python/cudf/cudf/core/_base_index.py | 9 +++- python/cudf/cudf/core/dataframe.py | 69 ++++++++++++++-------------- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/index.py | 6 +++ python/cudf/cudf/core/multiindex.py | 3 ++ 8 files changed, 86 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30157bc10ad..df0e78a5634 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -38,6 +38,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.utils cimport ( + columns_from_table_view, columns_from_unique_ptr, data_from_table_view, data_from_unique_ptr, @@ -330,21 +331,18 @@ def column_slice(Column input_column, object indices): return result -def table_slice(input_table, object indices, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_indices - c_indices.reserve(len(indices)) +def columns_slice(input_columns: list, indices: list): + """ + Given a list of input columns, return columns sliced by ``indices``. + Returns a list of list of columns. The length of return is + `len(indices) / 2`. The `i`th item in return is a list of columns sliced + from ``input_columns`` with `slice(indices[i*2], indices[i*2 + 1])`. 
+ """ + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_indices = indices cdef vector[table_view] c_result - cdef int index - for index in indices: - c_indices.push_back(index) - with nogil: c_result = move( cpp_copying.slice( @@ -352,18 +350,11 @@ def table_slice(input_table, object indices, bool keep_index=True): c_indices) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if ( - keep_index is True) - else None - ) - ) for i in range(num_of_result_cols)] + columns_from_table_view( + c_result[i], input_columns + ) for i in range(c_result.size()) + ] def column_split(Column input_column, object splits): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 50893ef9838..ef8b5c156d5 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -17,3 +17,4 @@ cdef data_from_table_view( cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) +cdef columns_from_table_view(table_view tv, object owners) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 40edd4bf9a2..520c9a28dde 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -310,6 +310,24 @@ cdef data_from_unique_ptr( } return data, index +cdef columns_from_table_view( + table_view tv, + object owners, +): + """ + Given a ``cudf::table_view``, constructs a list of columns from it, + along with referencing an ``owner`` Python object that owns the memory + lifetime. ``owner`` must be either None or a list of column. If ``owner`` + is a list of columns, the owner of the `i`th ``cudf::column_view`` in the + table view is ``owners[i]``. 
For more about memory ownership, + see ``Column.from_column_view`` + """ + + return [ + Column.from_column_view( + tv.column(i), owners[i] if isinstance(owners, list) else None + ) for i in range(tv.num_columns()) + ] cdef data_from_table_view( table_view tv, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b1335c7c076..ad02c7d423d 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,7 +4,7 @@ import pickle import warnings -from typing import Any, Set +from typing import Any, Set, Tuple import pandas as pd @@ -75,6 +75,13 @@ def get_loc(self, key, method=None, tolerance=None): def __getitem__(self, key): raise NotImplementedError() + def _data_columns(self) -> Tuple[ColumnBase, ...]: + """Return a tuple of columns that holds actual data. ``RangeIndex`` + returns an empty tuple. Unlike ``_values``, this method does not + materialize columns. + """ + raise NotImplementedError() + def __contains__(self, item): return item in self._values diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c686cd0fd39..7bc0d9ef6a1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1192,52 +1192,53 @@ def _slice(self: T, arg: slice) -> T: return self start, stop, stride = arg.indices(num_rows) - # This is just to handle RangeIndex type, stop - # it from materializing unnecessarily - keep_index = True - if self.index is not None and isinstance(self.index, RangeIndex): + # If index type is RangeIndex, slice without materializing. + is_range_index = isinstance(self.index, RangeIndex) + if is_range_index: if self._num_columns == 0: - result = self._empty_like(keep_index) + result = self._empty_like(keep_index=False) result._index = self.index[start:stop] return result - keep_index = False - # For decreasing slices, terminal at before-the-zero - # position is preserved. 
if start < 0: start = start + num_rows + + # Decreasing slices that terminates at -1, such as slice(4, -1, -1), + # has end index of 0, The check below makes sure -1 is not wrapped + # to `-1 + num_rows`. if stop < 0 and not (stride < 0 and stop == -1): stop = stop + num_rows + stride = 1 if stride is None else stride - if start > stop and (stride is None or stride == 1): - return self._empty_like(keep_index) - else: - start = len(self) if start > num_rows else start - stop = len(self) if stop > num_rows else stop + if start > stop and stride == 1: + return self._empty_like(keep_index=True) - if stride is not None and stride != 1: - return self._gather( - cudf.core.column.arange( - start, stop=stop, step=stride, dtype=np.int32 - ) - ) - else: - result = self._from_data( - *libcudf.copying.table_slice( - self, [start, stop], keep_index - )[0] + start = len(self) if start > num_rows else start + stop = len(self) if stop > num_rows else stop + + if stride != 1: + return self._gather( + cudf.core.column.arange( + start, stop=stop, step=stride, dtype=np.int32 ) + ) + else: + columns_to_slice = [ + *self._index._data_columns(), + *self._columns, + ] + result = self._from_columns_like_self( + libcudf.copying.columns_slice(columns_to_slice, [start, stop])[ + 0 + ], + self._column_names, + None if is_range_index else self._index.names, + ) - result._copy_type_metadata(self, include_index=keep_index) - if self.index is not None: - if keep_index: - result._index.names = self.index.names - else: - # Adding index of type RangeIndex back to - # result - result.index = self.index[start:stop] - result.columns = self.columns - return result + if is_range_index: + result.index = self.index[start:stop] + result.columns = self.columns + return result def memory_usage(self, index=True, deep=False): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 69dc5389e7a..9a8d113e3e7 100644 --- a/python/cudf/cudf/core/frame.py +++ 
b/python/cudf/cudf/core/frame.py @@ -104,7 +104,7 @@ def _index_names(self) -> List[Any]: # TODO: List[str]? ) @property - def _columns(self) -> List[Any]: # TODO: List[Column]? + def _columns(self) -> Tuple[ColumnBase, ...]: return self._data.columns def serialize(self): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 91c7a740699..0c587be64ec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -231,6 +231,9 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) + def _data_columns(self) -> Tuple[ColumnBase, ...]: + return () + def is_numeric(self): return True @@ -825,6 +828,9 @@ def _copy_type_metadata( def _values(self): return self._column + def _data_columns(self) -> Tuple[ColumnBase, ...]: + return self._columns + @classmethod def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e8ff7838a9e..f5e6d194fca 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1090,6 +1090,9 @@ def values(self): """ return self.to_frame(index=False).values + def _data_columns(self) -> Tuple[column.ColumnBase, ...]: + return self._columns + @classmethod def from_frame(cls, df, names=None): """ From 9e794927a6e03d84c7808f030ddc0b042692b0d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 31 Jan 2022 16:49:59 -0800 Subject: [PATCH 02/14] Refactor `table_empty_like`. 
--- python/cudf/cudf/_lib/copying.pyx | 16 +++------------- python/cudf/cudf/core/frame.py | 10 +--------- python/cudf/cudf/core/indexed_frame.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index df0e78a5634..0014f64cbf5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -282,24 +282,14 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def table_empty_like(input_table, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - +def columns_empty_like(input_columns): + cdef table_view input_table_view = table_view_from_columns(input_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return data_from_unique_ptr( - move(c_result), - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if keep_index is True else None - ) - ) + return columns_from_unique_ptr(move(c_result)) def column_slice(Column input_column, object indices): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9a8d113e3e7..9cabeef6a9f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -104,7 +104,7 @@ def _index_names(self) -> List[Any]: # TODO: List[str]? ) @property - def _columns(self) -> Tuple[ColumnBase, ...]: + def _columns(self) -> Tuple[ColumnBase, ...]: # TODO: List[Column]? 
return self._data.columns def serialize(self): @@ -560,14 +560,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - def _empty_like(self, keep_index=True): - result = self.__class__._from_data( - *libcudf.copying.table_empty_like(self, keep_index) - ) - - result._copy_type_metadata(self, include_index=keep_index) - return result - @property def values(self): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9f2de1cb1c..56f7287fd6c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -638,6 +638,20 @@ def drop_duplicates( self._index.names if not ignore_index else None, ) + def _empty_like(self, keep_index=True): + # TODO: RangeIndex._data.columns materializes data, + # which is unecessary here. + return self._from_columns_like_self( + libcudf.copying.columns_empty_like( + [ + *(self._index._data.columns if keep_index else ()), + *self._columns, + ] + ), + self._column_names, + self._index.names if keep_index else None, + ) + def add_prefix(self, prefix): """ Prefix labels with string `prefix`. 
From cd14022a531f97ef7d7f89df9ef6814b3e20b79c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 1 Feb 2022 12:26:01 -0800 Subject: [PATCH 03/14] Refactor split --- python/cudf/cudf/_lib/copying.pyx | 30 +++++++------------------- python/cudf/cudf/core/_base_index.py | 3 +++ python/cudf/cudf/core/frame.py | 18 +++++++++++----- python/cudf/cudf/core/index.py | 5 +++++ python/cudf/cudf/core/indexed_frame.py | 16 ++++++++++++++ 5 files changed, 45 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 0014f64cbf5..d5adb47e0f5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -282,7 +282,7 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def columns_empty_like(input_columns): +def columns_empty_like(input_columns: list): cdef table_view input_table_view = table_view_from_columns(input_columns) cdef unique_ptr[table] c_result @@ -378,21 +378,12 @@ def column_split(Column input_column, object splits): return result -def table_split(input_table, object splits, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_splits - c_splits.reserve(len(splits)) +def columns_split(input_columns: list, object splits): + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_splits = splits cdef vector[table_view] c_result - cdef int split - for split in splits: - c_splits.push_back(split) - with nogil: c_result = move( cpp_copying.split( @@ -400,16 +391,11 @@ def table_split(input_table, object splits, bool keep_index=True): c_splits) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=input_table._index_names if ( - keep_index is True) - else None - ) for i in 
range(num_of_result_cols)] + columns_from_table_view( + c_result[i], input_columns + ) for i in range(c_result.size()) + ] def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ad02c7d423d..410dcad9ea3 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1577,6 +1577,9 @@ def _split_columns_by_levels(self, levels): [], ) + def _split(self, splits): + raise NotImplementedError() + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9cabeef6a9f..dc3c56ef77e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3378,11 +3378,19 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) - def _split(self, splits, keep_index=True): - results = libcudf.copying.table_split( - self, splits, keep_index=keep_index - ) - return [self.__class__._from_data(*result) for result in results] + def _split(self, splits): + """Split a frame with split points in ``splits``. Returns a list of + Frames of length `len(splits) + 1`. 
+ """ + return [ + self._from_columns_like_self( + libcudf.copying.columns_split([*self._data.columns], splits)[ + split_idx + ], + self._column_names, + ) + for split_idx in range(len(splits) + 1) + ] def _encode(self): data, index, indices = libcudf.transform.table_encode(self) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0c587be64ec..520e8c90ba2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -694,6 +694,11 @@ def _apply_boolean_mask(self, boolean_mask): [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) + def _split(self, splits): + return Int64Index._from_columns( + [self._values.columns_split(splits)], [self.name] + ) + # Patch in all binops and unary ops, which bypass __getattr__ on the instance # and prevent the above overload from working. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 56f7287fd6c..3a5e82dc106 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -652,6 +652,22 @@ def _empty_like(self, keep_index=True): self._index.names if keep_index else None, ) + def _split(self, splits, keep_index=True): + return [ + self._from_columns_like_self( + libcudf.copying.columns_split( + [ + *(self._index._data.columns if keep_index else []), + *self._columns, + ], + splits, + )[split_idx], + self._column_names, + self._index.names if keep_index else None, + ) + for split_idx in range(len(splits) + 1) + ] + def add_prefix(self, prefix): """ Prefix labels with string `prefix`. 
From 85ff332bb28bf04abb17b29915981602413c3db8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 2 Feb 2022 16:33:46 -0800 Subject: [PATCH 04/14] Unify type declaration --- python/cudf/cudf/_lib/copying.pyx | 8 ++++---- python/cudf/cudf/_lib/stream_compaction.pyx | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d5adb47e0f5..5e8ee95857f 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -167,7 +167,7 @@ def copy_range(Column input_column, def gather( - columns: list, + list columns, Column gather_map, bool nullify=False ): @@ -282,7 +282,7 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def columns_empty_like(input_columns: list): +def columns_empty_like(list input_columns): cdef table_view input_table_view = table_view_from_columns(input_columns) cdef unique_ptr[table] c_result @@ -321,7 +321,7 @@ def column_slice(Column input_column, object indices): return result -def columns_slice(input_columns: list, indices: list): +def columns_slice(list input_columns, list indices): """ Given a list of input columns, return columns sliced by ``indices``. 
@@ -378,7 +378,7 @@ def column_split(Column input_column, object splits): return result -def columns_split(input_columns: list, object splits): +def columns_split(list input_columns, object splits): cdef table_view input_table_view = table_view_from_columns(input_columns) cdef vector[size_type] c_splits = splits diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c11a221547d 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils cimport ( ) -def drop_nulls(columns: list, how="any", keys=None, thresh=None): +def drop_nulls(list columns, how="any", keys=None, thresh=None): """ Drops null rows from cols depending on key columns. @@ -75,7 +75,7 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(columns: list, Column boolean_mask): +def apply_boolean_mask(list columns, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. 
@@ -104,7 +104,7 @@ def apply_boolean_mask(columns: list, Column boolean_mask): return columns_from_unique_ptr(move(c_result)) -def drop_duplicates(columns: list, +def drop_duplicates(list columns, object keys=None, object keep='first', bool nulls_are_equal=True): From 4fd6dd93107a040da7d7f2ce9c53843663cb29d6 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:22:57 -0800 Subject: [PATCH 05/14] Refactor scatter --- python/cudf/cudf/_lib/copying.pyx | 110 +++++++++++++++---------- python/cudf/cudf/_lib/utils.pxd | 2 +- python/cudf/cudf/core/column/column.py | 6 +- 3 files changed, 69 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4cf696f8248..e462770ef85 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -191,60 +191,80 @@ def gather( return columns_from_unique_ptr(move(c_result)) -def scatter(object source, Column scatter_map, Column target_column, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or can be a table - """ - - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = table_view_from_columns( - (target_column,)) - cdef bool c_bounds_check = bounds_check +cdef scatter_scalar(list source_device_slrs, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef vector[reference_wrapper[constscalar]] c_source + cdef DeviceScalar d_slr cdef unique_ptr[table] c_result - # Needed for the table branch - cdef table_view source_table_view + c_source.reserve(len(source_device_slrs)) + for d_slr in source_device_slrs: + c_source.push_back( + reference_wrapper[constscalar](d_slr.get_raw_ptr()[0]) + ) + + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check + ) + ) - # Needed for the scalar branch - cdef vector[reference_wrapper[constscalar]] 
source_scalars - cdef DeviceScalar slr + return columns_from_unique_ptr(move(c_result)) - if isinstance(source, Column): - source_table_view = table_view_from_columns(( source,)) - with nogil: - c_result = move( - cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - else: - slr = as_device_scalar(source, target_column.dtype) - source_scalars.push_back(reference_wrapper[constscalar]( - slr.get_raw_ptr()[0])) +cdef scatter_column(list source_columns, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef table_view c_source = table_view_from_columns(source_columns) + cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check - ) + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check ) + ) + return columns_from_unique_ptr(move(c_result)) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=(None,), - index_names=None - ) - return next(iter(data.values())) +def scatter(list sources, Column scatter_map, list target_columns, + bool bounds_check=True): + """ + Scattering source into target as per the scatter map. + `source` can be a list of scalars, or a list of columns. The number of + items in `sources` must equal the number of `target_columns` to scatter. + """ + # TODO: Only single column scatter is used, we should explore multi-column + # scatter for frames for performance increase. 
+ + if len(sources) != len(target_columns): + raise ValueError("Mismatched number of source and target columns.") + + if len(sources) == 0: + return [] + + cdef column_view scatter_map_view = scatter_map.view() + cdef table_view target_table_view = table_view_from_columns(target_columns) + + if isinstance(sources[0], Column): + return scatter_column( + sources, scatter_map_view, target_table_view, bounds_check + ) + else: + source_scalars = [as_device_scalar(slr) for slr in sources] + return scatter_scalar( + source_scalars, scatter_map_view, target_table_view, bounds_check + ) def column_empty_like(Column input_column): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index ef8b5c156d5..8a53b71124a 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2788ac6a600..95bb103e364 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -585,9 +585,9 @@ def _scatter_by_column( [value], [self], key )[0]._with_type_metadata(self.dtype) else: - return libcudf.copying.scatter( - value, key, self - )._with_type_metadata(self.dtype) + return libcudf.copying.scatter([value], key, [self])[ + 0 + ]._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( From f0d8b6b855d780d70342d71014bf9450dee3d6e4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:24:26 -0800 Subject: [PATCH 06/14] Revert _data_column introduction --- python/cudf/cudf/core/multiindex.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3c6d86d2b1c..b09a2d39c14 100644 --- 
a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1067,9 +1067,6 @@ def values(self): """ return self.to_frame(index=False).values - def _data_columns(self) -> Tuple[column.ColumnBase, ...]: - return self._columns - @classmethod def from_frame(cls, df, names=None): """ From 683417574edf3910dfeea3578c79a04f0005fac6 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:29:32 -0800 Subject: [PATCH 07/14] doc fix --- python/cudf/cudf/_lib/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 39025500095..8557f430e25 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -321,7 +321,7 @@ cdef columns_from_table_view( lifetime. ``owner`` must be either None or a list of column. If ``owner`` is a list of columns, the owner of the `i`th ``cudf::column_view`` in the table view is ``owners[i]``. For more about memory ownership, - see ``Column.from_column_view`` + see ``Column.from_column_view``. 
""" return [ From da613a191ec86b3ad99f510bdad17b7842832759 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:35:23 -0800 Subject: [PATCH 08/14] Revert more _data_columns introduction --- python/cudf/cudf/core/_base_index.py | 9 +-------- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 6 ------ 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8a2b27bc6eb..a91d2747980 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,7 +4,7 @@ import pickle from functools import cached_property -from typing import Any, Set, Tuple +from typing import Any, Set import pandas as pd @@ -66,13 +66,6 @@ def get_loc(self, key, method=None, tolerance=None): def __getitem__(self, key): raise NotImplementedError() - def _data_columns(self) -> Tuple[ColumnBase, ...]: - """Return a tuple of columns that holds actual data. ``RangeIndex`` - returns an empty tuple. Unlike ``_values``, this method does not - materialize columns. 
- """ - raise NotImplementedError() - def __contains__(self, item): return item in self._values diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1c0d5620492..c88dc7b0c94 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1314,7 +1314,7 @@ def _slice(self: T, arg: slice) -> T: ) else: columns_to_slice = [ - *self._index._data_columns(), + *(self._index._data.columns if not is_range_index else []), *self._columns, ] result = self._from_columns_like_self( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3b81cfb1b5c..89613edef8c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -232,9 +232,6 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) - def _data_columns(self) -> Tuple[ColumnBase, ...]: - return () - def is_numeric(self): return True @@ -863,9 +860,6 @@ def _copy_type_metadata( def _values(self): return self._column - def _data_columns(self) -> Tuple[ColumnBase, ...]: - return self._columns - @classmethod def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): From 52df3cbbcd41492275a43c135836818c586f4844 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:36:40 -0800 Subject: [PATCH 09/14] Remove TODO that's tracked elsewhere. --- python/cudf/cudf/core/indexed_frame.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9b4d688904b..2245c40f7d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -701,8 +701,6 @@ def drop_duplicates( @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): - # TODO: RangeIndex._data.columns materializes data, - # which is unecessary here. 
return self._from_columns_like_self( libcudf.copying.columns_empty_like( [ From 9d878f142f1188e8a7f3a98312a1d813e7eeae1d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 4 Mar 2022 10:43:57 -0800 Subject: [PATCH 10/14] review comments --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a91d2747980..84c916472e5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1529,7 +1529,7 @@ def _split_columns_by_levels(self, levels): ) def _split(self, splits): - raise NotImplementedError() + raise NotImplementedError def _get_result_name(left_name, right_name): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c88dc7b0c94..3faf04029a7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1300,7 +1300,7 @@ def _slice(self: T, arg: slice) -> T: stop = stop + num_rows stride = 1 if stride is None else stride - if start > stop and stride == 1: + if (stop - start) * stride <= 0: return self._empty_like(keep_index=True) start = len(self) if start > num_rows else start From 0afc3e7775e8acd2d4a5cc5fb0e925f0414cba81 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 4 Mar 2022 12:03:04 -0800 Subject: [PATCH 11/14] Avoid dup work in split --- python/cudf/cudf/core/indexed_frame.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2245c40f7d5..35381dbe198 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -713,19 +713,21 @@ def _empty_like(self, keep_index=True): ) def _split(self, splits, keep_index=True): + columns_splitted = libcudf.copying.columns_split( + [ + *(self._index._data.columns if keep_index 
else []), + *self._columns, + ], + splits, + ) + return [ self._from_columns_like_self( - libcudf.copying.columns_split( - [ - *(self._index._data.columns if keep_index else []), - *self._columns, - ], - splits, - )[split_idx], + columns_splitted[i], self._column_names, self._index.names if keep_index else None, ) - for split_idx in range(len(splits) + 1) + for i in range(len(splits) + 1) ] def add_prefix(self, prefix): From 580732942f167fae139e0281e839f510c98f8eaf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 8 Mar 2022 11:53:36 -0800 Subject: [PATCH 12/14] Update python/cudf/cudf/core/indexed_frame.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c3b6fbc0d60..d5af02161e3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -713,7 +713,7 @@ def _empty_like(self, keep_index=True): ) def _split(self, splits, keep_index=True): - columns_splitted = libcudf.copying.columns_split( + columns_split = libcudf.copying.columns_split( [ *(self._index._data.columns if keep_index else []), *self._columns, From c0d8d8e46286d0b19ac5b1e045aa3a52662fd6fa Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 8 Mar 2022 11:53:49 -0800 Subject: [PATCH 13/14] Update python/cudf/cudf/core/indexed_frame.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d5af02161e3..46b5e51df73 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -723,7 +723,7 @@ def _split(self, splits, keep_index=True): return [ self._from_columns_like_self( - columns_splitted[i], + columns_split[i], self._column_names, self._index.names if keep_index else 
None, ) From f4349bdaf828037c31420f61bcffee7ce1da2485 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 8 Mar 2022 11:56:02 -0800 Subject: [PATCH 14/14] Deindent --- python/cudf/cudf/core/dataframe.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d74aeddbda7..e687c274d2f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1312,23 +1312,21 @@ def _slice(self: T, arg: slice) -> T: start, stop=stop, step=stride, dtype=np.int32 ) ) - else: - columns_to_slice = [ - *(self._index._data.columns if not is_range_index else []), - *self._columns, - ] - result = self._from_columns_like_self( - libcudf.copying.columns_slice(columns_to_slice, [start, stop])[ - 0 - ], - self._column_names, - None if is_range_index else self._index.names, - ) - if is_range_index: - result.index = self.index[start:stop] - result._set_column_names_like(self) - return result + columns_to_slice = [ + *(self._index._data.columns if not is_range_index else []), + *self._columns, + ] + result = self._from_columns_like_self( + libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0], + self._column_names, + None if is_range_index else self._index.names, + ) + + if is_range_index: + result.index = self.index[start:stop] + result._set_column_names_like(self) + return result @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False):