diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 2f18c904c05..abf20869a15 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -38,6 +38,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.utils cimport ( + columns_from_table_view, columns_from_unique_ptr, data_from_table_view, data_from_unique_ptr, @@ -166,7 +167,7 @@ def copy_range(Column input_column, def gather( - columns: list, + list columns, Column gather_map, bool nullify=False ): @@ -190,60 +191,80 @@ def gather( return columns_from_unique_ptr(move(c_result)) -def scatter(object source, Column scatter_map, Column target_column, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or can be a table - """ - - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = table_view_from_columns( - (target_column,)) - cdef bool c_bounds_check = bounds_check +cdef scatter_scalar(list source_device_slrs, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef vector[reference_wrapper[constscalar]] c_source + cdef DeviceScalar d_slr cdef unique_ptr[table] c_result - # Needed for the table branch - cdef table_view source_table_view + c_source.reserve(len(source_device_slrs)) + for d_slr in source_device_slrs: + c_source.push_back( + reference_wrapper[constscalar](d_slr.get_raw_ptr()[0]) + ) + + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check + ) + ) - # Needed for the scalar branch - cdef vector[reference_wrapper[constscalar]] source_scalars - cdef DeviceScalar slr + return columns_from_unique_ptr(move(c_result)) - if isinstance(source, Column): - source_table_view = table_view_from_columns(( source,)) - with nogil: - c_result = move( - 
cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - else: - slr = as_device_scalar(source, target_column.dtype) - source_scalars.push_back(reference_wrapper[constscalar]( - slr.get_raw_ptr()[0])) +cdef scatter_column(list source_columns, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef table_view c_source = table_view_from_columns(source_columns) + cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check - ) + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check ) + ) + return columns_from_unique_ptr(move(c_result)) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=(None,), - index_names=None - ) - return next(iter(data.values())) +def scatter(list sources, Column scatter_map, list target_columns, + bool bounds_check=True): + """ + Scatter `sources` into `target_columns` as per the scatter map. + `sources` can be a list of scalars, or a list of columns. The number of + items in `sources` must equal the number of `target_columns` to scatter. + """ + # TODO: Only single-column scatter is used; we should explore multi-column + # scatter for frames for a performance increase. 
+ + if len(sources) != len(target_columns): + raise ValueError("Mismatched number of source and target columns.") + + if len(sources) == 0: + return [] + + cdef column_view scatter_map_view = scatter_map.view() + cdef table_view target_table_view = table_view_from_columns(target_columns) + + if isinstance(sources[0], Column): + return scatter_column( + sources, scatter_map_view, target_table_view, bounds_check + ) + else: + source_scalars = [as_device_scalar(slr) for slr in sources] + return scatter_scalar( + source_scalars, scatter_map_view, target_table_view, bounds_check + ) def column_empty_like(Column input_column): @@ -281,24 +302,14 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def table_empty_like(input_table, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - +def columns_empty_like(list input_columns): + cdef table_view input_table_view = table_view_from_columns(input_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return data_from_unique_ptr( - move(c_result), - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if keep_index is True else None - ) - ) + return columns_from_unique_ptr(move(c_result)) def column_slice(Column input_column, object indices): @@ -330,21 +341,18 @@ def column_slice(Column input_column, object indices): return result -def table_slice(input_table, object indices, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_indices - c_indices.reserve(len(indices)) +def columns_slice(list input_columns, list indices): + """ + Given a list of input columns, return columns sliced by ``indices``. + Returns a list of lists of columns. The length of the returned list is + `len(indices) / 2`. 
The `i`th item in return is a list of columns sliced + from ``input_columns`` with `slice(indices[i*2], indices[i*2 + 1])`. + """ + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_indices = indices cdef vector[table_view] c_result - cdef int index - for index in indices: - c_indices.push_back(index) - with nogil: c_result = move( cpp_copying.slice( @@ -352,18 +360,11 @@ def table_slice(input_table, object indices, bool keep_index=True): c_indices) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if ( - keep_index is True) - else None - ) - ) for i in range(num_of_result_cols)] + columns_from_table_view( + c_result[i], input_columns + ) for i in range(c_result.size()) + ] def column_split(Column input_column, object splits): @@ -397,21 +398,12 @@ def column_split(Column input_column, object splits): return result -def table_split(input_table, object splits, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_splits - c_splits.reserve(len(splits)) +def columns_split(list input_columns, object splits): + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_splits = splits cdef vector[table_view] c_result - cdef int split - for split in splits: - c_splits.push_back(split) - with nogil: c_result = move( cpp_copying.split( @@ -419,16 +411,11 @@ def table_split(input_table, object splits, bool keep_index=True): c_splits) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=input_table._index_names if ( - keep_index is True) - else None - ) for i in range(num_of_result_cols)] + columns_from_table_view( + c_result[i], 
input_columns + ) for i in range(c_result.size()) + ] def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index c4f885382f3..876c7145399 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils cimport ( ) -def drop_nulls(columns: list, how="any", keys=None, thresh=None): +def drop_nulls(list columns, how="any", keys=None, thresh=None): """ Drops null rows from cols depending on key columns. @@ -75,7 +75,7 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(columns: list, Column boolean_mask): +def apply_boolean_mask(list columns, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. @@ -104,7 +104,7 @@ def apply_boolean_mask(columns: list, Column boolean_mask): return columns_from_unique_ptr(move(c_result)) -def drop_duplicates(columns: list, +def drop_duplicates(list columns, object keys=None, object keep='first', bool nulls_are_equal=True): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 50893ef9838..8a53b71124a 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -17,3 +17,4 @@ cdef data_from_table_view( cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) +cdef columns_from_table_view(table_view tv, object owners) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 91dfea735a1..8557f430e25 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -311,6 +311,24 @@ cdef data_from_unique_ptr( } return data, index +cdef columns_from_table_view( + table_view tv, + object owners, +): + """ + Given a ``cudf::table_view``, constructs a list of columns from it, + along with referencing the ``owners`` Python object that owns the memory + lifetime. ``owners`` must be either None or a list of columns. If ``owners`` + is a list of columns, the owner of the `i`th ``cudf::column_view`` in the + table view is ``owners[i]``. For more about memory ownership, + see ``Column.from_column_view``. 
+ """ + + return [ + Column.from_column_view( + tv.column(i), owners[i] if isinstance(owners, list) else None + ) for i in range(tv.num_columns()) + ] cdef data_from_table_view( table_view tv, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 74f22780a6c..60f739cff8b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1522,6 +1522,9 @@ def _split_columns_by_levels(self, levels): [], ) + def _split(self, splits): + raise NotImplementedError + def sample( self, n=None, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 775bc365cee..03a4bdd7b4d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -593,9 +593,9 @@ def _scatter_by_column( [value], [self], key )[0]._with_type_metadata(self.dtype) else: - return libcudf.copying.scatter( - value, key, self - )._with_type_metadata(self.dtype) + return libcudf.copying.scatter([value], key, [self])[ + 0 + ]._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bf880e3b25a..e687c274d2f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1282,52 +1282,51 @@ def _slice(self: T, arg: slice) -> T: ), ) - # This is just to handle RangeIndex type, stop - # it from materializing unnecessarily - keep_index = True - if self.index is not None and isinstance(self.index, RangeIndex): + # If index type is RangeIndex, slice without materializing. 
+ is_range_index = isinstance(self.index, RangeIndex) + if is_range_index: if self._num_columns == 0: - result = self._empty_like(keep_index) + result = self._empty_like(keep_index=False) result._index = self.index[start:stop:stride] return result - keep_index = False - # For decreasing slices, terminal at before-the-zero - # position is preserved. if start < 0: start = start + num_rows + + # A decreasing slice that terminates at -1, such as slice(4, -1, -1), + # has an end index of 0. The check below makes sure -1 is not wrapped + # to `-1 + num_rows`. if stop < 0 and not (stride < 0 and stop == -1): stop = stop + num_rows + stride = 1 if stride is None else stride - if start > stop and (stride is None or stride == 1): - return self._empty_like(keep_index) - else: - start = len(self) if start > num_rows else start - stop = len(self) if stop > num_rows else stop + if (stop - start) * stride <= 0: + return self._empty_like(keep_index=True) - if stride is not None and stride != 1: - return self._gather( - cudf.core.column.arange( - start, stop=stop, step=stride, dtype=np.int32 - ) - ) - else: - result = self._from_data( - *libcudf.copying.table_slice( - self, [start, stop], keep_index - )[0] + start = len(self) if start > num_rows else start + stop = len(self) if stop > num_rows else stop + + if stride != 1: + return self._gather( + cudf.core.column.arange( + start, stop=stop, step=stride, dtype=np.int32 ) + ) - result._copy_type_metadata(self, include_index=keep_index) - if self.index is not None: - if keep_index: - result._index.names = self.index.names - else: - # Adding index of type RangeIndex back to - # result - result.index = self.index[start:stop] - result._set_column_names_like(self) - return result + columns_to_slice = [ + *(self._index._data.columns if not is_range_index else []), + *self._columns, + ] + result = self._from_columns_like_self( + libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0], + self._column_names, + None if is_range_index 
else self._index.names, + ) + + if is_range_index: + result.index = self.index[start:stop] + result._set_column_names_like(self) + return result @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 58a45645d59..aba2b6d1a11 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -643,15 +643,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") - def _empty_like(self, keep_index=True): - result = self.__class__._from_data( - *libcudf.copying.table_empty_like(self, keep_index) - ) - - result._copy_type_metadata(self, include_index=keep_index) - return result - @property def values(self): """ @@ -3353,11 +3344,19 @@ def _is_sorted(self, ascending=None, null_position=None): ) @annotate("FRAME_SPLIT", color="green", domain="cudf_python") - def _split(self, splits, keep_index=True): - results = libcudf.copying.table_split( - self, splits, keep_index=keep_index - ) - return [self.__class__._from_data(*result) for result in results] + def _split(self, splits): + """Split a frame with split points in ``splits``. Returns a list of + Frames of length `len(splits) + 1`. 
+ """ + return [ + self._from_columns_like_self( + libcudf.copying.columns_split([*self._data.columns], splits)[ + split_idx + ], + self._column_names, + ) + for split_idx in range(len(splits) + 1) + ] @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e16165fcf4b..d418ffc0394 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -763,6 +763,11 @@ def _apply_boolean_mask(self, boolean_mask): [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) + def _split(self, splits): + return Int64Index._from_columns( + [self._values.columns_split(splits)], [self.name] + ) + def _binaryop(self, other, op: str): return self._as_int64()._binaryop(other, op=op) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index bebff037bee..82b7645b138 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -722,6 +722,37 @@ def drop_duplicates( self._index.names if not ignore_index else None, ) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") + def _empty_like(self, keep_index=True): + return self._from_columns_like_self( + libcudf.copying.columns_empty_like( + [ + *(self._index._data.columns if keep_index else ()), + *self._columns, + ] + ), + self._column_names, + self._index.names if keep_index else None, + ) + + def _split(self, splits, keep_index=True): + columns_split = libcudf.copying.columns_split( + [ + *(self._index._data.columns if keep_index else []), + *self._columns, + ], + splits, + ) + + return [ + self._from_columns_like_self( + columns_split[i], + self._column_names, + self._index.names if keep_index else None, + ) + for i in range(len(splits) + 1) + ] + def add_prefix(self, prefix): """ Prefix labels with string `prefix`.