Skip to content

Commit

Permalink
Remove deprecated Index methods from Frame (NVIDIA#11073)
Browse files Browse the repository at this point in the history
This PR moves a number of methods from Frame to IndexedFrame that are no longer supported for Index objects. The one holdout is `Frame.replace`, which is also deprecated for Index but is used internally by Index methods in one place. In order to make this PR as simple to review as possible, I have omitted that function here. All changes in this PR are trivial moves of code aside from the deletions of the warnings.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai/cudf#11073
  • Loading branch information
vyasr authored Jun 18, 2022
1 parent 379faf9 commit d6c56fe
Show file tree
Hide file tree
Showing 4 changed files with 427 additions and 519 deletions.
85 changes: 85 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1999,6 +1999,91 @@ def _make_operands_and_index_for_binop(
operands[k] = (left_default, v, reflect, None)
return operands, index

@_cudf_nvtx_annotate
def scatter_by_map(
self, map_index, map_size=None, keep_index=True, **kwargs
):
"""Scatter to a list of dataframes.
Uses map_index to determine the destination
of each row of the original DataFrame.
Parameters
----------
map_index : Series, str or list-like
Scatter assignment for each row
map_size : int
Length of output list. Must be >= uniques in map_index
keep_index : bool
Conserve original index values for each row
Returns
-------
A list of cudf.DataFrame objects.
"""
# map_index might be a column name or array,
# make it a Column
if isinstance(map_index, str):
map_index = self._data[map_index]
elif isinstance(map_index, cudf.Series):
map_index = map_index._column
else:
map_index = as_column(map_index)

# Convert float to integer
if map_index.dtype.kind == "f":
map_index = map_index.astype(np.int32)

# Convert string or categorical to integer
if isinstance(map_index, cudf.core.column.StringColumn):
map_index = map_index.as_categorical_column(
"category"
).as_numerical
warnings.warn(
"Using StringColumn for map_index in scatter_by_map. "
"Use an integer array/column for better performance."
)
elif isinstance(map_index, cudf.core.column.CategoricalColumn):
map_index = map_index.as_numerical
warnings.warn(
"Using CategoricalColumn for map_index in scatter_by_map. "
"Use an integer array/column for better performance."
)

if kwargs.get("debug", False) == 1 and map_size is not None:
count = map_index.distinct_count()
if map_size < count:
raise ValueError(
f"ERROR: map_size must be >= {count} (got {map_size})."
)

partitioned_columns, output_offsets = libcudf.partitioning.partition(
[*(self._index._columns if keep_index else ()), *self._columns],
map_index,
map_size,
)
partitioned = self._from_columns_like_self(
partitioned_columns,
column_names=self._column_names,
index_names=self._index_names if keep_index else None,
)

# due to the split limitation mentioned
# here: https://github.com/rapidsai/cudf/issues/4607
# we need to remove first & last elements in offsets.
# TODO: Remove this after the above issue is fixed.
output_offsets = output_offsets[1:-1]

result = partitioned._split(output_offsets, keep_index=keep_index)

if map_size:
result += [
self._empty_like(keep_index)
for _ in range(map_size - len(result))
]

return result

@_cudf_nvtx_annotate
def update(
self,
Expand Down
Loading

0 comments on commit d6c56fe

Please sign in to comment.