Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove or split up Frame methods that use the index #10439

Merged
merged 18 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,26 @@ def select_by_label(self, key: Any) -> ColumnAccessor:
return self._select_by_label_with_wildcard(key)
return self._select_by_label_grouped(key)

def get_labels_by_index(self, index: Any) -> tuple:
    """Get the labels corresponding to the provided column indices.

    Parameters
    ----------
    index : integer, integer slice, or list-like of integers
        The column indexes.

    Returns
    -------
    tuple
        The labels found at the requested positions, in the order the
        positions were requested.
    """
    if isinstance(index, slice):
        # ``slice.indices`` returns values meant for ``range``, not for
        # re-slicing: with a negative step and an open-ended stop it
        # reports ``stop == -1``, which slicing would interpret as "the
        # last element" and produce an empty result (e.g. for ``[::-1]``).
        # Walking the equivalent ``range`` handles every step direction.
        positions = range(*index.indices(len(self._data)))
        return tuple(self.names[i] for i in positions)
    if pd.api.types.is_integer(index):
        return (self.names[index],)
    return tuple(self.names[i] for i in index)

def select_by_index(self, index: Any) -> ColumnAccessor:
"""
Return a ColumnAccessor composed of the columns
Expand All @@ -355,13 +375,7 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
-------
ColumnAccessor
"""
if isinstance(index, slice):
start, stop, step = index.indices(len(self._data))
keys = self.names[start:stop:step]
elif pd.api.types.is_integer(index):
keys = (self.names[index],)
else:
keys = tuple(self.names[i] for i in index)
keys = self.get_labels_by_index(index)
data = {k: self._data[k] for k in keys}
return self.__class__(
data, multiindex=self.multiindex, level_names=self.level_names,
Expand Down
38 changes: 25 additions & 13 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,9 +372,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
def _getitem_tuple_arg(self, arg):
# Iloc Step 1:
# Gather the columns specified by the second tuple arg
columns_df = self._frame._get_columns_by_index(arg[1])

columns_df._index = self._frame._index
columns_df = self._frame._from_data(
self._frame._data.select_by_index(arg[1]), self._frame._index
)

# Iloc Step 2:
# Gather the rows specified by the first tuple arg
Expand Down Expand Up @@ -422,9 +422,9 @@ def _getitem_tuple_arg(self, arg):

@_cudf_nvtx_annotate
def _setitem_tuple_arg(self, key, value):
columns = self._frame._get_columns_by_index(key[1])

for col in columns:
# TODO: Determine if this usage is prevalent enough to expose this
# selection logic at a higher level than ColumnAccessor.
for col in self._frame._data.get_labels_by_index(key[1]):
self._frame[col].iloc[key[0]] = value

def _getitem_scalar(self, arg):
Expand Down Expand Up @@ -612,7 +612,8 @@ def __init__(
new_df = self._from_arrays(data, index=index, columns=columns)

self._data = new_df._data
self.index = new_df._index
self._index = new_df._index
self._check_data_index_length_match()
elif hasattr(data, "__array_interface__"):
arr_interface = data.__array_interface__
if len(arr_interface["descr"]) == 1:
Expand All @@ -621,7 +622,8 @@ def __init__(
else:
new_df = self.from_records(data, index=index, columns=columns)
self._data = new_df._data
self.index = new_df._index
self._index = new_df._index
self._check_data_index_length_match()
else:
if is_list_like(data):
if len(data) > 0 and is_scalar(data[0]):
Expand All @@ -632,7 +634,8 @@ def __init__(
new_df = DataFrame(data=data, index=index)

self._data = new_df._data
self.index = new_df._index
self._index = new_df._index
self._check_data_index_length_match()
elif len(data) > 0 and isinstance(data[0], Series):
self._init_from_series_list(
data=data, columns=columns, index=index
Expand All @@ -653,6 +656,15 @@ def __init__(
if dtype:
self._data = self.astype(dtype)._data

def _check_data_index_length_match(df: DataFrame) -> None:
    """Raise if non-empty data has a row count different from the index.

    Helper for the constructor. Frames whose data reports zero rows are
    accepted regardless of the index length.

    Raises
    ------
    ValueError
        If the data has at least one row and its row count differs from
        ``len(df._index)``.
    """
    row_count = df._data.nrows
    if row_count <= 0:
        # Empty data is never considered a mismatch.
        return

    index_length = len(df._index)
    if row_count == index_length:
        return

    raise ValueError(
        f"Shape of passed values is {df.shape}, indices imply "
        f"({index_length}, {df._num_columns})"
    )

@_cudf_nvtx_annotate
def _init_from_series_list(self, data, columns, index):
if index is None:
Expand Down Expand Up @@ -856,9 +868,7 @@ def _from_data(
index: Optional[BaseIndex] = None,
columns: Any = None,
) -> DataFrame:
out = super()._from_data(data, index)
if index is None:
out.index = RangeIndex(out._data.nrows)
out = super()._from_data(data=data, index=index)
if columns is not None:
out.columns = columns
return out
Expand Down Expand Up @@ -5601,7 +5611,9 @@ def stack(self, level=-1, dropna=True):
"""
assert level in (None, -1)
repeated_index = self.index.repeat(self.shape[1])
name_index = Frame({0: self._column_names}).tile(self.shape[0])
name_index = cudf.DataFrame._from_data({0: self._column_names}).tile(
self.shape[0]
)
new_index = list(repeated_index._columns) + [name_index._columns[0]]
if isinstance(self._index, MultiIndex):
index_names = self._index.names + [None]
Expand Down
Loading