Skip to content

Commit

Permalink
Fix remaining internal uses of columns API throughout codebase.
Browse files (browse the repository at this point in the history)
  • Loading branch information
vyasr committed Feb 22, 2022
1 parent ccb08a1 commit 102ccef
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 32 deletions.
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,7 @@ def _concat(

# Reassign index and column names
if objs[0]._data.multiindex:
out.columns = objs[0].columns
out._set_column_names_like(objs[0])
else:
out.columns = names
if not ignore_index:
Expand Down Expand Up @@ -6606,10 +6606,10 @@ def _align_indices(lhs, rhs):
df = df.sort_index()
lhs_out = DataFrame(index=df.index)
rhs_out = DataFrame(index=df.index)
common = set(lhs.columns) & set(rhs.columns)
common = set(lhs._column_names) & set(rhs._column_names)
common_x = {f"{x}_x" for x in common}
common_y = {f"{x}_y" for x in common}
for col in df.columns:
for col in df._column_names:
if col in common_x:
lhs_out[col[:-2]] = df[col]
elif col in common_y:
Expand Down Expand Up @@ -6639,7 +6639,7 @@ def _setitem_with_dataframe(
"""

if input_cols is None:
input_cols = input_df.columns
input_cols = input_df._column_names

if len(input_cols) != len(replace_df._column_names):
raise ValueError(
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/df_protocol.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

import collections
import enum
from typing import (
Expand Down Expand Up @@ -535,7 +537,7 @@ def metadata(self):
return {"cudf.index": self._df.index}

def num_columns(self) -> int:
return len(self._df.columns)
return len(self._df._column_names)

def num_rows(self) -> int:
return len(self._df)
Expand All @@ -544,7 +546,7 @@ def num_chunks(self) -> int:
return 1

def column_names(self) -> Iterable[str]:
return self._df.columns.tolist()
return self._df._column_names

def get_column(self, i: int) -> _CuDFColumn:
return _CuDFColumn(
Expand Down
15 changes: 9 additions & 6 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ def melt(
if not isinstance(id_vars, collections.abc.Sequence):
id_vars = [id_vars]
id_vars = list(id_vars)
missing = set(id_vars) - set(frame.columns)
missing = set(id_vars) - set(frame._column_names)
if not len(missing) == 0:
raise KeyError(
f"The following 'id_vars' are not present"
Expand All @@ -508,16 +508,15 @@ def melt(
if not isinstance(value_vars, collections.abc.Sequence):
value_vars = [value_vars]
value_vars = list(value_vars)
missing = set(value_vars) - set(frame.columns)
missing = set(value_vars) - set(frame._column_names)
if not len(missing) == 0:
raise KeyError(
f"The following 'value_vars' are not present"
f" in the DataFrame: {list(missing)}"
)
else:
# then all remaining columns in frame
value_vars = frame.columns.drop(id_vars)
value_vars = list(value_vars)
value_vars = list(set(frame._column_names) - set(id_vars))

# Error for unimplemented support for datatype
dtypes = [frame[col].dtype for col in id_vars + value_vars]
Expand Down Expand Up @@ -691,7 +690,9 @@ def get_dummies(
encode_fallback_dtypes = ["object", "category"]

if columns is None or len(columns) == 0:
columns = df.select_dtypes(include=encode_fallback_dtypes).columns
columns = df.select_dtypes(
include=encode_fallback_dtypes
)._column_names

_length_check_params(prefix, columns, "prefix")
_length_check_params(prefix_sep, columns, "prefix_sep")
Expand Down Expand Up @@ -1062,7 +1063,9 @@ def unstack(df, level, fill_value=None):
)
res = df.T.stack(dropna=False)
# Result's index is a multiindex
res.index.names = tuple(df.columns.names) + df.index.names
res.index.names = (
tuple(df._data.to_pandas_index().names) + df.index.names
)
return res
else:
columns = df.index._poplevels(level)
Expand Down
30 changes: 18 additions & 12 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION
# Copyright (c) 2020-2022, NVIDIA CORPORATION

import itertools

Expand Down Expand Up @@ -198,8 +198,7 @@ def __getitem__(self, arg):
center=self.center,
)

def _apply_agg_series(self, sr, agg_name):
source_column = sr._column
def _apply_agg_column(self, source_column, agg_name):
min_periods = self.min_periods or 1
if isinstance(self.window, int):
preceding_window = None
Expand Down Expand Up @@ -230,7 +229,7 @@ def _apply_agg_series(self, sr, agg_name):
)
window = None

result_col = libcudf.rolling.rolling(
return libcudf.rolling.rolling(
source_column=source_column,
pre_column_window=preceding_window,
fwd_column_window=following_window,
Expand All @@ -240,19 +239,26 @@ def _apply_agg_series(self, sr, agg_name):
op=agg_name,
agg_params=self.agg_params,
)
return sr._from_data({sr.name: result_col}, sr._index)

def _apply_agg_dataframe(self, df, agg_name):
result_df = cudf.DataFrame({})
for i, col_name in enumerate(df.columns):
result_col = self._apply_agg_series(df[col_name], agg_name)
result_df.insert(i, col_name, result_col)
result_df.index = df.index
return result_df
return cudf.DataFrame._from_data(
{
col_name: self._apply_agg_column(col, agg_name)
for col_name, col in df._data.items()
},
index=df.index,
)

def _apply_agg(self, agg_name):
if isinstance(self.obj, cudf.Series):
return self._apply_agg_series(self.obj, agg_name)
return cudf.Series._from_data(
{
self.obj.name: self._apply_agg_column(
self.obj._column, agg_name
)
},
index=self.obj.index,
)
else:
return self._apply_agg_dataframe(self.obj, agg_name)

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def to_parquet(

if engine == "cudf":
# Ensure that no columns dtype is 'category'
for col in df.columns:
for col in df._column_names:
if partition_cols is None or col not in partition_cols:
if df[col].dtype.name == "category":
raise ValueError(
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -696,8 +696,8 @@ def assert_frame_equal(

if PANDAS_GE_110:
pd.testing.assert_index_equal(
left.columns,
right.columns,
left._data.to_pandas_index(),
right._data.to_pandas_index(),
exact=check_column_type,
check_names=check_names,
check_exact=check_exact,
Expand All @@ -708,16 +708,16 @@ def assert_frame_equal(
)
else:
pd.testing.assert_index_equal(
left.columns,
right.columns,
left._data.to_pandas_index(),
right._data.to_pandas_index(),
exact=check_column_type,
check_names=check_names,
check_exact=check_exact,
check_categorical=check_categorical,
obj=f"{obj}.columns",
)

for col in left.columns:
for col in left._column_names:
assert_column_equal(
left._data[col],
right._data[col],
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/tests/test_df_protocol.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from typing import Any, Tuple

import cupy as cp
Expand Down Expand Up @@ -74,7 +76,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
assert dfo.num_columns() == len(df.columns)
assert dfo.num_rows() == len(df)
assert dfo.num_chunks() == 1
assert dfo.column_names() == list(df.columns)
assert dfo.column_names() == tuple(df.columns)
for col in df.columns:
assert_column_equal(dfo.get_column_by_name(col), df[col]._column)

Expand Down

0 comments on commit 102ccef

Please sign in to comment.