Some frame.py typing, move seldom used methods in frame.py #14766

Merged · 13 commits · Jan 19, 2024
97 changes: 37 additions & 60 deletions python/cudf/cudf/core/frame.py
@@ -94,10 +94,6 @@ def _dtypes(self):
zip(self._data.names, (col.dtype for col in self._data.columns))
)

@property
def _has_nulls(self):
return any(col.has_nulls() for col in self._data.values())

@_cudf_nvtx_annotate
def serialize(self):
# TODO: See if self._data can be serialized outright
@@ -135,13 +131,13 @@ def deserialize(cls, header, frames):

@classmethod
@_cudf_nvtx_annotate
def _from_data(cls, data: MutableMapping):
def _from_data(cls, data: MutableMapping) -> Self:
obj = cls.__new__(cls)
Frame.__init__(obj, data)
return obj

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
def _from_data_like_self(self, data: MutableMapping) -> Self:
return self._from_data(data)

@_cudf_nvtx_annotate
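
For context on the Self annotations introduced above: Self (available from typing_extensions, or from typing on Python 3.11+) lets a base-class factory such as _from_data tell the type checker that it returns an instance of whichever subclass it is called on. Below is a minimal, self-contained sketch of the pattern; the class names are illustrative, not cuDF's real hierarchy.

from collections.abc import MutableMapping

from typing_extensions import Self  # typing.Self on Python 3.11+


class Frame:
    """Toy stand-in for the base class; not the real cudf Frame."""

    def __init__(self, data: MutableMapping) -> None:
        self._data = data

    @classmethod
    def _from_data(cls, data: MutableMapping) -> Self:
        # Self binds to the class the method is called on, so a subclass
        # calling this factory is typed as getting its own type back.
        obj = cls.__new__(cls)
        Frame.__init__(obj, data)
        return obj

    def _from_data_like_self(self, data: MutableMapping) -> Self:
        return self._from_data(data)


class DataFrame(Frame):
    pass


df = DataFrame._from_data({"a": [1, 2, 3]})
# A type checker now infers df as DataFrame rather than Frame.
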
@@ -179,7 +175,7 @@ def _mimic_inplace(

@property
@_cudf_nvtx_annotate
def size(self):
def size(self) -> int:
"""
Return the number of elements in the underlying data.

@@ -272,7 +268,7 @@ def memory_usage(self, deep=False):
raise NotImplementedError

@_cudf_nvtx_annotate
def __len__(self):
def __len__(self) -> int:
return self._num_rows

@_cudf_nvtx_annotate
@@ -291,7 +287,7 @@ def astype(self, dtype, copy: bool = False):
)

@_cudf_nvtx_annotate
def equals(self, other):
def equals(self, other) -> bool:
"""
Test whether two objects contain the same elements.

@@ -375,7 +371,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self:

@property
@_cudf_nvtx_annotate
def values(self):
def values(self) -> cupy.ndarray:
"""
Return a CuPy representation of the DataFrame.

@@ -391,7 +387,7 @@ def values(self):

@property
@_cudf_nvtx_annotate
def values_host(self):
def values_host(self) -> np.ndarray:
"""
Return a NumPy representation of the data.

@@ -547,7 +543,7 @@ def to_numpy(
)

@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
"""
Replace values where the condition is False.

@@ -618,7 +614,7 @@ def where(self, cond, other=None, inplace=False):
raise NotImplementedError

@_cudf_nvtx_annotate
def mask(self, cond, other=None, inplace=False):
def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
"""
Replace values where the condition is True.

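
The Optional[Self] return type on where and mask above encodes the usual pandas-style contract for methods with an inplace flag: with inplace=False a new object of the caller's own type is returned, while with inplace=True the object is mutated and None comes back. A rough sketch of that pattern, using made-up class and method names rather than cuDF's:

from typing import Optional

from typing_extensions import Self


class MiniSeries:
    """Illustrative container, not part of cuDF."""

    def __init__(self, values: list) -> None:
        self._values = values

    def clip_negatives(self, inplace: bool = False) -> Optional[Self]:
        clipped = [max(v, 0) for v in self._values]
        if inplace:
            # Mutate self and signal it by returning None.
            self._values = clipped
            return None
        # Otherwise build and return a fresh object of the same type.
        return type(self)(clipped)


s = MiniSeries([-1, 2, -3])
copy = s.clip_negatives()                # new MiniSeries; s is unchanged
result = s.clip_negatives(inplace=True)  # returns None; s is mutated
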
@@ -728,8 +724,13 @@ def pipe(self, func, *args, **kwargs):

@_cudf_nvtx_annotate
def fillna(
self, value=None, method=None, axis=None, inplace=False, limit=None
):
self,
value=None,
method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None,
axis=None,
inplace: bool = False,
limit=None,
) -> Optional[Self]:
"""Fill null values with ``value`` or specified ``method``.

Parameters
@@ -848,15 +849,15 @@ def fillna(
if isinstance(value, cudf.Series):
value = value.reindex(self._data.names)
elif isinstance(value, cudf.DataFrame):
if not self.index.equals(value.index):
value = value.reindex(self.index)
if not self.index.equals(value.index): # type: ignore[attr-defined]
value = value.reindex(self.index) # type: ignore[attr-defined]
else:
value = value
elif not isinstance(value, abc.Mapping):
value = {name: copy.deepcopy(value) for name in self._data.names}
else:
value = {
key: value.reindex(self.index)
key: value.reindex(self.index) # type: ignore[attr-defined]
if isinstance(value, cudf.Series)
else value
for key, value in value.items()
@@ -898,44 +899,14 @@ def _drop_column(self, name):
raise KeyError(f"column '{name}' does not exist")
del self._data[name]

@_cudf_nvtx_annotate
def _drop_na_columns(self, how="any", subset=None, thresh=None):
"""
Drop columns containing nulls
"""
out_cols = []

if subset is None:
df = self
else:
df = self.take(subset)

if thresh is None:
if how == "all":
thresh = 1
else:
thresh = len(df)

for name, col in df._data.items():
try:
check_col = col.nans_to_nulls()
except AttributeError:
check_col = col
no_threshold_valid_count = (
len(col) - check_col.null_count
) < thresh
if no_threshold_valid_count:
continue
out_cols.append(name)

return self[out_cols]

@_cudf_nvtx_annotate
def _quantile_table(
self,
q,
interpolation="LINEAR",
is_sorted=False,
q: float,
interpolation: Literal[
"LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST"
] = "LINEAR",
is_sorted: bool = False,
column_order=(),
null_precedence=(),
):
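
Literal[...] on interpolation (and on fillna's method parameter earlier) narrows a plain string argument to an enumerated set of accepted spellings, so a typo such as interpolation="linear" is flagged by the type checker instead of only failing at runtime. The small illustration below is invented for the example and is not cuDF code:

from collections.abc import Sequence
from typing import Literal


def quantile(
    values: Sequence[float],
    q: float,
    interpolation: Literal["LINEAR", "LOWER", "HIGHER"] = "LINEAR",
) -> float:
    """Pick or interpolate a quantile; only the listed spellings type-check."""
    xs = sorted(values)
    pos = q * (len(xs) - 1)
    lo, hi = int(pos), min(int(pos) + 1, len(xs) - 1)
    if interpolation == "LOWER":
        return xs[lo]
    if interpolation == "HIGHER":
        return xs[hi]
    # "LINEAR": interpolate between the two neighbouring values.
    return xs[lo] + (xs[hi] - xs[lo]) * (pos - lo)


quantile([1.0, 2.0, 4.0], 0.5)                           # 2.0
quantile([1.0, 2.0, 4.0], 0.25, interpolation="HIGHER")  # 2.0
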
@@ -963,7 +934,7 @@

@classmethod
@_cudf_nvtx_annotate
def from_arrow(cls, data):
def from_arrow(cls, data: pa.Table) -> Self:
"""Convert from PyArrow Table to Frame

Parameters
@@ -1140,7 +1111,7 @@ def to_arrow(self):
)

@_cudf_nvtx_annotate
def _positions_from_column_names(self, column_names):
def _positions_from_column_names(self, column_names) -> list[int]:
"""Map each column name into their positions in the frame.

The order of indices returned corresponds to the column order in this
@@ -1542,7 +1513,12 @@ def argsort(
).values

@_cudf_nvtx_annotate
def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
def _get_sorted_inds(
self,
by=None,
ascending=True,
na_position: Literal["first", "last"] = "last",
) -> ColumnBase:
"""
Get the indices required to sort self according to the columns
specified in by.
@@ -1556,13 +1532,14 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
)._columns
]

# If given a scalar need to construct a sequence of length # of columns
if np.isscalar(ascending):
ascending = [ascending] * len(to_sort)
if is_scalar(ascending):
ascending_lst = [ascending] * len(to_sort)
else:
ascending_lst = list(ascending)

return libcudf.sort.order_by(
to_sort,
ascending,
ascending_lst,
na_position,
stable=True,
)
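
The final hunk in frame.py replaces np.isscalar(ascending) with is_scalar(ascending) (presumably cuDF's own cudf.api.types.is_scalar, which also recognizes NumPy, pandas, and device scalars) and assigns the broadcast result to a new ascending_lst name so the variable keeps a single type for mypy. A standalone sketch of the same normalization step, outside of cuDF:

from collections.abc import Sequence
from typing import Union


def normalize_ascending(
    ascending: Union[bool, Sequence[bool]], num_columns: int
) -> list:
    """Broadcast a scalar flag to one flag per sort column."""
    if isinstance(ascending, bool):
        # Scalar: apply the same sort order to every column.
        return [ascending] * num_columns
    if len(ascending) != num_columns:
        raise ValueError("ascending must have one entry per sort column")
    return list(ascending)


normalize_ascending(True, 3)           # [True, True, True]
normalize_ascending([True, False], 2)  # [True, False]
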
8 changes: 1 addition & 7 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -1251,10 +1251,6 @@ def pipe(self, func, *args, **kwargs):
def _jit_groupby_apply(
self, function, group_names, offsets, group_keys, grouped_values, *args
):
# Nulls are not yet supported
if self.grouping._obj._has_nulls:
raise ValueError("Nulls not yet supported with groupby JIT engine")

chunk_results = jit_groupby_apply(
offsets, grouped_values, function, *args
)
@@ -1445,9 +1441,7 @@ def mult(df):
group_names, offsets, group_keys, grouped_values = self._grouped()

if engine == "auto":
if (not grouped_values._has_nulls) and _can_be_jitted(
grouped_values, function, args
):
if _can_be_jitted(grouped_values, function, args):
engine = "jit"
else:
engine = "cudf"
36 changes: 35 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
@@ -2439,7 +2439,9 @@ def sort_values(
out.columns = self._data.to_pandas_index()
return out

def _n_largest_or_smallest(self, largest, n, columns, keep):
def _n_largest_or_smallest(
self, largest: bool, n: int, columns, keep: Literal["first", "last"]
):
# Get column to operate on
if isinstance(columns, str):
columns = [columns]
@@ -3068,6 +3070,38 @@ def dropna(

return self._mimic_inplace(result, inplace=inplace)

@_cudf_nvtx_annotate
def _drop_na_columns(self, how="any", subset=None, thresh=None):
"""
Drop columns containing nulls
"""
out_cols = []

if subset is None:
df = self
else:
df = self.take(subset)

if thresh is None:
if how == "all":
thresh = 1
else:
thresh = len(df)

for name, col in df._data.items():
try:
check_col = col.nans_to_nulls()
except AttributeError:
check_col = col
no_threshold_valid_count = (
len(col) - check_col.null_count
) < thresh
if no_threshold_valid_count:
continue
out_cols.append(name)

return self[out_cols]

def _drop_na_rows(self, how="any", subset=None, thresh=None):
"""
Drop null rows from `self`.
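
_drop_na_columns arrives here unchanged from frame.py. Its threshold rule is worth spelling out: a column is kept only when its count of valid values (non-null, with NaNs treated as nulls where the column supports nans_to_nulls) is at least thresh; how="all" maps to thresh=1 (keep any column with at least one valid value) and the default how="any" maps to thresh=len(df) (keep only fully valid columns). A plain-Python sketch of that per-column decision, using made-up data:

import math
from typing import Optional


def keep_column(values: list, how: str = "any", thresh: Optional[int] = None) -> bool:
    """Mirror the keep/drop decision _drop_na_columns makes per column."""
    if thresh is None:
        thresh = 1 if how == "all" else len(values)
    valid = sum(
        1
        for v in values
        if v is not None and not (isinstance(v, float) and math.isnan(v))
    )
    return valid >= thresh


keep_column([1, None, 3], how="any")          # False: one value is null
keep_column([1, None, 3], how="all")          # True: at least one valid value
keep_column([None, float("nan")], how="all")  # False: nothing valid at all
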
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/udf/groupby_utils.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.


import cupy as cp
@@ -209,6 +209,9 @@ def _can_be_jitted(frame, func, args):
# Numba requires bytecode to be present to proceed.
# See https://github.com/numba/numba/issues/4587
return False

if any(col.has_nulls() for col in frame._data.values()):
return False
np_field_types = np.dtype(
list(
_supported_dtypes_from_frame(
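
With the null check now living inside _can_be_jitted, GroupBy.apply(engine="auto") quietly falls back to the regular cuDF execution path when the grouped values contain nulls, instead of _jit_groupby_apply raising a ValueError; only an explicit engine="jit" request still reaches the JIT path's remaining limitations. A simplified model of that selection, with the real checks collapsed into boolean flags:

def choose_engine(engine: str, has_nulls: bool, otherwise_jittable: bool) -> str:
    """Toy model of engine resolution after this change; not cuDF code."""
    if engine != "auto":
        return engine
    # _can_be_jitted now reports False for frames containing nulls, so
    # "auto" silently prefers the non-JIT implementation for them.
    can_be_jitted = otherwise_jittable and not has_nulls
    return "jit" if can_be_jitted else "cudf"


choose_engine("auto", has_nulls=True, otherwise_jittable=True)   # "cudf"
choose_engine("auto", has_nulls=False, otherwise_jittable=True)  # "jit"
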