Skip to content

Commit

Permalink
Remove or simplify various utility functions (#10705)
Browse files Browse the repository at this point in the history
This PR excises some of the functions in `cudf/utils/utils.py` and moves others as close to their point of use as possible. Most of the moved functions were essentially single-use functions, while the removed functions were either unnecessary or had obvious preexisting replacements.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #10705
  • Loading branch information
vyasr authored Apr 21, 2022
1 parent 070aef1 commit 5053a1a
Show file tree
Hide file tree
Showing 20 changed files with 132 additions and 211 deletions.
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,8 +911,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
)
return other

ary = cudf.utils.utils.scalar_broadcast_to(
self._encode(other), size=len(self), dtype=self.codes.dtype
ary = column.full(
len(self), self._encode(other), dtype=self.codes.dtype
)
return column.build_categorical_column(
categories=self.dtype.categories._values,
Expand Down Expand Up @@ -1629,9 +1629,9 @@ def _create_empty_categorical_column(
return column.build_categorical_column(
categories=column.as_column(dtype.categories),
codes=column.as_column(
cudf.utils.utils.scalar_broadcast_to(
_DEFAULT_CATEGORICAL_VALUE,
column.full(
categorical_column.size,
_DEFAULT_CATEGORICAL_VALUE,
categorical_column.codes.dtype,
)
),
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
StructDtype,
)
from cudf.core.mixins import BinaryOperand, Reducible
from cudf.utils import utils
from cudf.utils.dtypes import (
cudf_dtype_from_pa_type,
get_time_unit,
Expand Down Expand Up @@ -1774,9 +1773,7 @@ def as_column(
if dtype is None:
dtype = cudf.dtype("float64")

data = as_column(
utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
)
data = as_column(full(length, arbitrary, dtype=dtype))
if not nan_as_null and not is_decimal_dtype(data.dtype):
if np.issubdtype(data.dtype, np.floating):
data = data.fillna(np.nan)
Expand Down
11 changes: 4 additions & 7 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@
as_column,
build_column,
column,
full,
string,
)
from cudf.core.dtypes import CategoricalDtype
from cudf.core.mixins import BinaryOperand
from cudf.utils import cudautils, utils
from cudf.utils import cudautils
from cudf.utils.dtypes import (
NUMERIC_TYPES,
min_column_type,
Expand Down Expand Up @@ -254,9 +255,7 @@ def normalize_binop_value(
if np.isscalar(other):
return cudf.dtype(other_dtype).type(other)
else:
ary = utils.scalar_broadcast_to(
other, size=len(self), dtype=other_dtype
)
ary = full(len(self), other, dtype=other_dtype)
return column.build_column(
data=Buffer(ary),
dtype=ary.dtype,
Expand Down Expand Up @@ -438,9 +437,7 @@ def find_and_replace(
)
if len(replacement_col) == 1 and len(to_replace_col) > 1:
replacement_col = column.as_column(
utils.scalar_broadcast_to(
replacement[0], (len(to_replace_col),), self.dtype
)
full(len(to_replace_col), replacement[0], self.dtype)
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
Expand Down
11 changes: 8 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
from cudf.core.buffer import Buffer
from cudf.core.column import column, datetime
from cudf.core.column.methods import ColumnMethods, ParentType
from cudf.utils import utils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import can_convert_to_column

Expand Down Expand Up @@ -5521,9 +5520,15 @@ def _binaryop(
if isinstance(other, (StringColumn, str, cudf.Scalar)):
if op == "__add__":
if isinstance(other, cudf.Scalar):
other = utils.scalar_broadcast_to(
other, size=len(self), dtype="object"
other = cast(
StringColumn,
column.full(len(self), other, dtype="object"),
)

# Explicit types are necessary because mypy infers ColumnBase
# rather than StringColumn and sometimes forgets Scalar.
lhs: Union[cudf.Scalar, StringColumn]
rhs: Union[cudf.Scalar, StringColumn]
lhs, rhs = (other, self) if reflect else (self, other)

return cast(
Expand Down
26 changes: 20 additions & 6 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
from cudf.core.resample import DataFrameResampler
from cudf.core.series import Series
from cudf.core.udf.row_function import _get_row_kernel
from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
from cudf.utils import applyutils, docutils, ioutils, queryutils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
can_convert_to_column,
Expand Down Expand Up @@ -1104,7 +1104,7 @@ def __getitem__(self, arg):
elif can_convert_to_column(arg):
mask = arg
if is_list_like(mask):
mask = cudf.utils.utils._create_pandas_series(data=mask)
mask = pd.Series(mask)
if mask.dtype == "bool":
return self._apply_boolean_mask(mask)
else:
Expand Down Expand Up @@ -1173,9 +1173,7 @@ def __setitem__(self, arg, value):
allow_non_unique=True,
)
if is_scalar(value):
self._data[arg] = utils.scalar_broadcast_to(
value, len(self)
)
self._data[arg] = column.full(len(self), value)
else:
value = as_column(value)
self._data[arg] = value
Expand Down Expand Up @@ -2572,8 +2570,24 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
f"{num_cols * (num_cols > 0)}"
)

# TODO: This check is currently necessary because
# _is_scalar_or_zero_d_array below will treat a length 1 pd.Categorical
# as a scalar and attempt to use column.full, which can't handle it.
# Maybe _is_scalar_or_zero_d_array should be changed, or maybe we just
# shouldn't support pd.Categorical at all, but those changes will at
# least require a deprecation cycle because we currently support
# inserting a pd.Categorical.
if isinstance(value, pd.Categorical):
value = cudf.core.column.categorical.pandas_categorical_as_column(
value
)

if _is_scalar_or_zero_d_array(value):
value = utils.scalar_broadcast_to(value, len(self))
value = column.full(
len(self),
value,
"str" if libcudf.scalar._is_null_host_scalar(value) else None,
)

if len(self) == 0:
if isinstance(value, (pd.Series, Series)):
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
as_column,
build_categorical_column,
deserialize_columns,
full,
serialize_columns,
)
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -3655,9 +3656,9 @@ def _get_replacement_values_for_columns(
values_columns = {
col: [value]
if _is_non_decimal_numeric_dtype(columns_dtype_map[col])
else cudf.utils.utils.scalar_broadcast_to(
else full(
len(to_replace),
value,
(len(to_replace),),
cudf.dtype(type(value)),
)
for col in columns_dtype_map
Expand Down
29 changes: 22 additions & 7 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from numbers import Integral
from typing import Any, List, MutableMapping, Tuple, Union

import cupy
import cupy as cp
import numpy as np
import pandas as pd
from pandas._config import get_option
Expand All @@ -29,11 +29,26 @@
as_index,
)
from cudf.utils.docutils import doc_apply
from cudf.utils.utils import (
NotIterable,
_cudf_nvtx_annotate,
_maybe_indices_to_slice,
)
from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate


def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
    """Make a best effort to convert an array of indices into a Python slice.

    If the conversion is not possible, the input is returned unchanged.
    `indices` are expected to be valid positions.

    Parameters
    ----------
    indices : cp.ndarray
        One-dimensional array of integer positions.

    Returns
    -------
    slice or cp.ndarray
        ``slice(start, stop, step)`` when ``indices`` is an evenly spaced,
        strictly increasing run; otherwise ``indices`` itself (this includes
        empty, constant and descending inputs).
    """
    # TODO: improve efficiency by avoiding sync.
    num_indices = len(indices)
    if num_indices == 0:
        # There is no first element to read; hand the empty array back
        # instead of failing on indices[0].
        return indices

    start = indices[0].item()
    if num_indices == 1:
        return slice(start, start + 1)

    # Compute the step from Python ints so an unsigned dtype cannot wrap
    # around when the second index is smaller than the first.
    step = indices[1].item() - start
    if step <= 0:
        # A zero step is not a legal slice step, and a descending run would
        # need a stop below the last index, which becomes negative (and is
        # then interpreted as counting from the end) once the run reaches
        # position 0. Returning the indices is always correct.
        return indices

    if num_indices == 2:
        return slice(start, indices[1].item() + 1, step)

    stop = start + step * num_indices
    if (indices == cp.arange(start, stop, step)).all():
        return slice(start, stop, step)
    return indices


class MultiIndex(Frame, BaseIndex, NotIterable):
Expand Down Expand Up @@ -1709,7 +1724,7 @@ def get_loc(self, key, method=None, tolerance=None):
return true_inds

# Not sorted and not unique. Return a boolean mask
mask = cupy.full(self._data.nrows, False)
mask = cp.full(self._data.nrows, False)
mask[true_inds] = True
return mask

Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@
find_common_type,
is_mixed_with_object_dtype,
min_scalar_type,
to_cudf_compatible_scalar,
)
from cudf.utils.utils import _cudf_nvtx_annotate, to_cudf_compatible_scalar
from cudf.utils.utils import _cudf_nvtx_annotate


def _append_new_row_inplace(col: ColumnLike, value: ScalarLike):
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,13 @@ def assert_column_memory_ne(
raise AssertionError("lhs and rhs holds the same memory.")


def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
    """Build a ``pd.Series``, defaulting to float64 when the data is empty.

    Parameters
    ----------
    data : sized object or None
        Values for the series. ``len(data)`` is evaluated whenever ``dtype``
        is None and ``data`` is not None.
    index, dtype
        Forwarded to ``pd.Series``. When ``dtype`` is None and ``data`` is
        None or has length zero, ``"float64"`` is used instead.
    *args, **kwargs
        Forwarded to ``pd.Series`` after ``data``, ``index`` and ``dtype``.

    Returns
    -------
    pd.Series
    """
    if dtype is None and (data is None or len(data) == 0):
        dtype = "float64"
    # The three leading parameters are forwarded positionally: passing them
    # as keywords ahead of *args would make any extra positional argument
    # collide with ``data`` and raise TypeError.
    return pd.Series(data, index, dtype, *args, **kwargs)


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
Expand Down
31 changes: 25 additions & 6 deletions python/cudf/cudf/tests/test_array_function.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,36 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.utils import IS_NEP18_ACTIVE

missing_arrfunc_cond = not IS_NEP18_ACTIVE
missing_arrfunc_reason = "NEP-18 support is not available in NumPy"

# Test implementation based on dask array test
# https://github.com/dask/dask/blob/master/dask/array/tests/test_array_function.py
# To determine if NEP18 is available in the current version of NumPy we simply
# attempt to concatenate an object with `__array_function__` defined and see if
# NumPy invokes the protocol or not. Taken from dask array
# https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363
# TODO: Unclear if this is still necessary. NEP 18 was introduced as the
# default in 1.17 (https://github.com/numpy/numpy/releases/tag/v1.17.0) almost
# 3 years ago, and it was originally introduced one version before in 1.16
# (although not enabled by default then). Can we safely assume that testers
# will have a sufficiently new version of numpy to run these tests?
missing_arrfunc_reason = "NEP-18 support is not available in NumPy"


class _Test:
    """Bare object exposing ``__array_function__`` to probe NumPy with."""

    def __array_function__(self, *args, **kwargs):
        return True


# Hand the probe object to np.concatenate: a ValueError means NumPy did not
# dispatch to __array_function__, so the dependent tests must be skipped.
try:
    np.concatenate([_Test()])
    missing_arrfunc_cond = False
except ValueError:
    missing_arrfunc_cond = True

# The probe class is only needed for the check above.
del _Test


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
Expand Down
10 changes: 4 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3662,9 +3662,7 @@ def test_all(data):
# Pandas treats `None` in object type columns as True for some reason, so
# replacing with `False`
if np.array(data).ndim <= 1:
pdata = cudf.utils.utils._create_pandas_series(data=data).replace(
[None], False
)
pdata = pd.Series(data=data).replace([None], False)
gdata = cudf.Series.from_pandas(pdata)
else:
pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False)
Expand Down Expand Up @@ -3715,7 +3713,7 @@ def test_all(data):
@pytest.mark.parametrize("axis", [0, 1])
def test_any(data, axis):
if np.array(data).ndim <= 1:
pdata = cudf.utils.utils._create_pandas_series(data=data)
pdata = pd.Series(data=data)
gdata = cudf.Series.from_pandas(pdata)

if axis == 1:
Expand Down Expand Up @@ -4185,7 +4183,7 @@ def test_create_dataframe_column():
],
)
def test_series_values_host_property(data):
pds = cudf.utils.utils._create_pandas_series(data=data)
pds = pd.Series(data=data)
gds = cudf.Series(data)

np.testing.assert_array_equal(pds.values, gds.values_host)
Expand All @@ -4208,7 +4206,7 @@ def test_series_values_host_property(data):
],
)
def test_series_values_property(data):
pds = cudf.utils.utils._create_pandas_series(data=data)
pds = pd.Series(data=data)
gds = cudf.Series(data)
gds_vals = gds.values
assert isinstance(gds_vals, cupy.ndarray)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest

import cudf
from cudf.testing._utils import assert_eq
from cudf.testing._utils import _create_pandas_series, assert_eq


@pytest.mark.parametrize(
Expand All @@ -22,7 +22,7 @@
@pytest.mark.parametrize("inplace", [True, False])
def test_dropna_series(data, nulls, inplace):

psr = cudf.utils.utils._create_pandas_series(data=data)
psr = _create_pandas_series(data)

if len(data) > 0:
if nulls == "one":
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/tests/test_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

import cudf
from cudf import concat
from cudf.testing._utils import assert_eq, assert_exceptions_equal
from cudf.testing._utils import (
_create_pandas_series,
assert_eq,
assert_exceptions_equal,
)

# TODO: PANDAS 1.0 support
# Revisit drop_duplicates() tests to update parameters like ignore_index.
Expand Down Expand Up @@ -59,7 +63,7 @@ def test_duplicated_with_misspelled_column_name(subset):
],
)
def test_drop_duplicates_series(data, keep):
pds = cudf.utils.utils._create_pandas_series(data)
pds = _create_pandas_series(data)
gds = cudf.from_pandas(pds)

assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))
Expand Down
Loading

0 comments on commit 5053a1a

Please sign in to comment.