Remove or simplify various utility functions (#10705)

This PR excises some of the functions in `cudf/utils/utils.py` and moves others as close to their point of use as possible. Most of the moved functions were essentially single-use functions, while the removed functions were either unnecessary or had obvious preexisting replacements.

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Ashwin Srinath (https://github.com/shwina)

URL: #10705
rapidsai · Apr 21, 2022 · 5053a1a · 5053a1a
1 parent 070aef1
commit 5053a1a
Show file tree

Hide file tree

Showing 20 changed files with 132 additions and 211 deletions.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -911,8 +911,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
                 )
             return other
 
-        ary = cudf.utils.utils.scalar_broadcast_to(
-            self._encode(other), size=len(self), dtype=self.codes.dtype
+        ary = column.full(
+            len(self), self._encode(other), dtype=self.codes.dtype
         )
         return column.build_categorical_column(
             categories=self.dtype.categories._values,
@@ -1629,9 +1629,9 @@ def _create_empty_categorical_column(
     return column.build_categorical_column(
         categories=column.as_column(dtype.categories),
         codes=column.as_column(
-            cudf.utils.utils.scalar_broadcast_to(
-                _DEFAULT_CATEGORICAL_VALUE,
+            column.full(
                 categorical_column.size,
+                _DEFAULT_CATEGORICAL_VALUE,
                 categorical_column.codes.dtype,
             )
         ),

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -68,7 +68,6 @@
     StructDtype,
 )
 from cudf.core.mixins import BinaryOperand, Reducible
-from cudf.utils import utils
 from cudf.utils.dtypes import (
     cudf_dtype_from_pa_type,
     get_time_unit,
@@ -1774,9 +1773,7 @@ def as_column(
             if dtype is None:
                 dtype = cudf.dtype("float64")
 
-        data = as_column(
-            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
-        )
+        data = as_column(full(length, arbitrary, dtype=dtype))
         if not nan_as_null and not is_decimal_dtype(data.dtype):
             if np.issubdtype(data.dtype, np.floating):
                 data = data.fillna(np.nan)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -40,11 +40,12 @@
     as_column,
     build_column,
     column,
+    full,
     string,
 )
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
-from cudf.utils import cudautils, utils
+from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     NUMERIC_TYPES,
     min_column_type,
@@ -254,9 +255,7 @@ def normalize_binop_value(
             if np.isscalar(other):
                 return cudf.dtype(other_dtype).type(other)
             else:
-                ary = utils.scalar_broadcast_to(
-                    other, size=len(self), dtype=other_dtype
-                )
+                ary = full(len(self), other, dtype=other_dtype)
                 return column.build_column(
                     data=Buffer(ary),
                     dtype=ary.dtype,
@@ -438,9 +437,7 @@ def find_and_replace(
             )
         if len(replacement_col) == 1 and len(to_replace_col) > 1:
             replacement_col = column.as_column(
-                utils.scalar_broadcast_to(
-                    replacement[0], (len(to_replace_col),), self.dtype
-                )
+                full(len(to_replace_col), replacement[0], self.dtype)
             )
         elif len(replacement_col) == 1 and len(to_replace_col) == 0:
             return self.copy()

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -37,7 +37,6 @@
 from cudf.core.buffer import Buffer
 from cudf.core.column import column, datetime
 from cudf.core.column.methods import ColumnMethods, ParentType
-from cudf.utils import utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import can_convert_to_column
 
@@ -5521,9 +5520,15 @@ def _binaryop(
         if isinstance(other, (StringColumn, str, cudf.Scalar)):
             if op == "__add__":
                 if isinstance(other, cudf.Scalar):
-                    other = utils.scalar_broadcast_to(
-                        other, size=len(self), dtype="object"
+                    other = cast(
+                        StringColumn,
+                        column.full(len(self), other, dtype="object"),
                     )
+
+                # Explicit types are necessary because mypy infers ColumnBase
+                # rather than StringColumn and sometimes forgets Scalar.
+                lhs: Union[cudf.Scalar, StringColumn]
+                rhs: Union[cudf.Scalar, StringColumn]
                 lhs, rhs = (other, self) if reflect else (self, other)
 
                 return cast(

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -79,7 +79,7 @@
 from cudf.core.resample import DataFrameResampler
 from cudf.core.series import Series
 from cudf.core.udf.row_function import _get_row_kernel
-from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
+from cudf.utils import applyutils, docutils, ioutils, queryutils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
     can_convert_to_column,
@@ -1104,7 +1104,7 @@ def __getitem__(self, arg):
         elif can_convert_to_column(arg):
             mask = arg
             if is_list_like(mask):
-                mask = cudf.utils.utils._create_pandas_series(data=mask)
+                mask = pd.Series(mask)
             if mask.dtype == "bool":
                 return self._apply_boolean_mask(mask)
             else:
@@ -1173,9 +1173,7 @@ def __setitem__(self, arg, value):
                             allow_non_unique=True,
                         )
                     if is_scalar(value):
-                        self._data[arg] = utils.scalar_broadcast_to(
-                            value, len(self)
-                        )
+                        self._data[arg] = column.full(len(self), value)
                     else:
                         value = as_column(value)
                         self._data[arg] = value
@@ -2572,8 +2570,24 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
                 f"{num_cols * (num_cols > 0)}"
             )
 
+        # TODO: This check is currently necessary because
+        # _is_scalar_or_zero_d_array below will treat a length 1 pd.Categorical
+        # as a scalar and attempt to use column.full, which can't handle it.
+        # Maybe _is_scalar_or_zero_d_array should be changed, or maybe we just
+        # shouldn't support pd.Categorical at all, but those changes will at
+        # least require a deprecation cycle because we currently support
+        # inserting a pd.Categorical.
+        if isinstance(value, pd.Categorical):
+            value = cudf.core.column.categorical.pandas_categorical_as_column(
+                value
+            )
+
         if _is_scalar_or_zero_d_array(value):
-            value = utils.scalar_broadcast_to(value, len(self))
+            value = column.full(
+                len(self),
+                value,
+                "str" if libcudf.scalar._is_null_host_scalar(value) else None,
+            )
 
         if len(self) == 0:
             if isinstance(value, (pd.Series, Series)):

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -40,6 +40,7 @@
     as_column,
     build_categorical_column,
     deserialize_columns,
+    full,
     serialize_columns,
 )
 from cudf.core.column_accessor import ColumnAccessor
@@ -3655,9 +3656,9 @@ def _get_replacement_values_for_columns(
             values_columns = {
                 col: [value]
                 if _is_non_decimal_numeric_dtype(columns_dtype_map[col])
-                else cudf.utils.utils.scalar_broadcast_to(
+                else full(
+                    len(to_replace),
                     value,
-                    (len(to_replace),),
                     cudf.dtype(type(value)),
                 )
                 for col in columns_dtype_map

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -10,7 +10,7 @@
 from numbers import Integral
 from typing import Any, List, MutableMapping, Tuple, Union
 
-import cupy
+import cupy as cp
 import numpy as np
 import pandas as pd
 from pandas._config import get_option
@@ -29,11 +29,26 @@
     as_index,
 )
 from cudf.utils.docutils import doc_apply
-from cudf.utils.utils import (
-    NotIterable,
-    _cudf_nvtx_annotate,
-    _maybe_indices_to_slice,
-)
+from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
+
+
+def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
+    """Make a best effort to convert an array of indices into a python slice.
+
+    Parameters
+    ----------
+    indices
+        One-dimensional array of integer positions.
+
+    Returns
+    -------
+    slice or array
+        A slice selecting exactly the positions in ``indices`` when they form
+        an evenly spaced, non-negative run; otherwise ``indices`` unchanged.
+        Empty input, repeated positions (zero step), negative positions and
+        unevenly spaced positions are all returned unchanged because a slice
+        cannot represent them.
+    """
+    # TODO: improve efficiency by avoiding sync.
+    num_indices = len(indices)
+    if num_indices == 0:
+        return indices
+    start = indices[0].item()
+    if start < 0:
+        # A negative position means "from the end"; the slice bounds computed
+        # below would not select the same elements.
+        return indices
+    if num_indices == 1:
+        return slice(start, start + 1)
+    # Subtract host-side Python ints so unsigned dtypes cannot wrap around.
+    step = indices[1].item() - start
+    if step == 0:
+        # Repeated positions: slice() rejects a zero step.
+        return indices
+    last = start + step * (num_indices - 1)
+    if last < 0:
+        return indices
+    if num_indices > 2:
+        expected = cp.arange(start, last + step, step)
+        if not (indices == expected).all():
+            return indices
+    if step > 0:
+        # Keep the bounds historically produced for ascending input.
+        stop = last + 1 if num_indices == 2 else last + step
+    else:
+        stop = last + step
+        if stop < 0:
+            # A descending run that reaches position 0 needs an open-ended
+            # stop; a negative stop would be interpreted relative to the end.
+            stop = None
+    return slice(start, stop, step)
 
 
 class MultiIndex(Frame, BaseIndex, NotIterable):
@@ -1709,7 +1724,7 @@ def get_loc(self, key, method=None, tolerance=None):
             return true_inds
 
         # Not sorted and not unique. Return a boolean mask
-        mask = cupy.full(self._data.nrows, False)
+        mask = cp.full(self._data.nrows, False)
         mask[true_inds] = True
         return mask
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -70,8 +70,9 @@
     find_common_type,
     is_mixed_with_object_dtype,
     min_scalar_type,
+    to_cudf_compatible_scalar,
 )
-from cudf.utils.utils import _cudf_nvtx_annotate, to_cudf_compatible_scalar
+from cudf.utils.utils import _cudf_nvtx_annotate
 
 
 def _append_new_row_inplace(col: ColumnLike, value: ScalarLike):

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
@@ -363,6 +363,13 @@ def assert_column_memory_ne(
     raise AssertionError("lhs and rhs holds the same memory.")
 
 
+def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
+    """Wrap ``pd.Series`` with a float64 default dtype for empty data.
+
+    When ``dtype`` is not given and ``data`` is ``None`` or has length zero,
+    ``dtype`` is set to ``"float64"``. All other arguments are forwarded to
+    ``pd.Series`` unchanged.
+    """
+    if dtype is None and (data is None or len(data) == 0):
+        dtype = "float64"
+    # Forward the leading arguments positionally: unpacking ``*args`` after
+    # keyword arguments would bind the first extra positional to ``data`` a
+    # second time and raise TypeError.
+    return pd.Series(data, index, dtype, *args, **kwargs)
+
+
 parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
     "left_dtype,right_dtype",
     list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),

diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
@@ -1,17 +1,36 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 import numpy as np
 import pandas as pd
 import pytest
 
 import cudf
 from cudf.testing._utils import assert_eq
-from cudf.utils.utils import IS_NEP18_ACTIVE
 
-missing_arrfunc_cond = not IS_NEP18_ACTIVE
-missing_arrfunc_reason = "NEP-18 support is not available in NumPy"
 
-# Test implementation based on dask array test
-# https://github.com/dask/dask/blob/master/dask/array/tests/test_array_function.py
+# To determine if NEP18 is available in the current version of NumPy we simply
+# attempt to concatenate an object with `__array_function__` defined and see if
+# NumPy invokes the protocol or not. Taken from dask array
+# https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363
+# TODO: Unclear if this is still necessary. NEP 18 was introduced as the
+# default in 1.17 (https://github.com/numpy/numpy/releases/tag/v1.17.0) almost
+# 3 years ago, and it was originally introduced one version before in 1.16
+# (although not enabled by default then). Can we safely assume that testers
+# will have a sufficiently new version of numpy to run these tests?
+class _Test:
+    """Minimal probe object that only defines ``__array_function__``."""
+
+    def __array_function__(self, *args, **kwargs):
+        return True
+
+
+missing_arrfunc_reason = "NEP-18 support is not available in NumPy"
+
+# Presumably NumPy raises ValueError here when it does not dispatch to the
+# ``__array_function__`` hook; a successful call means the protocol is active.
+try:
+    np.concatenate([_Test()])
+    missing_arrfunc_cond = False
+except ValueError:
+    missing_arrfunc_cond = True
+
+# The probe class is only needed for the check above.
+del _Test
 
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -3662,9 +3662,7 @@ def test_all(data):
     # Pandas treats `None` in object type columns as True for some reason, so
     # replacing with `False`
     if np.array(data).ndim <= 1:
-        pdata = cudf.utils.utils._create_pandas_series(data=data).replace(
-            [None], False
-        )
+        pdata = pd.Series(data=data).replace([None], False)
         gdata = cudf.Series.from_pandas(pdata)
     else:
         pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False)
@@ -3715,7 +3713,7 @@ def test_all(data):
 @pytest.mark.parametrize("axis", [0, 1])
 def test_any(data, axis):
     if np.array(data).ndim <= 1:
-        pdata = cudf.utils.utils._create_pandas_series(data=data)
+        pdata = pd.Series(data=data)
         gdata = cudf.Series.from_pandas(pdata)
 
         if axis == 1:
@@ -4185,7 +4183,7 @@ def test_create_dataframe_column():
     ],
 )
 def test_series_values_host_property(data):
-    pds = cudf.utils.utils._create_pandas_series(data=data)
+    pds = pd.Series(data=data)
     gds = cudf.Series(data)
 
     np.testing.assert_array_equal(pds.values, gds.values_host)
@@ -4208,7 +4206,7 @@ def test_series_values_host_property(data):
     ],
 )
 def test_series_values_property(data):
-    pds = cudf.utils.utils._create_pandas_series(data=data)
+    pds = pd.Series(data=data)
     gds = cudf.Series(data)
     gds_vals = gds.values
     assert isinstance(gds_vals, cupy.ndarray)

diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing._utils import _create_pandas_series, assert_eq
 
 
 @pytest.mark.parametrize(
@@ -22,7 +22,7 @@
 @pytest.mark.parametrize("inplace", [True, False])
 def test_dropna_series(data, nulls, inplace):
 
-    psr = cudf.utils.utils._create_pandas_series(data=data)
+    psr = _create_pandas_series(data)
 
     if len(data) > 0:
         if nulls == "one":

diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
@@ -9,7 +9,11 @@
 
 import cudf
 from cudf import concat
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import (
+    _create_pandas_series,
+    assert_eq,
+    assert_exceptions_equal,
+)
 
 # TODO: PANDAS 1.0 support
 # Revisit drop_duplicates() tests to update parameters like ignore_index.
@@ -59,7 +63,7 @@ def test_duplicated_with_misspelled_column_name(subset):
     ],
 )
 def test_drop_duplicates_series(data, keep):
-    pds = cudf.utils.utils._create_pandas_series(data)
+    pds = _create_pandas_series(data)
     gds = cudf.from_pandas(pds)
 
     assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))