From c21b5e8d5c05b774b9d04e34dcb1ad30d4be38ef Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 12 Jul 2024 17:29:21 -0700
Subject: [PATCH 1/3] Type & reduce cupy usage

---
 python/cudf/cudf/core/_base_index.py      |  4 ++-
 python/cudf/cudf/core/column/column.py    |  8 +++---
 python/cudf/cudf/core/column/numerical.py | 10 ++-----
 python/cudf/cudf/core/cut.py              |  6 ++--
 python/cudf/cudf/core/dataframe.py        | 18 +++++++-----
 python/cudf/cudf/core/frame.py            |  6 ++--
 python/cudf/cudf/core/groupby/groupby.py  | 23 ++++++++-------
 python/cudf/cudf/core/index.py            | 34 ++++++++++++-----------
 python/cudf/cudf/core/multiindex.py       | 13 +++++----
 python/cudf/cudf/core/tools/datetimes.py  |  9 +++---
 10 files changed, 69 insertions(+), 62 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index e160fa697ee..9ba2d161619 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -38,6 +38,8 @@
 if TYPE_CHECKING:
     from collections.abc import Generator
 
+    import cupy
+
     from cudf.core.column_accessor import ColumnAccessor
 
 
@@ -2001,7 +2003,7 @@ def drop_duplicates(
             self._column_names,
         )
 
-    def duplicated(self, keep="first"):
+    def duplicated(self, keep="first") -> cupy.ndarray:
         """
         Indicate duplicate index values.
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f633d527681..fd3664ecac4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase:
         return result
 
     def indices_of(
-        self, value: ScalarLike | Self
+        self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:
         """
         Find locations of value in the column
@@ -735,10 +735,10 @@ def indices_of(
         -------
         Column of indices that match value
         """
-        if not isinstance(value, ColumnBase):
-            value = as_column([value], dtype=self.dtype)
+        if not is_scalar(value):
+            raise ValueError("value must be a scalar")
         else:
-            assert len(value) == 1
+            value = as_column(value, dtype=self.dtype, length=1)
         mask = libcudf.search.contains(value, self)
         return apply_boolean_mask(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index a0550bff72b..f2282ae76d8 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -5,7 +5,6 @@
 import functools
 from typing import TYPE_CHECKING, Any, Callable, Sequence, cast
 
-import cupy as cp
 import numpy as np
 import pandas as pd
 from typing_extensions import Self
@@ -13,7 +12,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
-from cudf._lib.types import size_type_dtype
 from cudf.api.types import (
     is_bool_dtype,
     is_float_dtype,
@@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn:
             and self.dtype.kind in {"c", "f"}
             and np.isnan(value)
         ):
-            return column.as_column(
-                cp.argwhere(
-                    cp.isnan(self.data_array_view(mode="read"))
-                ).flatten(),
-                dtype=size_type_dtype,
-            )
+            nan_col = libcudf.unary.is_nan(self)
+            return nan_col.indices_of(True)
         else:
             return super().indices_of(value)
 
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index d9f62f51f92..197f46ee9fe 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -188,9 +188,6 @@ def cut(
         # adjust bin edges decimal precision
         int_label_bins = np.around(bins, precision)
 
-    # the inputs is a column of the values in the array x
-    input_arr = as_column(x)
-
     # checking for the correct inclusivity values
     if right:
         closed = "right"
@@ -242,6 +239,9 @@ def cut(
                 labels if len(set(labels)) == len(labels) else None
             )
 
+    # the inputs is a column of the values in the array x
+    input_arr = as_column(x)
+
     if isinstance(bins, pd.IntervalIndex):
         # get the left and right edges of the bins as columns
         # we cannot typecast an IntervalIndex, so we need to
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f110b788789..0ee4bfd0ec5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -430,7 +430,7 @@ def _setitem_tuple_arg(self, key, value):
 
             else:
                 value = cupy.asarray(value)
-                if cupy.ndim(value) == 2:
+                if value.ndim == 2:
                     # If the inner dimension is 1, it's broadcastable to
                     # all columns of the dataframe.
                     indexed_shape = columns_df.loc[key[0]].shape
@@ -567,7 +567,7 @@ def _setitem_tuple_arg(self, key, value):
             # TODO: consolidate code path with identical counterpart
             # in `_DataFrameLocIndexer._setitem_tuple_arg`
             value = cupy.asarray(value)
-            if cupy.ndim(value) == 2:
+            if value.ndim == 2:
                 indexed_shape = columns_df.iloc[key[0]].shape
                 if value.shape[1] == 1:
                     if value.shape[0] != indexed_shape[0]:
@@ -2200,8 +2200,8 @@ def from_dict(
 
         orient = orient.lower()
         if orient == "index":
-            if len(data) > 0 and isinstance(
-                next(iter(data.values())), (cudf.Series, cupy.ndarray)
+            if isinstance(
+                next(iter(data.values()), None), (cudf.Series, cupy.ndarray)
             ):
                 result = cls(data).T
                 result.columns = (
@@ -5699,7 +5699,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
 
     @classmethod
     @_performance_tracking
-    def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
+    def _from_arrays(
+        cls,
+        data: np.ndarray | cupy.ndarray,
+        index=None,
+        columns=None,
+        nan_as_null=False,
+    ):
         """Convert a numpy/cupy array to DataFrame.
 
         Parameters
@@ -5717,8 +5723,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
         -------
         DataFrame
         """
-
-        data = cupy.asarray(data)
         if data.ndim != 1 and data.ndim != 2:
             raise ValueError(
                 f"records dimension expected 1 or 2 but found: {data.ndim}"
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 253d200f7d4..802751e47ad 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1189,7 +1189,7 @@ def searchsorted(
         side: Literal["left", "right"] = "left",
         ascending: bool = True,
         na_position: Literal["first", "last"] = "last",
-    ):
+    ) -> ScalarLike | cupy.ndarray:
         """Find indices where elements should be inserted to maintain order
 
         Parameters
@@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     @acquire_spill_lock()
     def _apply_cupy_ufunc_to_operands(
         self, ufunc, cupy_func, operands, **kwargs
-    ):
+    ) -> list[dict[Any, ColumnBase]]:
         # Note: There are some operations that may be supported by libcudf but
         # are not supported by pandas APIs. In particular, libcudf binary
         # operations support logical and/or operations as well as
@@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands(
         # without cupy.
 
         mask = None
-        data = [{} for _ in range(ufunc.nout)]
+        data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)]
         for name, (left, right, _, _) in operands.items():
             cupy_inputs = []
             for inp in (left, right) if ufunc.nin == 2 else (left,):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index eccb3acabf6..8659d7c2392 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -35,7 +35,12 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
-    from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
+    from cudf._typing import (
+        AggType,
+        DataFrameOrSeries,
+        MultiColumnAggType,
+        ScalarLike,
+    )
 
 
 def _deprecate_collect():
@@ -357,7 +362,7 @@ def groups(self):
         )
 
     @cached_property
-    def indices(self):
+    def indices(self) -> dict[ScalarLike, cp.ndarray]:
         """
         Dict {group name -> group indices}.
 
@@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True):
 
         if ascending:
             # Count ascending from 0 to num_groups - 1
-            group_ids = cudf.Series._from_data({None: cp.arange(num_groups)})
+            groups = range(num_groups)
         elif has_null_group:
             # Count descending from num_groups - 1 to 0, but subtract one more
             # for the null group making it num_groups - 2 to -1.
-            group_ids = cudf.Series._from_data(
-                {None: cp.arange(num_groups - 2, -2, -1)}
-            )
+            groups = range(num_groups - 2, -2, -1)
         else:
             # Count descending from num_groups - 1 to 0
-            group_ids = cudf.Series._from_data(
-                {None: cp.arange(num_groups - 1, -1, -1)}
-            )
+            groups = range(num_groups - 1, -1, -1)
+
+        group_ids = cudf.Series._from_data({None: as_column(groups)})
 
         if has_null_group:
             group_ids.iloc[-1] = cudf.NA
@@ -1713,7 +1716,7 @@ def rolling_avg(val, avg):
         return grouped_values.apply_chunks(function, **kwargs)
 
     @_performance_tracking
-    def _broadcast(self, values):
+    def _broadcast(self, values: cudf.Series) -> cudf.Series:
         """
         Broadcast the results of an aggregation to the group
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b398ee2343e..4164f981fca 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass):
 
 def _lexsorted_equal_range(
     idx: Index | cudf.MultiIndex,
-    key_as_table: Frame,
+    keys: list[ColumnBase],
     is_sorted: bool,
 ) -> tuple[int, int, ColumnBase | None]:
     """Get equal range for key in lexicographically sorted index. If index
@@ -118,13 +118,13 @@ def _lexsorted_equal_range(
         sort_vals = idx
     lower_bound = search_sorted(
         [*sort_vals._data.columns],
-        [*key_as_table._columns],
+        keys,
         side="left",
         ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
     upper_bound = search_sorted(
         [*sort_vals._data.columns],
-        [*key_as_table._columns],
+        keys,
         side="right",
         ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
@@ -260,7 +260,9 @@ def searchsorted(
         ), "Invalid ascending flag"
         return search_range(value, self._range, side=side)
 
-    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
+    def factorize(
+        self, sort: bool = False, use_na_sentinel: bool = True
+    ) -> tuple[cupy.ndarray, Self]:
         if sort and self.step < 0:
             codes = cupy.arange(len(self) - 1, -1, -1)
             uniques = self[::-1]
@@ -753,15 +755,16 @@ def difference(self, other, sort=None):
             super().difference(other, sort=sort)
         )
 
-    def _try_reconstruct_range_index(self, index):
-        if isinstance(index, RangeIndex) or index.dtype.kind == "f":
+    def _try_reconstruct_range_index(
+        self, index: BaseIndex
+    ) -> Self | BaseIndex:
+        if isinstance(index, RangeIndex) or index.dtype.kind not in "iu":
             return index
         # Evenly spaced values can return a
         # RangeIndex instead of a materialized Index.
-        if not index._column.has_nulls():
+        if not index._column.has_nulls():  # type: ignore[attr-defined]
             uniques = cupy.unique(cupy.diff(index.values))
-            if len(uniques) == 1 and uniques[0].get() != 0:
-                diff = uniques[0].get()
+            if len(uniques) == 1 and (diff := uniques[0].get()) != 0:
                 new_range = range(index[0], index[-1] + diff, diff)
                 return type(self)(new_range, name=index.name)
         return index
@@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         return _return_get_indexer_result(result_series.to_cupy())
 
     @_performance_tracking
-    def get_loc(self, key):
+    def get_loc(self, key) -> int | slice | cupy.ndarray:
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
 
@@ -1317,9 +1320,8 @@ def get_loc(self, key):
             self.is_monotonic_increasing or self.is_monotonic_decreasing
         )
 
-        target_as_table = cudf.core.frame.Frame({"None": as_column([key])})
         lower_bound, upper_bound, sort_inds = _lexsorted_equal_range(
-            self, target_as_table, is_sorted
+            self, [as_column([key])], is_sorted
         )
 
         if lower_bound == upper_bound:
@@ -1330,7 +1332,7 @@ def get_loc(self, key):
             return (
                 lower_bound
                 if is_sorted
-                else sort_inds.element_indexing(lower_bound)
+                else sort_inds.element_indexing(lower_bound)  # type: ignore[union-attr]
             )
 
         if is_sorted:
@@ -1339,8 +1341,8 @@ def get_loc(self, key):
             return slice(lower_bound, upper_bound)
 
         # Not sorted and not unique. Return a boolean mask
-        mask = cupy.full(self._data.nrows, False)
-        true_inds = sort_inds.slice(lower_bound, upper_bound).values
+        mask = cupy.full(len(self), False)
+        true_inds = sort_inds.slice(lower_bound, upper_bound).values  # type: ignore[union-attr]
         mask[true_inds] = True
         return mask
 
@@ -2076,7 +2078,7 @@ def day_of_year(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def is_leap_year(self):
+    def is_leap_year(self) -> cupy.ndarray:
         """
         Boolean indicator if the date belongs to a leap year.
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 6503dae6ff5..3ed72ff812a 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1926,17 +1926,18 @@ def get_loc(self, key):
 
         # Handle partial key search. If length of `key` is less than `nlevels`,
         # Only search levels up to `len(key)` level.
-        key_as_table = cudf.core.frame.Frame(
-            {i: column.as_column(k, length=1) for i, k in enumerate(key)}
-        )
         partial_index = self.__class__._from_data(
-            data=self._data.select_by_index(slice(key_as_table._num_columns))
+            data=self._data.select_by_index(slice(len(key)))
         )
         (
             lower_bound,
             upper_bound,
             sort_inds,
-        ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted)
+        ) = _lexsorted_equal_range(
+            partial_index,
+            [column.as_column(k, length=1) for k in key],
+            is_sorted,
+        )
 
         if lower_bound == upper_bound:
             raise KeyError(key)
@@ -1961,7 +1962,7 @@ def get_loc(self, key):
             return true_inds
 
         # Not sorted and not unique. Return a boolean mask
-        mask = cp.full(self._data.nrows, False)
+        mask = cp.full(len(self), False)
         mask[true_inds] = True
         return mask
 
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 064e8fc667d..c6e2b5d10e1 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -6,7 +6,6 @@
 import warnings
 from typing import Literal, Sequence
 
-import cupy as cp
 import numpy as np
 import pandas as pd
 import pandas.tseries.offsets as pd_offset
@@ -894,7 +893,7 @@ def date_range(
         # integers and divide the number range evenly with `periods` elements.
         start = cudf.Scalar(start, dtype=dtype).value.astype("int64")
         end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
-        arr = cp.linspace(start=start, stop=end, num=periods)
+        arr = np.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
         return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz)
 
@@ -991,8 +990,10 @@ def date_range(
         stop = end_estim.astype("int64")
         start = start.value.astype("int64")
         step = _offset_to_nanoseconds_lower_bound(offset)
-        arr = cp.arange(start=start, stop=stop, step=step, dtype="int64")
-        res = cudf.core.column.as_column(arr).astype("datetime64[ns]")
+        arr = range(int(start), int(stop), step)
+        res = cudf.core.column.as_column(arr, dtype="int64").astype(
+            "datetime64[ns]"
+        )
 
     return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize(
         tz

From 3a1ddc3623a57182b72ebb4d0c0c7641657b7e0d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 13:20:41 -0700
Subject: [PATCH 2/3] Fix indices_of for datetime

---
 python/cudf/cudf/core/column/datetime.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 214e84028d2..03b404dded5 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -632,9 +632,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
     def indices_of(
         self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:
-        value = column.as_column(
-            pd.to_datetime(value), dtype=self.dtype
-        ).astype("int64")
+        value = (
+            pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64")
+        )
         return self.astype("int64").indices_of(value)
 
     @property

From a23b3b58fac6bb4758454501204f0602fde7fd7d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 15:59:34 -0700
Subject: [PATCH 3/3] Remove xfail

---
 python/cudf/cudf/tests/test_datetime.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 092e9790c63..7ab9ff2ef23 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods):
     )
 
 
-def test_date_range_start_end_freq(request, start, end, freq):
-    request.applymarker(
-        pytest.mark.xfail(
-            condition=(
-                start == "1831-05-08 15:23:21"
-                and end == "1996-11-21 04:05:30"
-                and freq == "110546789ms"
-            ),
-            reason="https://github.com/rapidsai/cudf/issues/12133",
-        )
-    )
-
+def test_date_range_start_end_freq(start, end, freq):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
     else:
@@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq):
     )
 
 
-def test_date_range_start_freq_periods(request, start, freq, periods):
+def test_date_range_start_freq_periods(start, freq, periods):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
     else: