Merge remote-tracking branch 'upstream/branch-24.08' into api/series/alignment
mroeschke committed Jul 19, 2024
2 parents 029d776 + 4c46628 commit e593196
Showing 19 changed files with 994 additions and 73 deletions.
11 changes: 11 additions & 0 deletions cpp/include/cudf/binaryop.hpp
@@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,

namespace binops {

/**
* @brief Returns true if the binary operator is supported for the given input types.
*
* @param out The output data type
* @param lhs The left-hand cudf::data_type
* @param rhs The right-hand cudf::data_type
* @param op The binary operator
* @return true if the binary operator is supported for the given input types
*/
bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op);

/**
* @brief Computes output valid mask for op between a column and a scalar
*
7 changes: 6 additions & 1 deletion cpp/src/binaryop/binaryop.cpp
@@ -50,6 +50,11 @@
namespace cudf {
namespace binops {

bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op)
{
return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op);
}

/**
* @brief Computes output valid mask for op between a column and a scalar
*/
@@ -194,7 +199,7 @@ std::unique_ptr<column> binary_operation(LhsType const& lhs,
rmm::device_async_resource_ref mr)
{
if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument);

if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
output_type.id() == type_id::STRING and
4 changes: 2 additions & 2 deletions cpp/tests/binaryop/binop-verify-input-test.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Copyright 2018-2019 BlazingDB, Inc.
* Copyright 2018 Christian Noboa Mardini <[email protected]>
@@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)

EXPECT_THROW(cudf::binary_operation(
lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)),
cudf::logic_error);
std::invalid_argument);
}
64 changes: 63 additions & 1 deletion pyproject.toml
@@ -26,7 +26,69 @@ quiet-level = 3
line-length = 79

[tool.ruff.lint]
select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"]
typing-modules = ["cudf._typing"]
select = [
# pycodestyle Error
"E",
# Pyflakes
"F",
# pycodestyle Warning
"W",
# no-blank-line-before-function
"D201",
# one-blank-line-after-class
"D204",
# indent-with-spaces
"D206",
# under-indentation
"D207",
# over-indentation
"D208",
# new-line-after-last-paragraph
"D209",
# surrounding-whitespace
"D210",
# blank-line-before-class
"D211",
# section-not-over-indented
"D214",
# section-underline-not-over-indented
"D215",
# triple-single-quotes
"D300",
# escape-sequence-in-docstring
"D301",
# first-line-capitalized
"D403",
# capitalize-section-name
"D405",
# new-line-after-section-name
"D406",
# dashed-underline-after-section
"D407",
# section-underline-after-name
"D408",
# section-underline-matches-section-length
"D409",
# no-blank-line-after-section
"D410",
# no-blank-line-before-section
"D411",
# blank-lines-between-header-and-content
"D412",
# empty-docstring-section
"D414",
# overload-with-docstring
"D418",
# flake8-type-checking
"TCH",
# flake8-future-annotations
"FA",
# non-pep585-annotation
"UP006",
# non-pep604-annotation
"UP007"
]
ignore = [
# whitespace before :
"E203",
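As a quick orientation for the newly annotated rule list, here is a small illustration (a made-up function, not cudf code) of what a few of the pydocstyle codes above enforce:

def scale(values, factor):
    """Scale ``values`` by ``factor``.

    The docstring satisfies D201 (no blank line between the ``def`` line and
    the docstring), D403 (capitalized first word of the summary line), and
    D209 (closing quotes on their own line for a multi-line docstring).
    """
    return [v * factor for v in values]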
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
@@ -1,5 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool

from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator

from .column cimport Column
@@ -22,3 +24,10 @@ cpdef Column binary_operation(
binary_operator op,
DataType output_type
)

cpdef bool is_supported_operation(
DataType out,
DataType lhs,
DataType rhs,
binary_operator op
)
35 changes: 35 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
@@ -2,6 +2,7 @@

from cython.operator import dereference

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

@@ -84,3 +85,37 @@ cpdef Column binary_operation(
raise ValueError(f"Invalid arguments {lhs} and {rhs}")

return Column.from_libcudf(move(result))


cpdef bool is_supported_operation(
DataType out,
DataType lhs,
DataType rhs,
binary_operator op
):
"""Check if an operation is supported for the given data types.
For details, see :cpp:func:`is_supported_operation`.

Parameters
----------
out : DataType
The output data type.
lhs : DataType
The left hand side data type.
rhs : DataType
The right hand side data type.
op : BinaryOperator
The operation to check.

Returns
-------
bool
True if the operation is supported, False otherwise
"""

return cpp_binaryop.is_supported_operation(
out.c_obj,
lhs.c_obj,
rhs.c_obj,
op
)
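A minimal usage sketch of the new binding (an example added for illustration, not part of this commit; the module path and the DataType/TypeId/BinaryOperator spellings are assumed from the surrounding pylibcudf code on this branch, and only is_supported_operation itself comes from this diff):

# DataType and TypeId are assumed to be re-exported at the package top level.
from cudf._lib import pylibcudf as plc

int64 = plc.DataType(plc.TypeId.INT64)
string = plc.DataType(plc.TypeId.STRING)

# Integer ADD with an int64 output should be reported as supported.
plc.binaryop.is_supported_operation(int64, int64, int64, plc.binaryop.BinaryOperator.ADD)

# Mixing a string operand into an integer ADD should be reported as unsupported,
# letting callers check a combination before dispatching to binary_operation.
plc.binaryop.is_supported_operation(int64, string, int64, plc.binaryop.BinaryOperator.ADD)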
39 changes: 30 additions & 9 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
@@ -1,9 +1,11 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.exception_handler cimport cudf_exception_handler
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
@@ -19,48 +21,67 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
TRUE_DIV
FLOOR_DIV
MOD
PMOD
PYMOD
POW
INT_POW
LOG_BASE
ATAN2
SHIFT_LEFT
SHIFT_RIGHT
SHIFT_RIGHT_UNSIGNED
BITWISE_AND
BITWISE_OR
BITWISE_XOR
LOGICAL_AND
LOGICAL_OR
EQUAL
NOT_EQUAL
LESS
GREATER
LESS_EQUAL
GREATER_EQUAL
NULL_EQUALS
NULL_MAX
NULL_MIN
NULL_NOT_EQUALS
BITWISE_AND
BITWISE_OR
BITWISE_XOR
LOGICAL_AND
LOGICAL_OR
GENERIC_BINARY
NULL_LOGICAL_AND
NULL_LOGICAL_OR
INVALID_BINARY

cdef unique_ptr[column] binary_operation (
const scalar& lhs,
const column_view& rhs,
binary_operator op,
data_type output_type
) except +
) except +cudf_exception_handler

cdef unique_ptr[column] binary_operation (
const column_view& lhs,
const scalar& rhs,
binary_operator op,
data_type output_type
) except +
) except +cudf_exception_handler

cdef unique_ptr[column] binary_operation (
const column_view& lhs,
const column_view& rhs,
binary_operator op,
data_type output_type
) except +
) except +cudf_exception_handler

cdef unique_ptr[column] binary_operation (
const column_view& lhs,
const column_view& rhs,
const string& op,
data_type output_type
) except +
) except +cudf_exception_handler

cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil:
cdef bool is_supported_operation(
data_type output_type,
data_type lhs_type,
data_type rhs_type,
binary_operator op
) except +cudf_exception_handler
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_internals/where.py
@@ -47,7 +47,7 @@ def _check_and_cast_columns_with_other(

other_is_scalar = is_scalar(other)
if other_is_scalar:
if isinstance(other, float) and not np.isnan(other):
if isinstance(other, (float, np.floating)) and not np.isnan(other):
try:
is_safe = source_dtype.type(other) == other
except OverflowError:
10 changes: 5 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -1458,9 +1458,10 @@ def column_empty_like(
return column_empty(row_count, dtype, masked)


def _has_any_nan(arbitrary):
def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
"""Check if an object dtype Series or array contains NaN."""
return any(
((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
isinstance(x, (float, np.floating)) and np.isnan(x)
for x in np.asarray(arbitrary)
)

@@ -2312,9 +2313,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
# Notice, we can always cast pure null columns
not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
if len(not_null_col_dtypes) and all(
_is_non_decimal_numeric_dtype(dtyp)
and np.issubdtype(dtyp, np.datetime64)
for dtyp in not_null_col_dtypes
_is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M"
for dtype in not_null_col_dtypes
):
common_dtype = find_common_type(not_null_col_dtypes)
# Cast all columns to the common dtype
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
@@ -639,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase:
return cudf.core.tools.datetimes._isin_datetimelike(self, values)

def can_cast_safely(self, to_dtype: Dtype) -> bool:
if np.issubdtype(to_dtype, np.datetime64):
if to_dtype.kind == "M": # type: ignore[union-attr]
to_res, _ = np.datetime_data(to_dtype)
self_res, _ = np.datetime_data(self.dtype)

9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/lists.py
@@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
raise ValueError(
"lists_indices and list column is of different " "size."
)
if not _is_non_decimal_numeric_dtype(
lists_indices_col.children[1].dtype
) or not np.issubdtype(
lists_indices_col.children[1].dtype, np.integer
if (
not _is_non_decimal_numeric_dtype(
lists_indices_col.children[1].dtype
)
or lists_indices_col.children[1].dtype.kind not in "iu"
):
raise TypeError(
"lists_indices should be column of values of index types."
28 changes: 10 additions & 18 deletions python/cudf/cudf/core/column/numerical.py
@@ -225,25 +225,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
tmp = self if reflect else other
# Guard against division by zero for integers.
if (
(tmp.dtype.type in int_float_dtype_mapping)
and (tmp.dtype.type != np.bool_)
and (
(
(
np.isscalar(tmp)
or (
isinstance(tmp, cudf.Scalar)
# host to device copy
and tmp.is_valid()
)
)
and (0 == tmp)
)
or ((isinstance(tmp, NumericalColumn)) and (0 in tmp))
)
tmp.dtype.type in int_float_dtype_mapping
and tmp.dtype.kind != "b"
):
out_dtype = cudf.dtype("float64")

if isinstance(tmp, NumericalColumn) and 0 in tmp:
out_dtype = cudf.dtype("float64")
elif isinstance(tmp, cudf.Scalar):
if tmp.is_valid() and tmp == 0:
# tmp == 0 can return NA
out_dtype = cudf.dtype("float64")
elif is_scalar(tmp) and tmp == 0:
out_dtype = cudf.dtype("float64")
if op in {
"__lt__",
"__gt__",
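The reworked guard above is about dtype promotion: when an integer (non-boolean) operand is divided by zero, the output dtype is widened to float64 so the result can represent inf/NaN rather than being forced into an integer dtype. A rough sketch of the user-visible behavior this preserves (hedged: pandas-compatible semantics assumed, not re-run against this exact branch):

import cudf

s = cudf.Series([1, 2, 3], dtype="int64")

# Integer floor division by zero cannot be represented in an int64 result,
# so the guarded path promotes the output to float64 (inf where dividing by 0).
out = s // 0
print(out.dtype)  # expected: float64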