Add support for Series.between (#11051)

Resolves: #10443

This PR adds the `between` API for `Series`. Here is the reference pandas API:
https://pandas.pydata.org/docs/reference/api/pandas.Series.between.html

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #11051
rapidsai · Jun 6, 2022 · a187f44 · a187f44
1 parent becf5c3
commit a187f44
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 0 deletions.
diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
@@ -117,6 +117,7 @@ Computations / descriptive stats
    Series.all
    Series.any
    Series.autocorr
+   Series.between
    Series.clip
    Series.corr
    Series.count

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1510,6 +1510,95 @@ def fillna(
             value=value, method=method, axis=axis, inplace=inplace, limit=limit
         )
 
+    def between(self, left, right, inclusive="both") -> Series:
+        """
+        Return boolean Series equivalent to left <= series <= right.
+
+        This function returns a boolean vector containing `True` wherever the
+        corresponding Series element is between the boundary values `left` and
+        `right`. Null elements give `<NA>` in the result (see Examples).
+
+        Parameters
+        ----------
+        left : scalar or list-like
+            Left boundary.
+        right : scalar or list-like
+            Right boundary.
+        inclusive : {"both", "neither", "left", "right"}, default "both"
+            Include boundaries. Whether to set each bound as closed or open.
+
+        Returns
+        -------
+        Series
+            Boolean Series marking whether each element lies between
+            left and right, with bounds closed or open per `inclusive`.
+
+        See Also
+        --------
+        Series.gt : Greater than of series and other.
+        Series.lt : Less than of series and other.
+
+        Notes
+        -----
+        With ``inclusive="both"`` this is ``(left <= ser) & (ser <= right)``
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([2, 0, 4, 8, None])
+
+        Boundary values are included by default:
+
+        >>> s.between(1, 4)
+        0     True
+        1    False
+        2     True
+        3    False
+        4     <NA>
+        dtype: bool
+
+        With `inclusive` set to ``"neither"`` boundary values are excluded:
+
+        >>> s.between(1, 4, inclusive="neither")
+        0     True
+        1    False
+        2    False
+        3    False
+        4     <NA>
+        dtype: bool
+
+        `left` and `right` can be any scalar value:
+
+        >>> s = cudf.Series(['Alice', 'Bob', 'Carol', 'Eve'])
+        >>> s.between('Anna', 'Daniel')
+        0    False
+        1     True
+        2     True
+        3    False
+        dtype: bool
+        """
+        left_operand = left if is_scalar(left) else as_column(left)
+        right_operand = right if is_scalar(right) else as_column(right)
+
+        if inclusive == "both":  # left <= x <= right
+            lmask = self._column >= left_operand
+            rmask = self._column <= right_operand
+        elif inclusive == "left":  # left <= x < right
+            lmask = self._column >= left_operand
+            rmask = self._column < right_operand
+        elif inclusive == "right":  # left < x <= right
+            lmask = self._column > left_operand
+            rmask = self._column <= right_operand
+        elif inclusive == "neither":  # left < x < right
+            lmask = self._column > left_operand
+            rmask = self._column < right_operand
+        else:  # any other `inclusive` value raises ValueError
+            raise ValueError(
+                "Inclusive has to be either string of 'both', "
+                "'left', 'right', or 'neither'."
+            )
+        return self._from_data({self.name: lmask & rmask}, self._index)
+
     @_cudf_nvtx_annotate
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         if bool_only not in (None, True):

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -1813,3 +1813,54 @@ def test_series_digitize_invalid_bins():
         ValueError, match="`bins` cannot contain null entries."
     ):
         _ = s.digitize(bins)
+
+
+@pytest.mark.parametrize(
+    "data,left,right",
+    [
+        ([0, 1, 2, 3, 4, 5, 10], 0, 5),  # scalar bounds
+        ([0, 1, 2, 3, 4, 5, 10], 10, 1),  # left > right
+        ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2),
+        (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"),
+        (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"),
+        (
+            ["a", "few", "set", "of", "strings", "xyz", "abc"],
+            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
+            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
+        ),
+        ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1),  # NaN data, left > right
+    ],
+)
+@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
+def test_series_between(data, left, right, inclusive):
+    ps = pd.Series(data)  # pandas reference
+    gs = cudf.from_pandas(ps, nan_as_null=False)  # keep NaN as NaN (not null)
+
+    expected = ps.between(left, right, inclusive=inclusive)
+    actual = gs.between(left, right, inclusive=inclusive)
+
+    assert_eq(expected, actual)  # cudf result must match pandas
+
+
+@pytest.mark.parametrize(
+    "data,left,right",
+    [
+        ([0, 1, 2, None, 4, 5, 10], 0, 5),  # scalar bounds
+        ([0, 1, 2, 3, None, 5, 10], 10, 1),  # left > right
+        ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2),
+        (
+            ["a", "few", "set", None, "strings", "xyz", "abc"],
+            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
+            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
+def test_series_between_with_null(data, left, right, inclusive):
+    gs = cudf.Series(data)  # every data set above contains None entries
+    ps = gs.to_pandas(nullable=True)  # pandas reference built from cudf data
+
+    expected = ps.between(left, right, inclusive=inclusive)
+    actual = gs.between(left, right, inclusive=inclusive)
+
+    assert_eq(expected, actual.to_pandas(nullable=True))  # compare in pandas