Skip to content

Commit

Permalink
Add support for Series.between (#11051)
Browse files Browse the repository at this point in the history
Resolves: #10443 

This PR adds the `between` API for `Series`. Here is the reference pandas API: https://pandas.pydata.org/docs/reference/api/pandas.Series.between.html

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #11051
  • Loading branch information
galipremsagar authored Jun 6, 2022
1 parent becf5c3 commit a187f44
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ Computations / descriptive stats
Series.all
Series.any
Series.autocorr
Series.between
Series.clip
Series.corr
Series.count
Expand Down
89 changes: 89 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,95 @@ def fillna(
value=value, method=method, axis=axis, inplace=inplace, limit=limit
)

def between(self, left, right, inclusive="both") -> Series:
    """
    Return boolean Series equivalent to left <= series <= right.

    This function returns a boolean vector containing `True` wherever the
    corresponding Series element is between the boundary values `left` and
    `right`. Null values are not coerced to `False`: they propagate to
    the result as `<NA>` (see the examples below).

    Parameters
    ----------
    left : scalar or list-like
        Left boundary.
    right : scalar or list-like
        Right boundary.
    inclusive : {"both", "neither", "left", "right"}
        Include boundaries. Whether to set each bound as closed or open.

    Returns
    -------
    Series
        Series representing whether each element is between left and
        right, with each bound closed or open according to `inclusive`.

    Raises
    ------
    ValueError
        If `inclusive` is not one of ``"both"``, ``"left"``, ``"right"``
        or ``"neither"``.

    See Also
    --------
    Series.gt : Greater than of series and other.
    Series.lt : Less than of series and other.

    Notes
    -----
    With the default ``inclusive="both"`` this function is equivalent to
    ``(left <= ser) & (ser <= right)``

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([2, 0, 4, 8, None])

    Boundary values are included by default:

    >>> s.between(1, 4)
    0     True
    1    False
    2     True
    3    False
    4     <NA>
    dtype: bool

    With `inclusive` set to ``"neither"`` boundary values are excluded:

    >>> s.between(1, 4, inclusive="neither")
    0     True
    1    False
    2    False
    3    False
    4     <NA>
    dtype: bool

    `left` and `right` can be any scalar value:

    >>> s = cudf.Series(['Alice', 'Bob', 'Carol', 'Eve'])
    >>> s.between('Anna', 'Daniel')
    0    False
    1     True
    2     True
    3    False
    dtype: bool
    """
    # Reject a bad `inclusive` up front, before any list-like bound is
    # converted to a column, so invalid calls fail without doing that work.
    if inclusive not in ("both", "left", "right", "neither"):
        raise ValueError(
            "Inclusive has to be either string of 'both', "
            "'left', 'right', or 'neither'."
        )

    # Scalars are compared as-is; anything else is turned into a column.
    left_operand = left if is_scalar(left) else as_column(left)
    right_operand = right if is_scalar(right) else as_column(right)

    column = self._column
    # "both" and "left" close the lower bound; "both" and "right" close
    # the upper bound. Every other valid option leaves that side open.
    if inclusive in ("both", "left"):
        lmask = column >= left_operand
    else:
        lmask = column > left_operand
    if inclusive in ("both", "right"):
        rmask = column <= right_operand
    else:
        rmask = column < right_operand

    return self._from_data({self.name: lmask & rmask}, self._index)

@_cudf_nvtx_annotate
def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
if bool_only not in (None, True):
Expand Down
51 changes: 51 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1813,3 +1813,54 @@ def test_series_digitize_invalid_bins():
ValueError, match="`bins` cannot contain null entries."
):
_ = s.digitize(bins)


@pytest.mark.parametrize(
    "data,left,right",
    [
        ([0, 1, 2, 3, 4, 5, 10], 0, 5),
        ([0, 1, 2, 3, 4, 5, 10], 10, 1),
        ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2),
        (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"),
        (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"),
        (
            ["a", "few", "set", "of", "strings", "xyz", "abc"],
            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
        ),
        ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1),
    ],
)
@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
def test_series_between(data, left, right, inclusive):
    """Check ``Series.between`` against pandas for null-free input.

    Covers scalar and list-like bounds, numeric and string data, and every
    supported ``inclusive`` option. NaN values are kept as NaN (not turned
    into nulls) when the pandas Series is moved to cudf.
    """
    pandas_series = pd.Series(data)
    cudf_series = cudf.from_pandas(pandas_series, nan_as_null=False)

    # The pandas result is computed first and is the reference.
    assert_eq(
        pandas_series.between(left, right, inclusive=inclusive),
        cudf_series.between(left, right, inclusive=inclusive),
    )


@pytest.mark.parametrize(
    "data,left,right",
    [
        ([0, 1, 2, None, 4, 5, 10], 0, 5),
        ([0, 1, 2, 3, None, 5, 10], 10, 1),
        ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2),
        (
            ["a", "few", "set", None, "strings", "xyz", "abc"],
            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
        ),
    ],
)
@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
def test_series_between_with_null(data, left, right, inclusive):
    """Check ``Series.between`` on data containing nulls.

    Both sides are compared as pandas objects produced with
    ``to_pandas(nullable=True)``.
    """
    cudf_series = cudf.Series(data)
    pandas_series = cudf_series.to_pandas(nullable=True)

    reference = pandas_series.between(left, right, inclusive=inclusive)
    result = cudf_series.between(left, right, inclusive=inclusive)

    # Convert the cudf result the same way the input was converted.
    assert_eq(reference, result.to_pandas(nullable=True))

0 comments on commit a187f44

Please sign in to comment.