Skip to content

Commit

Permalink
BUG: Fix Series/DataFrame.rank(pct=True) with more than 2**24 rows (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jschendel authored and jreback committed Nov 14, 2018
1 parent 118ba81 commit 4476962
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,7 @@ Numeric
- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`)
- Bug in :meth:`Series.rpow` with object dtype returning ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
- :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`)
- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`)

Strings
^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
int tiebreak = 0
bint keep_na = 0
bint isnan
float count = 0.0
float64_t count = 0.0
tiebreak = tiebreakers[ties_method]

{{if dtype == 'float64'}}
Expand Down Expand Up @@ -228,7 +228,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
float64_t sum_ranks = 0
int tiebreak = 0
bint keep_na = 0
float count = 0.0
float64_t count = 0.0

tiebreak = tiebreakers[ties_method]

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,3 +309,10 @@ def test_rank_pct_true(self, method, exp):

expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)

def test_pct_max_many_rows(self):
# GH 18271
# Regression test: with more than 2**24 rows, rank(pct=True) used to
# produce percentages greater than 1.0.  Use 2**24 + 1 rows, the smallest
# row count past that threshold.
# NOTE(review): the threshold presumably comes from the Cython ``count``
# variable having been a C ``float``, which cannot represent every
# integer above 2**24 -- confirm against algos_rank_helper.pxi.in.
# Column 'A' is ascending and column 'B' is descending, so both sort
# orders are exercised.
df = DataFrame({'A': np.arange(2**24 + 1),
'B': np.arange(2**24 + 1, 0, -1)})
# Per-column maximum of the percentile ranks.
result = df.rank(pct=True).max()
# The largest percentile rank in every column must be exactly 1.
assert (result == 1).all()
7 changes: 7 additions & 0 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,3 +495,10 @@ def test_rank_first_pct(dtype, ser, exp):
result = s.rank(method='first', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


def test_pct_max_many_rows():
# GH 18271
# Regression test: with more than 2**24 rows, Series.rank(pct=True) used
# to produce percentages greater than 1.0.  2**24 + 1 is the smallest row
# count past that threshold.
s = Series(np.arange(2**24 + 1))
# Largest percentile rank over the whole Series.
result = s.rank(pct=True).max()
# The top-ranked element must map to exactly 1.
assert result == 1
9 changes: 9 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1462,6 +1462,15 @@ def test_too_many_ndims(self):
with pytest.raises(TypeError, match=msg):
algos.rank(arr)

# Two inputs, each with 2**24 + 1 rows: a 1-D array, and a 2-D array of
# shape (2**24 + 1, 2) built by reshaping 2**25 + 2 consecutive integers.
@pytest.mark.parametrize('values', [
np.arange(2**24 + 1),
np.arange(2**25 + 2).reshape(2**24 + 1, 2)],
ids=['1d', '2d'])
def test_pct_max_many_rows(self, values):
# GH 18271
# Regression test: with more than 2**24 rows, rank(pct=True) used to
# produce percentages greater than 1.0.
# .max() reduces over the whole result, so the 2-D case checks the
# largest percentile rank across both columns at once.
result = algos.rank(values, pct=True).max()
# The largest percentile rank must be exactly 1.
assert result == 1


def test_pad_backfill_object_segfault():

Expand Down

0 comments on commit 4476962

Please sign in to comment.