From 149138ef008a26bca04f4b698e15ff8fd3ce4663 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 31 Jan 2019 04:46:45 -0800 Subject: [PATCH] BUG: Better handle larger numbers in to_numeric (#24956) * BUG: Better handle larger numbers in to_numeric * Warn about lossiness when passing really large numbers that exceed (u)int64 ranges. * Coerce negative numbers to float when requested instead of crashing and returning object. * Consistently parse numbers as integers / floats, even if we know that the resulting container has to be float. This is to ensure consistent error behavior when inputs numbers are too large. Closes gh-24910. * MAINT: Address comments --- doc/source/whatsnew/v0.25.0.rst | 2 + pandas/_libs/lib.pyx | 25 +++-- pandas/core/tools/numeric.py | 8 ++ pandas/tests/tools/test_numeric.py | 148 ++++++++++++++++++++++++++--- 4 files changed, 158 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 24e3b42859416..939fb8b9415bd 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -110,6 +110,8 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`) +- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`) - - - diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4745916eb0ce2..4a3440e14ba14 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, except (ValueError, OverflowError, TypeError): pass - # otherwise, iterate and do full infererence + # Otherwise, iterate and do full inference. cdef: int status, maybe_int Py_ssize_t i, n = values.size @@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, else: seen.float_ = True - if val <= oINT64_MAX: + if oINT64_MIN <= val <= oINT64_MAX: ints[i] = val - if seen.sint_ and seen.uint_: + if val < oINT64_MIN or (seen.sint_ and seen.uint_): seen.float_ = True elif util.is_bool_object(val): @@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, else: seen.saw_int(as_int) - if not (seen.float_ or as_int in na_values): + if as_int not in na_values: if as_int < oINT64_MIN or as_int > oUINT64_MAX: - raise ValueError('Integer out of range.') + if seen.coerce_numeric: + seen.float_ = True + else: + raise ValueError("Integer out of range.") + else: + if as_int >= 0: + uints[i] = as_int - if as_int >= 0: - uints[i] = as_int - if as_int <= oINT64_MAX: - ints[i] = as_int + if as_int <= oINT64_MAX: + ints[i] = as_int seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) else: seen.float_ = True except (TypeError, ValueError) as e: if not seen.coerce_numeric: - raise type(e)(str(e) + ' at position {pos}'.format(pos=i)) + raise type(e)(str(e) + " at position {pos}".format(pos=i)) elif "uint64" in str(e): # Exception from check functions. raise + seen.saw_null() floats[i] = NaN diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 79d8ee38637f9..24f3e6753e500 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None): depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. + Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 3822170d884aa..97e1dc2f6aefc 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -4,11 +4,50 @@ from numpy import iinfo import pytest +import pandas.compat as compat + import pandas as pd from pandas import DataFrame, Index, Series, to_numeric from pandas.util import testing as tm +@pytest.fixture(params=[None, "ignore", "raise", "coerce"]) +def errors(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def signed(request): + return request.param + + +@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"]) +def transform(request): + return request.param + + +@pytest.fixture(params=[ + 47393996303418497800, + 100000000000000000000 +]) +def large_val(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def multiple_elts(request): + return request.param + + +@pytest.fixture(params=[ + (lambda x: Index(x, name="idx"), tm.assert_index_equal), + (lambda x: Series(x, name="ser"), tm.assert_series_equal), + (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal) +]) +def transform_assert_equal(request): + return request.param + + @pytest.mark.parametrize("input_kwargs,result_kwargs", [ (dict(), dict(dtype=np.int64)), (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)) @@ -172,7 +211,6 @@ def test_all_nan(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"]) def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) @@ -183,11 +221,100 @@ def test_type_check(errors): to_numeric(df, **kwargs) -@pytest.mark.parametrize("val", [ - 1, 1.1, "1", "1.1", -1.5, "-1.5" -]) -def test_scalar(val): - assert to_numeric(val) == float(val) +@pytest.mark.parametrize("val", [1, 1.1, 20001]) +def test_scalar(val, signed, transform): + val = -val if signed else val + assert to_numeric(transform(val)) == float(val) + + +def test_really_large_scalar(large_val, signed, transform, errors): + # see gh-24910 + kwargs = dict(errors=errors) if errors is not None else dict() + val = -large_val if signed else large_val + + val = transform(val) + val_is_string = isinstance(val, str) + + if val_is_string and errors in (None, "raise"): + msg = "Integer out of range. at position 0" + with pytest.raises(ValueError, match=msg): + to_numeric(val, **kwargs) + else: + expected = float(val) if (errors == "coerce" and + val_is_string) else val + assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected) + + +def test_really_large_in_arr(large_val, signed, transform, + multiple_elts, errors): + # see gh-24910 + kwargs = dict(errors=errors) if errors is not None else dict() + val = -large_val if signed else large_val + val = transform(val) + + extra_elt = "string" + arr = [val] + multiple_elts * [extra_elt] + + val_is_string = isinstance(val, str) + coercing = errors == "coerce" + + if errors in (None, "raise") and (val_is_string or multiple_elts): + if val_is_string: + msg = "Integer out of range. at position 0" + else: + msg = 'Unable to parse string "string" at position 1' + + with pytest.raises(ValueError, match=msg): + to_numeric(arr, **kwargs) + else: + result = to_numeric(arr, **kwargs) + + exp_val = float(val) if (coercing and val_is_string) else val + expected = [exp_val] + + if multiple_elts: + if coercing: + expected.append(np.nan) + exp_dtype = float + else: + expected.append(extra_elt) + exp_dtype = object + else: + exp_dtype = float if isinstance(exp_val, ( + int, compat.long, float)) else object + + tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) + + +def test_really_large_in_arr_consistent(large_val, signed, + multiple_elts, errors): + # see gh-24910 + # + # Even if we discover that we have to hold float, does not mean + # we should be lenient on subsequent elements that fail to be integer. + kwargs = dict(errors=errors) if errors is not None else dict() + arr = [str(-large_val if signed else large_val)] + + if multiple_elts: + arr.insert(0, large_val) + + if errors in (None, "raise"): + index = int(multiple_elts) + msg = "Integer out of range. at position {index}".format(index=index) + + with pytest.raises(ValueError, match=msg): + to_numeric(arr, **kwargs) + else: + result = to_numeric(arr, **kwargs) + + if errors == "coerce": + expected = [float(i) for i in arr] + exp_dtype = float + else: + expected = arr + exp_dtype = object + + tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) @pytest.mark.parametrize("errors,checker", [ @@ -205,15 +332,6 @@ def test_scalar_fail(errors, checker): assert checker(to_numeric(scalar, errors=errors)) -@pytest.fixture(params=[ - (lambda x: Index(x, name="idx"), tm.assert_index_equal), - (lambda x: Series(x, name="ser"), tm.assert_series_equal), - (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal) -]) -def transform_assert_equal(request): - return request.param - - @pytest.mark.parametrize("data", [ [1, 2, 3], [1., np.nan, 3, np.nan]