From 38c55e311a64561ca1ba5a503c7583b10ea2ed86 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Dec 2019 08:51:59 -0800 Subject: [PATCH 1/2] REF: refactor array casting out of Index.__new__ --- pandas/core/indexes/base.py | 119 +++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 35 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a3808f6f4a37e..64de39751d911 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -349,41 +349,8 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - return Float64Index(data, copy=copy, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": - pass - else: - data = data.astype(dtype) - else: - data = np.array(data, dtype=dtype, copy=copy) + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? # maybe coerce to a sub-class if is_signed_integer_dtype(data.dtype): @@ -5486,3 +5453,85 @@ def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: raise TypeError(f"{cls.__name__}.name must be a hashable type") return name + + +def _maybe_cast_with_dtype(data, dtype: np.dtype, copy: bool): + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _try_convert_to_int_array(data, copy, dtype): + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. + """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. + try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError From 970d047f65e4e9e6f1282c1aa48620032a9daf0f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Dec 2019 09:27:07 -0800 Subject: [PATCH 2/2] docstrings --- pandas/core/indexes/base.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 64de39751d911..aa41e2d591029 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5455,7 +5455,21 @@ def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: return name -def _maybe_cast_with_dtype(data, dtype: np.dtype, copy: bool): +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ # we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. '0' and 0.0 @@ -5495,7 +5509,9 @@ def _maybe_cast_with_dtype(data, dtype: np.dtype, copy: bool): return data -def _try_convert_to_int_array(data, copy, dtype): +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: """ Attempt to convert an array of data into an integer array.