From 8d2b2f7c83b5303e946fc8a2e3ab0a322e5976f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Nov 2022 16:53:59 -0800 Subject: [PATCH] API: allow mixed-datetimes-and-ints in to_datetime, DatetimeIndex (#49348) * API: allow mixed-datetimes-and-ints in to_datetime, DatetimeIndex * typo fixup * typo fixup, update import * mypy fixup --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslib.pyi | 1 - pandas/_libs/tslib.pyx | 9 ----- pandas/core/arrays/datetimes.py | 13 +------ pandas/core/dtypes/cast.py | 37 ++++++------------- .../tests/frame/methods/test_combine_first.py | 6 +-- pandas/tests/frame/test_constructors.py | 11 ++---- pandas/tests/tools/test_to_datetime.py | 11 ++++-- 8 files changed, 28 insertions(+), 61 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 561e39df60b64e..6046b6bf987cdf 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -335,6 +335,7 @@ Other API changes - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) - :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`) +- :func:`to_datetime` and :class:`DatetimeIndex` now allow sequences containing both ``datetime`` objects and numeric entries, matching :class:`Series` behavior (:issue:`49037`) - :func:`pandas.api.dtypes.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`) - Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`) - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index ac8d5bac7c6e76..f3a24a707c530a 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -24,7 +24,6 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., require_iso8601: bool = ..., - allow_mixed: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d7c4c022a25569..3104ecbc8bdb87 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -446,7 +446,6 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, bint require_iso8601=False, - bint allow_mixed=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -475,8 +474,6 @@ cpdef array_to_datetime( indicator whether the dates should be UTC require_iso8601 : bool, default False indicator whether the datetime string should be iso8601 - allow_mixed : bool, default False - Whether to allow mixed datetimes and integers. Returns ------- @@ -710,12 +707,6 @@ cpdef array_to_datetime( val = values[i] if is_integer_object(val) or is_float_object(val): result[i] = NPY_NAT - elif allow_mixed: - pass - elif is_raise: - raise ValueError("mixed datetimes and integers in passed array") - else: - return _array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: # GH#17697 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index bd134dc11201e9..64deba8a9d3ce9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1937,10 +1937,7 @@ def sequence_to_datetimes(data) -> DatetimeArray: """ Parse/convert the passed data to either DatetimeArray or np.ndarray[object]. """ - result, tz, freq = _sequence_to_dt64ns( - data, - allow_mixed=True, - ) + result, tz, freq = _sequence_to_dt64ns(data) unit = np.datetime_data(result.dtype)[0] dtype = tz_to_dtype(tz, unit) @@ -1956,7 +1953,6 @@ def _sequence_to_dt64ns( dayfirst: bool = False, yearfirst: bool = False, ambiguous: TimeAmbiguous = "raise", - allow_mixed: bool = False, ): """ Parameters @@ -1968,8 +1964,6 @@ def _sequence_to_dt64ns( yearfirst : bool, default False ambiguous : str, bool, or arraylike, default 'raise' See pandas._libs.tslibs.tzconversion.tz_localize_to_utc. - allow_mixed : bool, default False - Interpret integers as timestamps when datetime objects are also present. Returns ------- @@ -2020,7 +2014,6 @@ def _sequence_to_dt64ns( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - allow_mixed=allow_mixed, ) if tz and inferred_tz: # two timezones: convert to intended from base UTC repr @@ -2109,7 +2102,6 @@ def objects_to_datetime64ns( errors: DateTimeErrorChoices = "raise", require_iso8601: bool = False, allow_object: bool = False, - allow_mixed: bool = False, ): """ Convert data to array of timestamps. @@ -2126,8 +2118,6 @@ def objects_to_datetime64ns( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - allow_mixed : bool, default False - Interpret integers as timestamps when datetime objects are also present. Returns ------- @@ -2156,7 +2146,6 @@ def objects_to_datetime64ns( dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, - allow_mixed=allow_mixed, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2059ed05d95e3f..215b6c1021fd89 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -58,7 +58,6 @@ is_complex, is_complex_dtype, is_datetime64_dtype, - is_dtype_equal, is_extension_array_dtype, is_float, is_float_dtype, @@ -1222,7 +1221,7 @@ def maybe_cast_to_datetime( Caller is responsible for handling ExtensionDtype cases and non dt64/td64 cases. """ - from pandas.core.arrays.datetimes import sequence_to_datetimes + from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray assert dtype.kind in ["m", "M"] @@ -1238,36 +1237,24 @@ def maybe_cast_to_datetime( res = TimedeltaArray._from_sequence(value, dtype=dtype) return res - if is_datetime64_dtype(dtype): - # Incompatible types in assignment (expression has type - # "Union[dtype[Any], ExtensionDtype]", variable has type - # "Optional[dtype[Any]]") + else: + # error: Incompatible types in assignment (expression has type + # "Union[dtype[Any], ExtensionDtype]", variable has type "Optional[dtype[Any]]") dtype = _ensure_nanosecond_dtype(dtype) # type: ignore[assignment] - value = np.array(value, copy=False) - - # we have an array of datetime or timedeltas & nulls - if value.size or not is_dtype_equal(value.dtype, dtype): - _disallow_mismatched_datetimelike(value, dtype) - - dta = sequence_to_datetimes(value) - # GH 25843: Remove tz information since the dtype - # didn't specify one - - if dta.tz is not None: + try: + dta = DatetimeArray._from_sequence(value, dtype=dtype) + except ValueError as err: + # We can give a Series-specific exception message. + if "cannot supply both a tz and a timezone-naive dtype" in str(err): raise ValueError( "Cannot convert timezone-aware data to " "timezone-naive dtype. Use " "pd.Series(values).dt.tz_localize(None) instead." - ) - - # TODO(2.0): Do this astype in sequence_to_datetimes to - # avoid potential extra copy? - dta = dta.astype(dtype, copy=False) - return dta + ) from err + raise - # at this point we have converted or raised in all cases where we had a list - return cast(ArrayLike, value) + return dta def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray: diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 30aef0bc0ec980..e838c8fabf456a 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -6,10 +6,8 @@ from pandas.compat import pa_version_under7p0 from pandas.errors import PerformanceWarning -from pandas.core.dtypes.cast import ( - find_common_type, - is_dtype_equal, -) +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_dtype_equal import pandas as pd from pandas import ( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 033884b9ac57eb..c70268bd0aef20 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3021,14 +3021,11 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls): scalar = cls("NaT", "ns") dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls] - msg = "Cannot cast" if cls is np.datetime64: - msg = "|".join( - [ - r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]", - "Cannot cast", - ] - ) + msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + else: + msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]" + msg = "|".join(["Cannot cast", msg1]) with pytest.raises(TypeError, match=msg): constructor(scalar, dtype=dtype) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c3b4159c2cbfcd..27fe4e2d5e0b6d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1434,9 +1434,14 @@ def test_unit_mixed(self, cache, exp, arr): result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - msg = "mixed datetimes and integers in passed array" - with pytest.raises(ValueError, match=msg): - to_datetime(arr, errors="raise", cache=cache) + # GH#49037 pre-2.0 this raised, but it always worked with Series, + # was never clear why it was disallowed + result = to_datetime(arr, errors="raise", cache=cache) + expected = Index([Timestamp(x) for x in arr], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + result = DatetimeIndex(arr) + tm.assert_index_equal(result, expected) def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors