diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 2e3c15eca972f..b6fdbec68d776 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -25,6 +25,8 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`) - Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`) - Bug in :class:`Series` constructor not accepting a ``dask.Array`` (:issue:`38645`) +- Fixed regression in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) + .. --------------------------------------------------------------------------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 014a702618bda..7414fdf190936 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,9 +194,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) - if not cache_array.is_unique: - # GH#39882 in case of None and NaT we get duplicates - cache_array = cache_array.drop_duplicates() + # GH#39882 and GH#35888 in case of None and NaT we get duplicates + if not cache_array.index.is_unique: + cache_array = cache_array[~cache_array.index.duplicated()] return cache_array diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 121ca99785831..9da7951c199ca 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,18 +957,40 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected - def test_convert_object_to_datetime_with_cache(self): + @pytest.mark.parametrize( + "datetimelikes,expected_values", + ( + ( + (None, np.nan) + (NaT,) * start_caching_at, + (NaT,) * (start_caching_at + 2), + ), + ( + (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + ), + ( + (None,) + + (NaT,) * start_caching_at + + ("2012 July 26", Timestamp("2012-07-26")), + (NaT,) * (start_caching_at + 1) + + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + ), + ), + ) + def test_convert_object_to_datetime_with_cache( + self, datetimelikes, expected_values + ): # GH#39882 ser = Series( - [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], + datetimelikes, dtype="object", ) - result = to_datetime(ser, errors="coerce") - expected = Series( - [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + result_series = to_datetime(ser, errors="coerce") + expected_series = Series( + expected_values, dtype="datetime64[ns]", ) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_series, expected_series) @pytest.mark.parametrize( "date, format",