Skip to content

Commit

Permalink
API/BUG: infer_dtype_from_scalar with non-nano (#52212)
Browse files Browse the repository at this point in the history
* API/BUG: infer_dtype_from_scalar with non-nano

* update test

* xfail on 32bit

* fix xfail condition

* whatsnew

* xfail on windows
  • Loading branch information
jbrockmendel authored May 18, 2023
1 parent b2bb68a commit a2bb939
Show file tree
Hide file tree
Showing 23 changed files with 166 additions and 91 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ Datetimelike
- Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`)
- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`)
- Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
- Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`)
- Bug in parsing datetime strings with weekday but no day e.g. "2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`)
-

Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
(Period("2012-02-01", freq="D"), "period[D]"),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
(Timedelta(seconds=500), "timedelta64[ns]"),
]
Expand Down
24 changes: 18 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,18 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
if inferred == dtype:
return dtype, fv

return np.dtype("object"), fill_value
elif inferred.kind == "m":
# different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns]
# see if we can losslessly cast it to our dtype
unit = np.datetime_data(dtype)[0]
try:
td = Timedelta(fill_value).as_unit(unit, round_ok=False)
except OutOfBoundsTimedelta:
return _dtype_obj, fill_value
else:
return dtype, td.asm8

return _dtype_obj, fill_value

elif is_float(fill_value):
if issubclass(dtype.type, np.bool_):
Expand Down Expand Up @@ -775,8 +786,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
elif isinstance(val, (np.datetime64, dt.datetime)):
try:
val = Timestamp(val)
if val is not NaT:
val = val.as_unit("ns")
except OutOfBoundsDatetime:
return _dtype_obj, val

Expand All @@ -785,16 +794,19 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
dtype = val.dtype
# TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
else:
dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz)

elif isinstance(val, (np.timedelta64, dt.timedelta)):
try:
val = Timedelta(val)
except (OutOfBoundsTimedelta, OverflowError):
dtype = _dtype_obj
else:
dtype = np.dtype("m8[ns]")
val = np.timedelta64(val.value, "ns")
if val is NaT:
val = np.timedelta64("NaT", "ns")
else:
val = val.asm8
dtype = val.dtype

elif is_bool(val):
dtype = np.dtype(np.bool_)
Expand Down
36 changes: 25 additions & 11 deletions pandas/tests/dtypes/cast/test_infer_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,31 @@ def test_infer_dtype_from_complex(complex_dtype):
assert dtype == np.complex_


@pytest.mark.parametrize(
"data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)]
)
def test_infer_dtype_from_datetime(data):
dtype, val = infer_dtype_from_scalar(data)
def test_infer_dtype_from_datetime():
dt64 = np.datetime64(1, "ns")
dtype, val = infer_dtype_from_scalar(dt64)
assert dtype == "M8[ns]"

ts = Timestamp(1)
dtype, val = infer_dtype_from_scalar(ts)
assert dtype == "M8[ns]"

@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)])
def test_infer_dtype_from_timedelta(data):
dtype, val = infer_dtype_from_scalar(data)
dt = datetime(2000, 1, 1, 0, 0)
dtype, val = infer_dtype_from_scalar(dt)
assert dtype == "M8[us]"


def test_infer_dtype_from_timedelta():
td64 = np.timedelta64(1, "ns")
dtype, val = infer_dtype_from_scalar(td64)
assert dtype == "m8[ns]"

pytd = timedelta(1)
dtype, val = infer_dtype_from_scalar(pytd)
assert dtype == "m8[us]"

td = Timedelta(1)
dtype, val = infer_dtype_from_scalar(td)
assert dtype == "m8[ns]"


Expand Down Expand Up @@ -140,9 +154,9 @@ def test_infer_dtype_from_scalar_errors():
(b"foo", np.object_),
(1, np.int64),
(1.5, np.float_),
(np.datetime64("2016-01-01"), np.dtype("M8[ns]")),
(Timestamp("20160101"), np.dtype("M8[ns]")),
(Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]"),
(np.datetime64("2016-01-01"), np.dtype("M8[s]")),
(Timestamp("20160101"), np.dtype("M8[s]")),
(Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"),
],
)
def test_infer_dtype_from_scalar(value, expected):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ def test_setitem_single_column_mixed_datetime(self):
# check our dtypes
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")],
[np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")],
index=["foo", "bar", "baz", "timestamp"],
)
tm.assert_series_equal(result, expected)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_setitem_dt64_index_empty_columns(self):
def test_setitem_timestamp_empty_columns(self):
# GH#19843
df = DataFrame(index=range(3))
df["now"] = Timestamp("20130101", tz="UTC")
df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
Expand Down Expand Up @@ -234,7 +234,7 @@ def test_setitem_dict_preserves_dtypes(self):
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
],
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_get_numeric_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_get_numeric_data_preserve_dtype(self):
tm.assert_frame_equal(result, expected)

def test_get_numeric_data(self):
datetime64name = np.dtype("M8[ns]").name
datetime64name = np.dtype("M8[s]").name
objectname = np.dtype(np.object_).name

df = DataFrame(
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
import pytest

from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
from pandas.compat import (
IS64,
is_platform_windows,
)
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -118,15 +122,21 @@ class TestDataFrameSelectReindex:
# These are specific reindex-based tests; other indexing tests should go in
# test_indexing

@pytest.mark.xfail(
not IS64 or is_platform_windows(),
reason="Passes int32 values to DatetimeArray in make_na_array on "
"windows, 32bit linux builds",
)
@td.skip_array_manager_not_yet_implemented
def test_reindex_tzaware_fill_value(self):
# GH#52586
df = DataFrame([[1]])

ts = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific")
res = df.reindex([0, 1], axis=1, fill_value=ts)
assert res.dtypes[1] == pd.DatetimeTZDtype(tz="US/Pacific")
assert res.dtypes[1] == pd.DatetimeTZDtype(unit="s", tz="US/Pacific")
expected = DataFrame({0: [1], 1: [ts]})
expected[1] = expected[1].astype(res.dtypes[1])
tm.assert_frame_equal(res, expected)

per = ts.tz_localize(None).to_period("s")
Expand All @@ -137,8 +147,9 @@ def test_reindex_tzaware_fill_value(self):

interval = pd.Interval(ts, ts + pd.Timedelta(seconds=1))
res = df.reindex([0, 1], axis=1, fill_value=interval)
assert res.dtypes[1] == pd.IntervalDtype("datetime64[ns, US/Pacific]", "right")
assert res.dtypes[1] == pd.IntervalDtype("datetime64[s, US/Pacific]", "right")
expected = DataFrame({0: [1], 1: [interval]})
expected[1] = expected[1].astype(res.dtypes[1])
tm.assert_frame_equal(res, expected)

def test_reindex_copies(self):
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,14 +656,17 @@ def create_cols(name):
"foo", index=df_float.index, columns=create_cols("object")
)
df_dt = DataFrame(
Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
Timestamp("20010101").as_unit("ns"),
index=df_float.index,
columns=create_cols("date"),
)

# add in some nans
df_float.iloc[30:50, 1:3] = np.nan

# ## this is a bug in read_csv right now ####
# df_dt.loc[30:50,1:3] = np.nan
# FIXME: don't leave commented-out

df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

Expand Down Expand Up @@ -702,7 +705,9 @@ def test_to_csv_dups_cols(self):
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
df_dt = DataFrame(
Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3)
)
df = pd.concat(
[df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
)
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,20 +191,20 @@ def test_construction_with_mixed(self, float_string_frame):

# check dtypes
result = df.dtypes
expected = Series({"datetime64[ns]": 3})
expected = Series({"datetime64[us]": 3})

# mixed-type frames
float_string_frame["datetime"] = datetime.now()
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
assert float_string_frame["datetime"].dtype == "M8[ns]"
assert float_string_frame["timedelta"].dtype == "m8[ns]"
assert float_string_frame["datetime"].dtype == "M8[us]"
assert float_string_frame["timedelta"].dtype == "m8[us]"
result = float_string_frame.dtypes
expected = Series(
[np.dtype("float64")] * 4
+ [
np.dtype("object"),
np.dtype("datetime64[ns]"),
np.dtype("timedelta64[ns]"),
np.dtype("datetime64[us]"),
np.dtype("timedelta64[us]"),
],
index=list("ABCD") + ["foo", "datetime", "timedelta"],
)
Expand All @@ -230,7 +230,7 @@ def test_construction_with_conversions(self):
},
index=range(3),
)
assert expected.dtypes["dt1"] == "M8[ns]"
assert expected.dtypes["dt1"] == "M8[s]"
assert expected.dtypes["dt2"] == "M8[s]"

df = DataFrame(index=range(3))
Expand Down
43 changes: 28 additions & 15 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def test_constructor_from_2d_datetimearray(self, using_array_manager):
def test_constructor_dict_with_tzaware_scalar(self):
# GH#42505
dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
dt = dt.as_unit("ns")

df = DataFrame({"dt": dt}, index=[0])
expected = DataFrame({"dt": [dt]})
Expand Down Expand Up @@ -926,7 +927,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
],
)
Expand Down Expand Up @@ -1323,7 +1324,7 @@ def test_constructor_unequal_length_nested_list_column(self):
[[Timestamp("2021-01-01")]],
[{"x": Timestamp("2021-01-01")}],
{"x": [Timestamp("2021-01-01")]},
{"x": Timestamp("2021-01-01")},
{"x": Timestamp("2021-01-01").as_unit("ns")},
],
)
def test_constructor_one_element_data_list(self, data):
Expand Down Expand Up @@ -1814,7 +1815,6 @@ def test_constructor_single_value(self):
def test_constructor_with_datetimes(self):
intname = np.dtype(np.int_).name
floatname = np.dtype(np.float_).name
datetime64name = np.dtype("M8[ns]").name
objectname = np.dtype(np.object_).name

# single item
Expand All @@ -1832,7 +1832,7 @@ def test_constructor_with_datetimes(self):
expected = Series(
[np.dtype("int64")]
+ [np.dtype(objectname)] * 2
+ [np.dtype(datetime64name)] * 2,
+ [np.dtype("M8[s]"), np.dtype("M8[us]")],
index=list("ABCDE"),
)
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -1912,7 +1912,7 @@ def test_constructor_with_datetimes3(self):
df = DataFrame({"End Date": dt}, index=[0])
assert df.iat[0, 0] == dt
tm.assert_series_equal(
df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"})
)

df = DataFrame([{"End Date": dt}])
Expand Down Expand Up @@ -3047,15 +3047,22 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
with pytest.raises(TypeError, match=msg):
constructor(scalar, dtype=dtype)

@pytest.mark.xfail(
reason="Timestamp constructor has been updated to cast dt64 to non-nano, "
"but DatetimeArray._from_sequence has not"
)
@pytest.mark.parametrize("cls", [datetime, np.datetime64])
def test_from_out_of_bounds_ns_datetime(self, constructor, cls):
def test_from_out_of_bounds_ns_datetime(
self, constructor, cls, request, box, frame_or_series
):
# scalar that won't fit in nanosecond dt64, but will fit in microsecond
if box is list or (frame_or_series is Series and box is dict):
mark = pytest.mark.xfail(
reason="Timestamp constructor has been updated to cast dt64 to "
"non-nano, but DatetimeArray._from_sequence has not",
strict=True,
)
request.node.add_marker(mark)

scalar = datetime(9999, 1, 1)
exp_dtype = "M8[us]" # pydatetime objects default to this reso

if cls is np.datetime64:
scalar = np.datetime64(scalar, "D")
exp_dtype = "M8[s]" # closest reso to input
Expand All @@ -3076,13 +3083,19 @@ def test_out_of_s_bounds_datetime64(self, constructor):
dtype = tm.get_dtype(result)
assert dtype == object

@pytest.mark.xfail(
reason="TimedeltaArray constructor has been updated to cast td64 to non-nano, "
"but TimedeltaArray._from_sequence has not"
)
@pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
def test_from_out_of_bounds_ns_timedelta(self, constructor, cls):
def test_from_out_of_bounds_ns_timedelta(
self, constructor, cls, request, box, frame_or_series
):
# scalar that won't fit in nanosecond td64, but will fit in microsecond
if box is list or (frame_or_series is Series and box is dict):
mark = pytest.mark.xfail(
reason="TimedeltaArray constructor has been updated to cast td64 "
"to non-nano, but TimedeltaArray._from_sequence has not",
strict=True,
)
request.node.add_marker(mark)

scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
exp_dtype = "m8[us]" # smallest reso that fits
if cls is np.timedelta64:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,9 @@ def func_with_date(batch):
dfg_no_conversion_expected.index.name = "a"

dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1])
dfg_conversion_expected = DataFrame(
{"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
)
dfg_conversion_expected.index.name = "a"

tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_shift_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_group_shift_with_fill_value():

def test_group_shift_lose_timezone():
# GH 30134
now_dt = Timestamp.utcnow()
now_dt = Timestamp.utcnow().as_unit("ns")
df = DataFrame({"a": [1, 1], "date": now_dt})
result = df.groupby("a").shift(0).iloc[0]
expected = Series({"date": now_dt}, name=result.name)
Expand Down
Loading

0 comments on commit a2bb939

Please sign in to comment.