Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API/BUG: infer_dtype_from_scalar with non-nano #52212

Merged
merged 14 commits on
May 18, 2023
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
(Period("2012-02-01", freq="D"), "period[D]"),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
(Timedelta(seconds=500), "timedelta64[ns]"),
]
Expand Down
24 changes: 18 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,18 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
if inferred == dtype:
return dtype, fv

return np.dtype("object"), fill_value
elif inferred.kind == "m":
# different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns]
# see if we can losslessly cast it to our dtype
unit = np.datetime_data(dtype)[0]
try:
td = Timedelta(fill_value).as_unit(unit, round_ok=False)
except OutOfBoundsTimedelta:
return _dtype_obj, fill_value
else:
return dtype, td.asm8

return _dtype_obj, fill_value

elif is_float(fill_value):
if issubclass(dtype.type, np.bool_):
Expand Down Expand Up @@ -775,8 +786,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
elif isinstance(val, (np.datetime64, dt.datetime)):
try:
val = Timestamp(val)
if val is not NaT:
val = val.as_unit("ns")
except OutOfBoundsDatetime:
return _dtype_obj, val

Expand All @@ -785,16 +794,19 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
dtype = val.dtype
# TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
else:
dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz)

elif isinstance(val, (np.timedelta64, dt.timedelta)):
try:
val = Timedelta(val)
except (OutOfBoundsTimedelta, OverflowError):
dtype = _dtype_obj
else:
dtype = np.dtype("m8[ns]")
val = np.timedelta64(val.value, "ns")
if val is NaT:
val = np.timedelta64("NaT", "ns")
else:
val = val.asm8
dtype = val.dtype

elif is_bool(val):
dtype = np.dtype(np.bool_)
Expand Down
36 changes: 25 additions & 11 deletions pandas/tests/dtypes/cast/test_infer_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,31 @@ def test_infer_dtype_from_complex(complex_dtype):
assert dtype == np.complex_


@pytest.mark.parametrize(
"data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)]
)
def test_infer_dtype_from_datetime(data):
dtype, val = infer_dtype_from_scalar(data)
def test_infer_dtype_from_datetime():
dt64 = np.datetime64(1, "ns")
dtype, val = infer_dtype_from_scalar(dt64)
assert dtype == "M8[ns]"

ts = Timestamp(1)
dtype, val = infer_dtype_from_scalar(ts)
assert dtype == "M8[ns]"

@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)])
def test_infer_dtype_from_timedelta(data):
dtype, val = infer_dtype_from_scalar(data)
dt = datetime(2000, 1, 1, 0, 0)
dtype, val = infer_dtype_from_scalar(dt)
assert dtype == "M8[us]"


def test_infer_dtype_from_timedelta():
td64 = np.timedelta64(1, "ns")
dtype, val = infer_dtype_from_scalar(td64)
assert dtype == "m8[ns]"

pytd = timedelta(1)
dtype, val = infer_dtype_from_scalar(pytd)
assert dtype == "m8[us]"

td = Timedelta(1)
dtype, val = infer_dtype_from_scalar(td)
assert dtype == "m8[ns]"


Expand Down Expand Up @@ -140,9 +154,9 @@ def test_infer_dtype_from_scalar_errors():
(b"foo", np.object_),
(1, np.int64),
(1.5, np.float_),
(np.datetime64("2016-01-01"), np.dtype("M8[ns]")),
(Timestamp("20160101"), np.dtype("M8[ns]")),
(Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]"),
(np.datetime64("2016-01-01"), np.dtype("M8[s]")),
(Timestamp("20160101"), np.dtype("M8[s]")),
(Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"),
],
)
def test_infer_dtype_from_scalar(value, expected):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ def test_setitem_single_column_mixed_datetime(self):
# check our dtypes
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")],
[np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")],
index=["foo", "bar", "baz", "timestamp"],
)
tm.assert_series_equal(result, expected)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_setitem_dt64_index_empty_columns(self):
def test_setitem_timestamp_empty_columns(self):
# GH#19843
df = DataFrame(index=range(3))
df["now"] = Timestamp("20130101", tz="UTC")
df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
Expand Down Expand Up @@ -234,7 +234,7 @@ def test_setitem_dict_preserves_dtypes(self):
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
],
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_get_numeric_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_get_numeric_data_preserve_dtype(self):
tm.assert_frame_equal(result, expected)

def test_get_numeric_data(self):
datetime64name = np.dtype("M8[ns]").name
datetime64name = np.dtype("M8[s]").name
objectname = np.dtype(np.object_).name

df = DataFrame(
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
from pandas.compat import IS64
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -118,15 +119,21 @@ class TestDataFrameSelectReindex:
# These are specific reindex-based tests; other indexing tests should go in
# test_indexing

@pytest.mark.xfail(
not IS64,
reason="Passes int32 values to DatetimeArray in make_na_array on "
"windows, 32bit linux builds",
)
@td.skip_array_manager_not_yet_implemented
def test_reindex_tzaware_fill_value(self):
# GH#52586
df = DataFrame([[1]])

ts = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific")
res = df.reindex([0, 1], axis=1, fill_value=ts)
assert res.dtypes[1] == pd.DatetimeTZDtype(tz="US/Pacific")
assert res.dtypes[1] == pd.DatetimeTZDtype(unit="s", tz="US/Pacific")
expected = DataFrame({0: [1], 1: [ts]})
expected[1] = expected[1].astype(res.dtypes[1])
tm.assert_frame_equal(res, expected)

per = ts.tz_localize(None).to_period("s")
Expand All @@ -137,8 +144,9 @@ def test_reindex_tzaware_fill_value(self):

interval = pd.Interval(ts, ts + pd.Timedelta(seconds=1))
res = df.reindex([0, 1], axis=1, fill_value=interval)
assert res.dtypes[1] == pd.IntervalDtype("datetime64[ns, US/Pacific]", "right")
assert res.dtypes[1] == pd.IntervalDtype("datetime64[s, US/Pacific]", "right")
expected = DataFrame({0: [1], 1: [interval]})
expected[1] = expected[1].astype(res.dtypes[1])
tm.assert_frame_equal(res, expected)

def test_reindex_copies(self):
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,14 +656,17 @@ def create_cols(name):
"foo", index=df_float.index, columns=create_cols("object")
)
df_dt = DataFrame(
Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
Timestamp("20010101").as_unit("ns"),
index=df_float.index,
columns=create_cols("date"),
)

# add in some nans
df_float.iloc[30:50, 1:3] = np.nan

# ## this is a bug in read_csv right now ####
# df_dt.loc[30:50,1:3] = np.nan
# FIXME: don't leave commented-out

df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

Expand Down Expand Up @@ -702,7 +705,9 @@ def test_to_csv_dups_cols(self):
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
df_dt = DataFrame(
Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3)
)
df = pd.concat(
[df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
)
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,20 +191,20 @@ def test_construction_with_mixed(self, float_string_frame):

# check dtypes
result = df.dtypes
expected = Series({"datetime64[ns]": 3})
expected = Series({"datetime64[us]": 3})

# mixed-type frames
float_string_frame["datetime"] = datetime.now()
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
assert float_string_frame["datetime"].dtype == "M8[ns]"
assert float_string_frame["timedelta"].dtype == "m8[ns]"
assert float_string_frame["datetime"].dtype == "M8[us]"
assert float_string_frame["timedelta"].dtype == "m8[us]"
result = float_string_frame.dtypes
expected = Series(
[np.dtype("float64")] * 4
+ [
np.dtype("object"),
np.dtype("datetime64[ns]"),
np.dtype("timedelta64[ns]"),
np.dtype("datetime64[us]"),
np.dtype("timedelta64[us]"),
],
index=list("ABCD") + ["foo", "datetime", "timedelta"],
)
Expand All @@ -230,7 +230,7 @@ def test_construction_with_conversions(self):
},
index=range(3),
)
assert expected.dtypes["dt1"] == "M8[ns]"
assert expected.dtypes["dt1"] == "M8[s]"
assert expected.dtypes["dt2"] == "M8[s]"

df = DataFrame(index=range(3))
Expand Down
43 changes: 28 additions & 15 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def test_constructor_from_2d_datetimearray(self, using_array_manager):
def test_constructor_dict_with_tzaware_scalar(self):
# GH#42505
dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
dt = dt.as_unit("ns")

df = DataFrame({"dt": dt}, index=[0])
expected = DataFrame({"dt": [dt]})
Expand Down Expand Up @@ -926,7 +927,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
DatetimeTZDtype(unit="s", tz="US/Eastern"),
),
],
)
Expand Down Expand Up @@ -1323,7 +1324,7 @@ def test_constructor_unequal_length_nested_list_column(self):
[[Timestamp("2021-01-01")]],
[{"x": Timestamp("2021-01-01")}],
{"x": [Timestamp("2021-01-01")]},
{"x": Timestamp("2021-01-01")},
{"x": Timestamp("2021-01-01").as_unit("ns")},
],
)
def test_constructor_one_element_data_list(self, data):
Expand Down Expand Up @@ -1814,7 +1815,6 @@ def test_constructor_single_value(self):
def test_constructor_with_datetimes(self):
intname = np.dtype(np.int_).name
floatname = np.dtype(np.float_).name
datetime64name = np.dtype("M8[ns]").name
objectname = np.dtype(np.object_).name

# single item
Expand All @@ -1832,7 +1832,7 @@ def test_constructor_with_datetimes(self):
expected = Series(
[np.dtype("int64")]
+ [np.dtype(objectname)] * 2
+ [np.dtype(datetime64name)] * 2,
+ [np.dtype("M8[s]"), np.dtype("M8[us]")],
index=list("ABCDE"),
)
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -1912,7 +1912,7 @@ def test_constructor_with_datetimes3(self):
df = DataFrame({"End Date": dt}, index=[0])
assert df.iat[0, 0] == dt
tm.assert_series_equal(
df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"})
)

df = DataFrame([{"End Date": dt}])
Expand Down Expand Up @@ -3047,15 +3047,22 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
with pytest.raises(TypeError, match=msg):
constructor(scalar, dtype=dtype)

@pytest.mark.xfail(
reason="Timestamp constructor has been updated to cast dt64 to non-nano, "
"but DatetimeArray._from_sequence has not"
)
@pytest.mark.parametrize("cls", [datetime, np.datetime64])
def test_from_out_of_bounds_ns_datetime(self, constructor, cls):
def test_from_out_of_bounds_ns_datetime(
self, constructor, cls, request, box, frame_or_series
):
# scalar that won't fit in nanosecond dt64, but will fit in microsecond
if box is list or (frame_or_series is Series and box is dict):
mark = pytest.mark.xfail(
reason="Timestamp constructor has been updated to cast dt64 to "
"non-nano, but DatetimeArray._from_sequence has not",
strict=True,
)
request.node.add_marker(mark)

scalar = datetime(9999, 1, 1)
exp_dtype = "M8[us]" # pydatetime objects default to this reso

if cls is np.datetime64:
scalar = np.datetime64(scalar, "D")
exp_dtype = "M8[s]" # closest reso to input
Expand All @@ -3076,13 +3083,19 @@ def test_out_of_s_bounds_datetime64(self, constructor):
dtype = tm.get_dtype(result)
assert dtype == object

@pytest.mark.xfail(
reason="TimedeltaArray constructor has been updated to cast td64 to non-nano, "
"but TimedeltaArray._from_sequence has not"
)
@pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
def test_from_out_of_bounds_ns_timedelta(self, constructor, cls):
def test_from_out_of_bounds_ns_timedelta(
self, constructor, cls, request, box, frame_or_series
):
# scalar that won't fit in nanosecond td64, but will fit in microsecond
if box is list or (frame_or_series is Series and box is dict):
mark = pytest.mark.xfail(
reason="TimedeltaArray constructor has been updated to cast td64 "
"to non-nano, but TimedeltaArray._from_sequence has not",
strict=True,
)
request.node.add_marker(mark)

scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
exp_dtype = "m8[us]" # smallest reso that fits
if cls is np.timedelta64:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,9 @@ def func_with_date(batch):
dfg_no_conversion_expected.index.name = "a"

dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1])
dfg_conversion_expected = DataFrame(
{"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
)
dfg_conversion_expected.index.name = "a"

tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_shift_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_group_shift_with_fill_value():

def test_group_shift_lose_timezone():
# GH 30134
now_dt = Timestamp.utcnow()
now_dt = Timestamp.utcnow().as_unit("ns")
df = DataFrame({"a": [1, 1], "date": now_dt})
result = df.groupby("a").shift(0).iloc[0]
expected = Series({"date": now_dt}, name=result.name)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,8 @@ def test_groupby_max_datetime64(self):
# GH 5869
# datetimelike dtype conversion from int
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
expected = df.groupby("A")["A"].apply(lambda x: x.max())
# TODO: can we retain second reso in .apply here?
expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
result = df.groupby("A")["A"].max()
tm.assert_series_equal(result, expected)

Expand Down
Loading