diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9eb5bbc8f07d5..fc199d3fd00f8 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -365,8 +365,10 @@ I/O - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) - Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) +- Period ^^^^^^ diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 138a3ee42f686..04a8ad7ef0be6 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -217,7 +217,7 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) - if kind in ("datetime64", "datetime"): + if kind == "datetime" or (kind and kind.startswith("datetime64")): if isinstance(v, (int, float)): v = stringify(v) v = ensure_decoded(v) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c0a27ecfec803..91f7b2afec56c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2152,7 +2152,6 @@ def convert( val_kind = _ensure_decoded(self.kind) values = _maybe_convert(values, val_kind, encoding, errors) - kwargs = {} kwargs["name"] = _ensure_decoded(self.index_name) @@ -2577,7 +2576,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): dtype = _ensure_decoded(dtype_name) # reverse converts - if dtype == "datetime64": + if dtype.startswith("datetime64"): # recreate with tz if indicated converted = _set_tz(converted, tz, coerce=True) @@ -2870,7 +2869,9 @@ def _get_index_factory(self, attrs): def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - dta = DatetimeArray._simple_new(values.values, freq=freq) + dta = DatetimeArray._simple_new( + values.values, dtype=values.dtype, freq=freq + ) result = DatetimeIndex._simple_new(dta, name=None) if tz is not None: result = result.tz_localize("UTC").tz_convert(tz) @@ -2961,7 +2962,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None else: ret = node[start:stop] - if dtype == "datetime64": + if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) ret = _set_tz(ret, tz, coerce=True) @@ -3170,7 +3171,7 @@ def write_array( elif lib.is_np_dtype(value.dtype, "M"): self._handle.create_array(self.group, key, value.view("i8")) - getattr(self.group, key)._v_attrs.value_type = "datetime64" + getattr(self.group, key)._v_attrs.value_type = str(value.dtype) elif isinstance(value.dtype, DatetimeTZDtype): # store as UTC # with a zone @@ -3185,7 +3186,7 @@ def write_array( # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "tz" node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] - node._v_attrs.value_type = "datetime64" + node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]" elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" @@ -4689,7 +4690,6 @@ def read( selection = Selection(self, where=where, start=start, stop=stop) # apply the selection filters & axis orderings df = self.process_axes(df, selection=selection, columns=columns) - return df @@ -4932,11 +4932,12 @@ def _set_tz( # call below (which returns an ndarray). So we are only non-lossy # if `tz` matches `values.tz`. assert values.tz is None or values.tz == tz + if values.tz is not None: + return values if tz is not None: if isinstance(values, DatetimeIndex): name = values.name - values = values.asi8 else: name = None values = values.ravel() @@ -5019,8 +5020,12 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index: index: Index | np.ndarray - if kind == "datetime64": - index = DatetimeIndex(data) + if kind.startswith("datetime64"): + if kind == "datetime64": + # created before we stored resolution information + index = DatetimeIndex(data) + else: + index = DatetimeIndex(data.view(kind)) elif kind == "timedelta64": index = TimedeltaIndex(data) elif kind == "date": @@ -5194,6 +5199,8 @@ def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str def _get_converter(kind: str, encoding: str, errors: str): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") + elif "datetime64" in kind: + return lambda x: np.asarray(x, dtype=kind) elif kind == "string": return lambda x: _unconvert_string_array( x, nan_rep=None, encoding=encoding, errors=errors @@ -5203,7 +5210,7 @@ def _get_converter(kind: str, encoding: str, errors: str): def _need_convert(kind: str) -> bool: - if kind in ("datetime64", "string"): + if kind in ("datetime64", "string") or "datetime64" in kind: return True return False @@ -5248,7 +5255,7 @@ def _dtype_to_kind(dtype_str: str) -> str: elif dtype_str.startswith(("int", "uint")): kind = "integer" elif dtype_str.startswith("datetime64"): - kind = "datetime64" + kind = dtype_str elif dtype_str.startswith("timedelta"): kind = "timedelta64" elif dtype_str.startswith("bool"): @@ -5273,8 +5280,11 @@ def _get_data_and_dtype_name(data: ArrayLike): if isinstance(data, Categorical): data = data.codes - # For datetime64tz we need to drop the TZ in tests TODO: why? - dtype_name = data.dtype.name.split("[")[0] + if isinstance(data.dtype, DatetimeTZDtype): + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = f"datetime64[{data.dtype.unit}]" + else: + dtype_name = data.dtype.name if data.dtype.kind in "mM": data = np.asarray(data.view("i8")) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 3ad9abc138ed1..dace8435595ee 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -772,7 +772,7 @@ def test_append_raise(setup_path): "dtype->bytes24,kind->string,shape->(1, 30)] " "vs current table " "[name->values_block_1,cname->values_block_1," - "dtype->datetime64,kind->datetime64,shape->None]" + "dtype->datetime64[s],kind->datetime64[s],shape->None]" ) with pytest.raises(ValueError, match=msg): store.append("df", df) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 4c1f7667873e1..adf42cc7e8d39 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -49,7 +49,7 @@ def test_table_index_incompatible_dtypes(setup_path): with ensure_clean_store(setup_path) as store: store.put("frame", df1, format="table") - msg = re.escape("incompatible kind in col [integer - datetime64]") + msg = re.escape("incompatible kind in col [integer - datetime64[ns]]") with pytest.raises(TypeError, match=msg): store.put("frame", df2, format="table", append=True) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f031ac46c670c..b61b6f0772251 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -541,16 +541,22 @@ def test_store_index_name(setup_path): tm.assert_frame_equal(recons, df) +@pytest.mark.parametrize("tz", [None, "US/Pacific"]) +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @pytest.mark.parametrize("table_format", ["table", "fixed"]) -def test_store_index_name_numpy_str(tmp_path, table_format, setup_path): +def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 idx = Index( pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]), name="cols\u05d2", - ) - idx1 = Index( - pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), - name="rows\u05d0", + ).tz_localize(tz) + idx1 = ( + Index( + pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), + name="rows\u05d0", + ) + .as_unit(unit) + .tz_localize(tz) ) df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)